In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 

df = pd.read_csv('../data/data.csv')
y = df['price'] 
X = df.loc[:, df.columns != 'price']

random_state = 42

# first split to separate out the training set
X_train, X_other, y_train, y_other = train_test_split(X,y,train_size = 0.98,random_state = random_state)
# X_train, y_train → 98% X_other, y_other → 2%
# second split to separate out the validation and test sets 
X_val, X_test, y_val, y_test = train_test_split(X_other,y_other,\
                    train_size = 0.5,random_state = random_state)

In [16]:
# Check NA for each column

cols = ['brokered_by', 'status', 'price', 'bed', 'bath', 
        'acre_lot', 'street', 'city', 'state', 'zip_code', 
        'house_size', 'prev_sold_date']

na_summary = pd.DataFrame({
    'Column': cols,
    'Data_Type': [df[c].dtype for c in cols],
    'Num_NA': [df[c].isna().sum() for c in cols],
    'NA_Rate': [round(df[c].isna().mean(), 3) for c in cols]
})

na_summary = na_summary.sort_values('Num_NA', ascending=False).reset_index(drop=True)

print(na_summary)


            Column Data_Type  Num_NA  NA_Rate
0   prev_sold_date    object  734297    0.330
1       house_size   float64  568484    0.255
2             bath   float64  511771    0.230
3              bed   float64  481317    0.216
4         acre_lot   float64  325589    0.146
5           street   float64   10866    0.005
6      brokered_by   float64    4533    0.002
7            price   float64    1541    0.001
8             city    object    1407    0.001
9         zip_code   float64     299    0.000
10           state    object       8    0.000
11          status    object       0    0.000


In [None]:
# Check outliers for numeric columns using IQR method

def detect_outliers_iqr(df, columns):
    outlier_summary = []
    for col in columns:
        if pd.api.types.is_numeric_dtype(df[col]):  # Check if the column is numeric
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            outliers = ((df[col] < lower) | (df[col] > upper)).sum()
            outlier_ratio = round(outliers / len(df), 3)
            col_min = df[col].min()
            col_max = df[col].max()
            
            outlier_summary.append([
                col,
                df[col].dtype,
                round(df[col].min(), 2),
                round(df[col].max(), 2),
                round(Q1, 2),
                round(Q3, 2),
                round(lower, 2),
                round(upper, 2),
                outliers,
                outlier_ratio
            ])
    
    outlier_table = pd.DataFrame(outlier_summary, columns=[
        'Column', 'Data_Type', 'Min', 'Max', 'Q1', 'Q3', 
        'Lower_Bound', 'Upper_Bound', 'Num_Outliers', 'Outlier_Rate'
    ])
    
    return outlier_table.sort_values('Outlier_Rate', ascending=False).reset_index(drop=True)

numeric_cols = ['price', 'bed', 'bath', 'acre_lot', 'house_size']
outlier_table = detect_outliers_iqr(df, numeric_cols)
print(outlier_table)


       Column Data_Type  Min           Max         Q1         Q3  Lower_Bound  \
0    acre_lot   float64  0.0  1.000000e+05       0.15       0.98         -1.1   
1       price   float64  0.0  2.147484e+09  165000.00  550000.00    -412500.0   
2         bed   float64  1.0  4.730000e+02       3.00       4.00          1.5   
3        bath   float64  1.0  8.300000e+02       2.00       3.00          0.5   
4  house_size   float64  4.0  1.040400e+09    1300.00    2413.00       -369.5   

   Upper_Bound  Num_Outliers  Outlier_Rate  
0         2.22        292418         0.131  
1   1127500.00        171600         0.077  
2         5.50        118920         0.053  
3         4.50         79075         0.036  
4      4082.50         77850         0.035  


In [24]:
# Categorical feature encoding with OneHotEncoder
X_train['city'] = X_train['city'].fillna('Unknown')
X_train['state'] = X_train['state'].fillna('Unknown')

enc = OneHotEncoder(sparse_output=True, handle_unknown='ignore') # use sparse matrix because there are too many categories

ftrs = ['city', 'state','status']

enc.fit(X_train[['city', 'state','status']])


print('categories:',enc.categories_)
print('feature names:',enc.get_feature_names_out(ftrs))

# transform X_train
X_train_ohe = enc.transform(X_train[['city', 'state','status']])
#print(X_train_ohe)
# do all of this in one step
X_train_ohe = enc.fit_transform(X_train[['city', 'state','status']])
print('X_train transformed')
print(X_train_ohe)

# transform X_test
X_test_ohe = enc.transform(X_test[['city', 'state','status']])
print('X_test transformed')
print(X_test_ohe)


categories: [array(['100 89 Lower Shepard Creek Road', '139th Ave Unit Peck',
       '15th Ave Milton', ..., 'Zuni', 'Zwingle', 'Zwolle'],
      shape=(20045,), dtype=object), array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Guam', 'Hawaii', 'Idaho', 'Illinois',
       'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
       'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Brunswick', 'New Hampshire', 'New Jersey', 'New Mexico',
       'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
       'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Unknown',
       'Utah', 'Vermont', 'Virgin Islands', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object), array(['f

In [None]:
# Categorical feature 后续可能需要继续处理

# Handle missing values

# brokered_by
# NA means no broker → fill with "Unknown Broker"
# 我感觉直接分成有没有agent更好，变成dummy
df['brokered_by'] = df['brokered_by'].fillna('Unknown Broker')

# zip_code
# NA means unknown zip → fill with "Unknown Zip"
# 需要救援
df['zip_code'] = df['zip_code'].fillna('Unknown Zip')

# street
# NA means unknown street → fill with "Unknown Street"
# 我建议drop,因为有很多地理信息可以定位到房子的位置了
df['street'] = df['street'].fillna('Unknown Street')

# prev_sold_date
# NA means haven't sold → fill with "Never Sold"
# 需要救援
df['prev_sold_date'] = df['prev_sold_date'].fillna('Never Sold')

In [26]:
# continuous feature scaling with StandardScaler

# 处理outliers需要进一步的去问，因为不知道这些值是不是错误数据

X_train_countinuous = X_train[['bed', 'bath', 'acre_lot', 'house_size']]
X_test_countinuous = X_test[['bed', 'bath', 'acre_lot', 'house_size']]
scaler = StandardScaler()
print(scaler.fit_transform(X_train_countinuous))
print(scaler.transform(X_test_countinuous))



[[-0.17577097 -0.30036533         nan -0.00164542]
 [        nan         nan -0.01917359         nan]
 [        nan         nan -0.01834116         nan]
 ...
 [-1.45073138 -0.90534897         nan -0.00217827]
 [-0.81325117 -0.30036533 -0.01979461 -0.0016393 ]
 [-0.17577097 -0.30036533 -0.01974176 -0.00160255]]
[[-1.75770971e-01 -3.00365330e-01 -8.03485668e-03             nan]
 [-1.75770971e-01 -3.00365330e-01 -1.97946116e-02 -1.13829816e-03]
 [ 4.61709232e-01             nan             nan -2.30620848e-04]
 ...
 [-1.75770971e-01  3.04618308e-01 -1.86054229e-02 -6.56898451e-04]
 [ 1.09918943e+00  9.09601946e-01             nan -5.85852184e-04]
 [            nan             nan -1.96492664e-02             nan]]
