In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 

# Load the raw listings; rows without a state or a price are discarded.
df = pd.read_csv('../data/data.csv')
print(df.shape)
# TODO: dropping rows with a missing state is a temporary workaround -- remove before finalizing.
df = df.dropna(subset=['state'])
df = df.dropna(subset=['price'])
print(df.shape)

# Target is the price column; every other column is treated as a feature.
y = df['price']
X = df.drop(columns='price')

# Number of listings per state.
state_counts = X['state'].value_counts()

# States with at least two listings are "common"; the rest are "rare".
common_states = state_counts.loc[state_counts >= 2].index
rare_states = state_counts.loc[state_counts < 2].index

# Row masks selecting the listings of each group.
mask_common = X['state'].isin(common_states)
mask_rare = X['state'].isin(rare_states)

X_common, X_rare = X[mask_common], X[mask_rare]
y_common, y_rare = y[mask_common], y[mask_rare]

random_state = 42

# 98% of the common-state rows go to training, stratified by state.
X_train, X_other, y_train, y_other = train_test_split(
    X_common,
    y_common,
    train_size=0.98,
    stratify=X_common['state'],
    random_state=random_state,
)

# The remaining rows are halved into validation and test (not stratified).
X_val, X_test, y_val, y_test = train_test_split(
    X_other,
    y_other,
    train_size=0.5,
    random_state=random_state,
)

# Rare-state rows were kept out of the stratified split (presumably because
# stratification needs at least two rows per class); they are appended to training only.
X_train = pd.concat([X_train, X_rare])
y_train = pd.concat([y_train, y_rare])

print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))


: 

In [54]:
# Missing-value report: for each listed column, its dtype, NA count and NA share.

cols = [
    'brokered_by', 'status', 'price', 'bed', 'bath',
    'acre_lot', 'street', 'city', 'state', 'zip_code',
    'house_size', 'prev_sold_date',
]

# One record per column: (name, dtype, number of NAs, NA share rounded to 3 decimals).
na_records = [
    (c, df[c].dtype, df[c].isna().sum(), round(df[c].isna().mean(), 3))
    for c in cols
]
na_summary = pd.DataFrame(
    na_records,
    columns=['Column', 'Data_Type', 'Num_NA', 'NA_Rate'],
)

# Columns with the most missing values come first.
na_summary = na_summary.sort_values('Num_NA', ascending=False).reset_index(drop=True)

print(na_summary)


            Column Data_Type  Num_NA  NA_Rate
0   prev_sold_date    object  733248    0.330
1       house_size   float64  567872    0.255
2             bath   float64  510984    0.230
3              bed   float64  480859    0.216
4         acre_lot   float64  325134    0.146
5           street   float64   10864    0.005
6      brokered_by   float64    4533    0.002
7             city    object    1404    0.001
8         zip_code   float64     296    0.000
9           status    object       0    0.000
10           price   float64       0    0.000
11           state    object       0    0.000


In [55]:
# Check outliers for numeric columns using IQR method

def detect_outliers_iqr(df, columns):
    """Summarise IQR-rule outliers for the numeric columns among ``columns``.

    A value is counted as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of the
    column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    columns : iterable of str
        Column names to check. Columns whose dtype is not numeric are skipped
        silently and do not appear in the result.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with the fields ``Column``, ``Data_Type``,
        ``Min``, ``Max``, ``Q1``, ``Q3``, ``Lower_Bound``, ``Upper_Bound``,
        ``Num_Outliers`` and ``Outlier_Rate``, sorted by ``Outlier_Rate`` in
        descending order with a fresh 0..n-1 index. ``Outlier_Rate`` is the
        outlier count divided by ``len(df)``, so rows where the column is
        missing still count in the denominator.
    """
    n_rows = len(df)
    outlier_summary = []

    for col in columns:
        series = df[col]
        if not pd.api.types.is_numeric_dtype(series):
            continue

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr

        # Comparisons against NaN are False, so missing values are never counted.
        outliers = ((series < lower) | (series > upper)).sum()

        outlier_summary.append([
            col,
            series.dtype,
            round(series.min(), 2),
            round(series.max(), 2),
            round(q1, 2),
            round(q3, 2),
            round(lower, 2),
            round(upper, 2),
            outliers,
            round(outliers / n_rows, 3),
        ])

    outlier_table = pd.DataFrame(outlier_summary, columns=[
        'Column', 'Data_Type', 'Min', 'Max', 'Q1', 'Q3',
        'Lower_Bound', 'Upper_Bound', 'Num_Outliers', 'Outlier_Rate'
    ])

    return outlier_table.sort_values('Outlier_Rate', ascending=False).reset_index(drop=True)

# Numeric columns screened with the IQR rule.
numeric_cols = [
    'price',
    'bed',
    'bath',
    'acre_lot',
    'house_size',
]
outlier_table = detect_outliers_iqr(df, columns=numeric_cols)
print(outlier_table)


       Column Data_Type  Min           Max         Q1         Q3  Lower_Bound  \
0    acre_lot   float64  0.0  1.000000e+05       0.15       0.98         -1.1   
1       price   float64  0.0  2.147484e+09  165000.00  550000.00    -412500.0   
2         bed   float64  1.0  4.730000e+02       3.00       4.00          1.5   
3        bath   float64  1.0  8.300000e+02       2.00       3.00          0.5   
4  house_size   float64  4.0  1.040400e+09    1300.00    2413.00       -369.5   

   Upper_Bound  Num_Outliers  Outlier_Rate  
0         2.22        292018         0.131  
1   1127500.00        171600         0.077  
2         5.50        118888         0.053  
3         4.50         79063         0.036  
4      4082.50         77831         0.035  


In [None]:
# Preprocessing Pipeline
#-------------------------------------------


# preprocess with pipeline and columntransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime



# collect the various features
# Column groups, one per preprocessing branch.
cat_ftrs = ["status", "city", "state"]                 # categorical columns
num_ftrs = ["bed", "bath", "acre_lot", "house_size"]   # continuous columns
broker_ftr = ["brokered_by"]                           # broker column
street_ftr = ["street"]                                # street column
zip_ftr = ["zip_code"]                                 # zip-code column
sold_ftr = ["prev_sold_date"]                          # previous-sale date column

# Seed shared by the stochastic steps.
random_state = 42

class BinaryTransformer(BaseEstimator, TransformerMixin):
    """Presence indicator: missing values become 0, everything else becomes 1.

    The transformer is stateless; ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Return a 0/1 integer array marking which entries of ``X`` are present.

        Parameters
        ----------
        X : scalar, 1-D or 2-D array-like / pandas object
            Values to test with ``pd.isna``.

        Returns
        -------
        numpy.ndarray
            2-D array. Scalar or 1-D input is returned as a single column;
            2-D input keeps its ``(n_rows, n_columns)`` shape, so the number of
            rows always matches the input (previously a multi-column input was
            flattened into one long column).
        """
        # NA -> 0; non-NA -> 1
        indicator = np.where(pd.isna(X), 0, 1)
        if indicator.ndim < 2:
            return indicator.reshape(-1, 1)
        return indicator

class ZipCodeTransformer(BaseEstimator, TransformerMixin):
    """Turn zip codes into string tokens, using "000" for missing values.

    The NA summary in this notebook shows ``zip_code`` loaded as float64.
    Filling its gaps with the string "000" alone would leave a column mixing
    floats and strings, which the one-hot encoder that follows this step cannot
    handle; every value is therefore converted to a string.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    @staticmethod
    def _to_token(value):
        """Map one raw zip value to its string token ("000" when missing)."""
        if pd.isna(value):
            return "000"
        # Whole-number floats such as 1234.0 are written without the ".0" suffix.
        if isinstance(value, (float, np.floating)) and float(value).is_integer():
            return str(int(value))
        return str(value)

    def transform(self, X):
        """Return the zip codes as an ``(n, 1)`` object array of strings.

        Parameters
        ----------
        X : array-like or pandas object
            Zip codes; pandas objects and plain arrays/lists are accepted.
            The input is flattened, so it is expected to hold a single column.

        Returns
        -------
        numpy.ndarray
            Object-dtype column vector whose entries are all ``str``.
        """
        flat = np.asarray(X, dtype=object).reshape(-1)
        tokens = [self._to_token(value) for value in flat]
        return np.array(tokens, dtype=object).reshape(-1, 1)
    
class PrevSoldDateTransformer(BaseEstimator, TransformerMixin):
    """Convert a previous-sale date into "years since that sale".

    Dates that are missing or cannot be parsed are encoded as -1.

    Parameters
    ----------
    reference_date : datetime-like or None, default=None
        Date the elapsed time is measured against. ``None`` keeps the original
        behaviour of using the current time at each ``transform`` call; pass a
        fixed date to get identical features across runs and across the
        train / validation / test transforms.
    """

    def __init__(self, reference_date=None):
        self.reference_date = reference_date

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Return elapsed years as an ``(n, 1)`` float array.

        Parameters
        ----------
        X : pandas.Series, single-column pandas.DataFrame or array-like
            Previous-sale dates (strings or datetimes).

        Returns
        -------
        numpy.ndarray
            Whole days between the reference date and each sale divided by 365,
            or -1 where no valid date is available.
        """
        # A one-column DataFrame (what ColumnTransformer passes for a list
        # selector) must become a Series first: pd.to_datetime on a DataFrame
        # tries to assemble dates from year/month/day columns instead.
        if isinstance(X, pd.DataFrame):
            X = X.squeeze(axis=1)
        elif not isinstance(X, pd.Series):
            X = pd.Series(np.asarray(X).reshape(-1))

        dates = pd.to_datetime(X, errors='coerce')
        if self.reference_date is None:
            reference = datetime.now()
        else:
            reference = pd.Timestamp(self.reference_date)

        years = (reference - dates).dt.days / 365
        years = years.fillna(-1)
        return years.values.reshape(-1, 1)



# Categorical branch: fill missing values with the literal 'missing', then one-hot encode.
# NOTE(review): sparse_output=False produces a dense matrix; with high-cardinality
# columns such as 'city' this may be very large -- confirm memory use on the full data.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Continuous branch: iterative imputation followed by standardisation.
# The imputer uses the notebook-level `random_state` instead of a second hard-coded seed.
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=random_state)),
    ('scaler', StandardScaler()),
])

# Broker and street branches: only whether a value is present (0/1) is kept.
broker_pipe = Pipeline(steps=[
    ('binary', BinaryTransformer()),
])
street_pipe = Pipeline(steps=[
    ('binary', BinaryTransformer()),
])

# Zip-code branch: fill missing codes, then one-hot encode.
zip_pipe = Pipeline(steps=[
    ('zip_transform', ZipCodeTransformer()),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Previous-sale branch: years since the last sale (-1 when absent), then standardised.
sold_pipe = Pipeline(steps=[
    ('sold_transform', PrevSoldDateTransformer()),
    ('scaler', StandardScaler()),
])

# Route each column group to its branch; columns not listed are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs),
        ('broker', broker_pipe, broker_ftr),
        ('street', street_pipe, street_ftr),
        ('zip', zip_pipe, zip_ftr),
        ('sold', sold_pipe, sold_ftr),
    ],
    remainder='drop',
)



categories: [array(['100 89 Lower Shepard Creek Road', '139th Ave Unit Peck',
       '15th Ave Milton', ..., 'Zuni', 'Zwingle', 'Zwolle'],
      shape=(20045,), dtype=object), array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Guam', 'Hawaii', 'Idaho', 'Illinois',
       'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
       'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Brunswick', 'New Hampshire', 'New Jersey', 'New Mexico',
       'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
       'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Vermont', 'Virgin Islands', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object), array(['for_sale', '

In [None]:
# Categorical features may need further processing later.

# Handle missing values on the training features (X_train is modified in place).
#
# Stated intent for each column:
#   brokered_by - NA means no broker: fill with 0, otherwise 1
#                 (author's note: a plain has-agent / no-agent dummy seems better)
#   zip_code    - "000" for NA
#   street      - NA means unknown street: fill with 0, otherwise 1
# NOTE(review): the code below fills text placeholders instead of these encodings;
# the author marked all three fills as "needs to be changed".
placeholders = (
    ('brokered_by', 'Unknown Broker'),
    ('zip_code', 'Unknown Zip'),
    ('street', 'Unknown Street'),
)
for column, placeholder in placeholders:
    X_train[column] = X_train[column].fillna(placeholder)

# prev_sold_date
# NA means the property has not been sold: use -1, otherwise the years since the last sale.
from datetime import datetime

# Parse the date strings; values that cannot be parsed become NaT.
sold_dates = pd.to_datetime(X_train['prev_sold_date'], errors='coerce')
X_train['prev_sold_date'] = sold_dates
# Whole days elapsed until now, expressed in 365-day years.
elapsed_years = (datetime.now() - sold_dates).dt.days / 365
# A missing date is taken to mean "never sold".
X_train['years_since_last_sold'] = elapsed_years.fillna(-1)


In [58]:
# Standardise the continuous features: fit on the training rows, reuse on the test rows.

# Outlier handling is still open: it is not yet known whether the extreme
# values are data errors, so this needs to be clarified first.

continuous_cols = ['bed', 'bath', 'acre_lot', 'house_size']
X_train_countinuous = X_train[continuous_cols]
X_test_countinuous = X_test[continuous_cols]

scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train_countinuous)
print(train_scaled)
test_scaled = scaler.transform(X_test_countinuous)
print(test_scaled)



[[ 1.10016716e+00  3.04370068e-01 -1.93011668e-02  2.10544673e-03]
 [ 4.62158251e-01  3.04370068e-01             nan -1.33937269e-03]
 [            nan             nan  9.06133346e-04             nan]
 ...
 [ 4.62158251e-01  9.08870851e-01 -1.95090876e-02  9.83159001e-05]
 [ 4.62158251e-01  3.04370068e-01 -1.94311173e-02 -5.87464006e-04]
 [            nan             nan -1.97559935e-02             nan]]
[[ 4.62158251e-01  3.04370068e-01 -1.92621816e-02 -6.21076137e-05]
 [-1.75850653e-01 -3.00130714e-01 -1.92361915e-02             nan]
 [ 4.62158251e-01  3.04370068e-01 -1.97429984e-02 -1.32467741e-03]
 ...
 [ 4.62158251e-01 -3.00130714e-01 -1.95480727e-02 -8.91166536e-04]
 [-1.75850653e-01 -3.00130714e-01 -1.96780232e-02 -1.57204801e-03]
 [            nan             nan  2.91995216e-01             nan]]


In [59]:
from datetime import datetime

# NOTE(review): this repeats the prev_sold_date step of the missing-value cell earlier
# in the notebook.

# Parse the date strings; values that cannot be parsed become NaT.
sold_dates = pd.to_datetime(X_train['prev_sold_date'], errors='coerce')
X_train['prev_sold_date'] = sold_dates
# Whole days elapsed until now, expressed in 365-day years.
elapsed_years = (datetime.now() - sold_dates).dt.days / 365
# A missing date is taken to mean the property was never sold: use -1.
X_train['years_since_last_sold'] = elapsed_years.fillna(-1)


