In [119]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from scipy import stats
from sklearn.model_selection import cross_val_score

In [120]:
df_train = pd.read_csv('../../data/prices_train.csv')
df_test = pd.read_csv('../../data/prices_test.csv')

In [121]:
def fill_missing_by_loc(df, col):
    # if 
    if df[col].isna().any():
        feats = ['X5 latitude', 'X6 longitude']
        valid_rows = df[feats + [col]].dropna()
        
        if len(valid_rows) > 5:
            imputer = KNNImputer(n_neighbors=5)
            df_impute = pd.DataFrame(imputer.fit_transform(
                df[feats + [col]]
            ), columns=feats + [col])
            df[col] = df_impute[col]
        else:
            df[col].fillna(df[col].median(), inplace=True)
    return df

In [122]:
df_train = fill_missing_by_loc(df_train, 'X3 distance to the nearest MRT station')
df_train = fill_missing_by_loc(df_train, 'X4 number of convenience stores')
df_test = fill_missing_by_loc(df_test, 'X3 distance to the nearest MRT station')
df_test = fill_missing_by_loc(df_test, 'X4 number of convenience stores')

In [123]:
def feat_add(df):
    lat_center = 24.9685
    lon_center = 121.5325
    df['dist_to_center'] = np.sqrt((df['X5 latitude'] - lat_center)**2 + 
                                  (df['X6 longitude'] - lon_center)**2)
        
    df['manhattan_dist'] = np.abs(df['X5 latitude'] - lat_center) + np.abs(df['X6 longitude'] - lon_center)
    
    df['weighted_mrt_dist'] = df['X3 distance to the nearest MRT station'] / (df['X2 house age'] + 1)
    
    df['store_density'] = df['X4 number of convenience stores'] / (df['dist_to_center'] + 0.01)
    
    df['log_mrt_dist'] = np.log1p(df['X3 distance to the nearest MRT station'])
    df['exp_mrt_dist'] = 1 - np.exp(-df['X3 distance to the nearest MRT station']/5000)
    
    from sklearn.cluster import KMeans
    if 'district_cluster' not in df.columns:
        coords = df[['X5 latitude', 'X6 longitude']].values
        kmeans = KMeans(n_clusters=5, random_state=42)
        df['district_cluster'] = kmeans.fit_predict(coords)
    
    df['age_x_stores'] = df['X2 house age'] * df['X4 number of convenience stores']
    df['age_x_mrt'] = df['X2 house age'] * df['log_mrt_dist']
    df['stores_x_mrt'] = df['X4 number of convenience stores'] * df['exp_mrt_dist']
    
    df['year_sin'] = np.sin(2 * np.pi * (df['X1 transaction date'] % 1))
    df['year_cos'] = np.cos(2 * np.pi * (df['X1 transaction date'] % 1))
    
    return df

In [124]:
df_train = feat_add(df_train)
df_test = feat_add(df_test)

In [125]:
def outliers_find(df):
    num_cols = df.select_dtypes(include=[np.number]).columns
    
    for col in num_cols:
        if col not in ['X5 latitude', 'X6 longitude', 'district_cluster']: 
            z_scores = stats.zscore(df[col], nan_policy='omit')
            # Z_i = (X_i - u) / delta
            abs_z_scores = np.abs(z_scores)
            filtered_entries = (abs_z_scores < 3) | np.isnan(z_scores)
            df[col] = np.where(filtered_entries, df[col], df[col].median())
    
    return df

In [126]:
df_train = outliers_find(df_train)

In [127]:
feats = [
        'X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station',
    'X4 number of convenience stores', 'X5 latitude', 'X6 longitude',
    'dist_to_center', 'manhattan_dist', 'weighted_mrt_dist', 'store_density',
    'log_mrt_dist', 'exp_mrt_dist', 'district_cluster', 
    'age_x_stores', 'age_x_mrt', 'stores_x_mrt', 'year_sin', 'year_cos'
]

In [128]:
X_train = df_train[feats]
y_train = df_train['Y house price of unit area']
X_test = df_test[feats]

for col in X_train.columns:
    if X_train[col].isna().any() or X_test[col].isna().any():
        median_val = X_train[col].median()
        X_train[col] = X_train[col].fillna(median_val)
        X_test[col] = X_test[col].fillna(median_val)

print(X_train.isna().sum())
print(X_test.isna().sum())

X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
dist_to_center                            0
manhattan_dist                            0
weighted_mrt_dist                         0
store_density                             0
log_mrt_dist                              0
exp_mrt_dist                              0
district_cluster                          0
age_x_stores                              0
age_x_mrt                                 0
stores_x_mrt                              0
year_sin                                  0
year_cos                                  0
dtype: int64
X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].fillna(median_val)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col].fillna(median_val)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].fillna(median_val)
A value is trying to be set on a copy of a slice from a DataFrame.

In [129]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [130]:
lasso = Lasso(alpha=0.01, max_iter=10000)

pipe = Pipeline([
    # ('poly', PolynomialFeatures(degree = 1, include_bias=False, interaction_only=True)),
    ('model', lasso)
])

pipe.fit(X_train_scaled, y_train)
predictions = pipe.predict(X_test_scaled)

mse = -np.mean(cross_val_score(pipe, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error'))
print(mse)


submission = pd.DataFrame({
    'index': df_test.index,
    'Y house price of unit area': predictions
})
submission.to_csv('submission.csv', index=False)


48.41483579858552
