In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
x_train_original = pd.read_csv('original_data/X_train.csv', index_col='id')
y_train_original = pd.read_csv('original_data/y_train.csv', index_col='id')
x_test_original = pd.read_csv('original_data/X_test.csv', index_col='id')

In [63]:
x_test = x_test_original.values[:, 1:]

In [3]:
# Scale the data
# scaler = StandardScaler()
# x_train_scaled = scaler.fit_transform(x_train_original)

# The problem of doing it here is that then I screw the low variance th

In [4]:
# Fill with mean values all the missing values
mean_input = SimpleImputer(strategy='mean')
x_train_mean = mean_input.fit_transform(x_train_original)

In [5]:
x_train_mean.shape

(1212, 832)

In [6]:
# remove low variance features (not give any relevant info)
threshold = 0.05 # 0.01 is more conservative
var_selector = VarianceThreshold(threshold)
x_train_var = var_selector.fit_transform(x_train_mean)

In [7]:
x_train_var.shape # Already removed ~110 features

(1212, 720)

In [8]:
# Remove highly correlated features
corr_matrix = np.corrcoef(x_train_var, rowvar=False)
high_corr_var = np.where(np.abs(corr_matrix)>0.9)
correlated_features = set()
for i,j in zip(*high_corr_var):
    if i != j and i not in correlated_features:
        correlated_features.add(j)
        
x_train_uncorrelated = np.delete(x_train_var, list(correlated_features), axis=1)

In [9]:
# Get rid of additional 34 features
x_train_uncorrelated.shape

(1212, 686)

In [10]:
y_train_flat = np.ravel(y_train_original)
y_train_flat

array([74., 51., 70., ..., 68., 71., 53.])

In [11]:
# Find the 200 best features through kbest
k_best = 300
selector = SelectKBest(score_func=mutual_info_classif, k=k_best) # k_best
X_kbest = selector.fit_transform(x_train_uncorrelated, y_train_flat)

In [None]:
k_best_idxs = selector.get_support(indices=True)
x_test = x_test[:, k_best_idxs]

In [12]:
X_kbest.shape

(1212, 300)

In [67]:
# Standardize the data and remove outliers

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(X_kbest)
x_test = scaler.transform(x_test)

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
outlier_labels = lof.fit_predict(x_train_scaled)

x_train = x_train_scaled[outlier_labels == 1] # original data after detecting
y_train = y_train_flat[outlier_labels == 1]

In [29]:
y_train.shape

(1151,)

In [30]:
# Use Random Forest to select features

model = RandomForestRegressor()
model.fit(x_train, y_train)
importances = model.feature_importances_
indices = np.argsort(importances)[-100:]

x_selected = x_train[:, indices]

In [68]:
x_test = x_test[:, indices]

In [31]:
print(x_selected.shape)

(1151, 100)


In [36]:
# Train val split
np.random.seed(32)

def split_indices(n, val_pct):
    n_val = int(val_pct*n)
    idxs = np.random.permutation(n)
    return idxs[n_val:], idxs[:n_val]

train_idxs, val_idxs = split_indices(len(x_selected), 0.2)

val_x = x_selected[val_idxs, :]
val_y = y_train[val_idxs]
train_x = x_selected[train_idxs, :]
train_y = y_train[train_idxs]

In [37]:
# Train ridge regression
from sklearn.linear_model import Ridge

ridge_model = Ridge()
ridge_model.fit(train_x, train_y)

In [39]:
y_train_pred = ridge_model.predict(train_x)
y_val_pred = ridge_model.predict(val_x)

print("Train R^2 Score:", r2_score(train_y, y_train_pred))
print("Validation R^2 Score:", r2_score(val_y, y_val_pred))

Train R^2 Score: 0.48298701304377545
Validation R^2 Score: 0.3868440511639396


In [43]:
## Linear Regression
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
linear_model.fit(train_x, train_y)
y_train_pred = linear_model.predict(train_x)
y_val_pred = linear_model.predict(val_x)

print("Train R^2 Score:", r2_score(train_y, y_train_pred))
print("Validation R^2 Score:", r2_score(val_y, y_val_pred))

Train R^2 Score: 0.48299128869737606
Validation R^2 Score: 0.3858678081352859


In [44]:
## Lasso
from sklearn.linear_model import Lasso

lasso_model = Lasso()
lasso_model.fit(train_x, train_y)
y_train_pred = lasso_model.predict(train_x)
y_val_pred = lasso_model.predict(val_x)

print("Train R^2 Score:", r2_score(train_y, y_train_pred))
print("Validation R^2 Score:", r2_score(val_y, y_val_pred))

Train R^2 Score: 0.3336872419903135
Validation R^2 Score: 0.3427621632141865


In [47]:
## XG Boost
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05, 
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1, 
    reg_alpha=0.5, 
    reg_lambda=1.2,
    random_state=42
)

xgb_model.fit(train_x, train_y)

y_train_pred = xgb_model.predict(train_x)
y_val_pred = xgb_model.predict(val_x)
#y_test_pred = xgb_model.predict(X_test_scaled)

print("Train R^2 Score:", r2_score(train_y, y_train_pred))
print("Validation R^2 Score:", r2_score(val_y, y_val_pred))

Train R^2 Score: 0.9996112941943264
Validation R^2 Score: 0.5991541551729437


In [70]:
y_test_pred = xgb_model.predict(x_test)
y_test_pred

array([72.93002 , 70.236465, 71.47746 , 69.948456, 72.24022 , 70.75084 ,
       70.68345 , 70.41909 , 70.31608 , 69.36794 , 71.253334, 69.81118 ,
       68.435234, 70.86535 , 70.52215 , 70.528366, 70.93614 , 70.71203 ,
       71.663666, 69.086205, 69.17634 , 72.21663 , 70.35669 , 71.005035,
       71.383606, 70.36817 , 70.77453 , 72.04664 , 70.779854, 70.45461 ,
       71.911095, 68.32331 , 71.16872 , 72.06403 , 70.23565 , 68.787445,
       69.92719 , 70.88187 , 68.342804, 72.854454, 70.72548 , 69.38992 ,
       68.396736, 69.84309 , 70.51633 , 69.43733 , 70.500946, 71.31786 ,
       71.04802 , 70.54862 , 71.66242 , 71.41694 , 71.10055 , 69.73508 ,
       70.30756 , 70.88227 , 71.657425, 70.69363 , 68.478745, 70.56824 ,
       69.85274 , 70.59699 , 72.23364 , 69.29117 , 71.480736, 71.746956,
       72.755295, 71.42104 , 69.79533 , 71.30703 , 69.69487 , 69.982635,
       69.60346 , 71.48511 , 70.31946 , 71.23822 , 68.89271 , 73.382576,
       68.66797 , 69.86446 , 71.05028 , 70.70431 , 

In [58]:
x_test = mean_input.transform(x_test_original)

In [59]:
x_test = x_test[:, k_best_idxs]
x_test = scaler.transform(x_test)
x_test = x_test[:, indices]

In [60]:
y_test = xgb_model.predict(x_test)

In [61]:
table = pd.DataFrame({'id': np.arange(0, y_test.shape[0]), 'y': y_test.ravel()})
print(table.shape)
table.to_csv('./data/predictions/y_test_xg_jorge.csv', index=False)

(776, 2)


In [62]:
y_val_pred

array([73.88952 , 76.19964 , 57.62051 , 78.27227 , 72.29454 , 62.92834 ,
       81.37014 , 84.09891 , 60.94775 , 75.20512 , 54.169582, 79.84534 ,
       70.99375 , 70.00009 , 68.46336 , 72.38428 , 65.147606, 69.994865,
       74.98604 , 77.25656 , 64.594284, 65.95398 , 63.333652, 61.345566,
       57.963417, 68.72357 , 82.36694 , 61.31138 , 77.47098 , 64.95296 ,
       81.17483 , 81.24967 , 61.18805 , 69.40163 , 73.27913 , 75.52132 ,
       73.58433 , 73.5349  , 75.940025, 82.24471 , 68.62249 , 74.59709 ,
       66.23163 , 70.35884 , 62.45373 , 81.17966 , 75.06461 , 68.37014 ,
       61.076496, 72.417725, 69.00953 , 56.82814 , 74.86942 , 80.49153 ,
       74.42646 , 67.97181 , 70.440895, 62.604916, 55.93757 , 70.74763 ,
       81.588326, 73.870605, 63.128082, 64.98296 , 54.443134, 66.869545,
       57.737434, 64.38459 , 67.00488 , 70.14473 , 69.720695, 63.705505,
       77.16641 , 60.633343, 76.97274 , 54.125286, 69.627525, 62.95935 ,
       56.62651 , 76.417404, 63.50196 , 69.767914, 