In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
x_train_original = pd.read_csv('original_data/X_train.csv', index_col='id')
y_train_original = pd.read_csv('original_data/y_train.csv', index_col='id')
x_test_original = pd.read_csv('original_data/X_test.csv', index_col='id')

In [3]:
# Scale the data
# scaler = StandardScaler()
# x_train_scaled = scaler.fit_transform(x_train_original)

# The problem of doing it here is that then I screw the low variance th

In [4]:
# Fill with mean values all the missing values
mean_input = SimpleImputer(strategy='mean')
x_train_mean = mean_input.fit_transform(x_train_original)

In [5]:
x_train_mean.shape

(1212, 832)

In [6]:
# remove low variance features (not give any relevant info)
threshold = 0.05 # 0.01 is more conservative
var_selector = VarianceThreshold(threshold)
x_train_var = var_selector.fit_transform(x_train_mean)

In [7]:
x_train_var.shape # Already removed ~110 features

(1212, 720)

In [8]:
# Remove highly correlated features
corr_matrix = np.corrcoef(x_train_var, rowvar=False)
high_corr_var = np.where(np.abs(corr_matrix)>0.9)
correlated_features = set()
for i,j in zip(*high_corr_var):
    if i != j and i not in correlated_features:
        correlated_features.add(j)
        
x_train_uncorrelated = np.delete(x_train_var, list(correlated_features), axis=1)

In [9]:
# Get rid of additional 34 features
x_train_uncorrelated.shape

(1212, 686)

In [10]:
y_train_flat = np.ravel(y_train_original)
y_train_flat

array([74., 51., 70., ..., 68., 71., 53.])

In [11]:
# Find the 200 best features through kbest
k_best = 300
selector = SelectKBest(score_func=mutual_info_classif, k=k_best) # k_best
X_kbest = selector.fit_transform(x_train_uncorrelated, y_train_flat)

In [12]:
X_kbest.shape

(1212, 300)

In [28]:
# Standardize the data and remove outliers

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(X_kbest)

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
outlier_labels = lof.fit_predict(x_train_scaled)

x_train = x_train_scaled[outlier_labels == 1] # original data after detecting
y_train = y_train_flat[outlier_labels == 1]

In [29]:
y_train.shape

(1151,)

In [30]:
# Use Random Forest to select features

model = RandomForestRegressor()
model.fit(x_train, y_train)
importances = model.feature_importances_
indices = np.argsort(importances)[-100:]

x_selected = x_train[:, indices]
#y_selected = y_train[:, indices]

In [31]:
print(x_selected.shape)

(1151, 100)


In [33]:
k_best_idxs = selector.get_support(indices=True)

In [36]:
# Train val split
np.random.seed(32)

def split_indices(n, val_pct):
    n_val = int(val_pct*n)
    idxs = np.random.permutation(n)
    return idxs[n_val:], idxs[:n_val]

train_idxs, val_idxs = split_indices(len(x_selected), 0.2)

val_x = x_selected[val_idxs, :]
val_y = y_train[val_idxs]
train_x = x_selected[train_idxs, :]
train_y = y_train[train_idxs]

In [37]:
# Train ridge regression
from sklearn.linear_model import Ridge

ridge_model = Ridge()
ridge_model.fit(train_x, train_y)

In [39]:
y_train_pred = ridge_model.predict(train_x)
y_val_pred = ridge_model.predict(val_x)

print("Train R^2 Score:", r2_score(train_y, y_train_pred))
print("Validation R^2 Score:", r2_score(val_y, y_val_pred))

Train R^2 Score: 0.48298701304377545
Validation R^2 Score: 0.3868440511639396


In [40]:
# Try with a widther feature space
ridge_2 = Ridge()
val_x_2 = x_train[val_idxs, :]
val_y_2 = y_train[val_idxs]
train_x_2 = x_train[train_idxs, :]
train_y_2 = y_train[train_idxs]

In [42]:
ridge_2.fit(train_x_2, train_y_2)
y_val_pred = ridge_2.predict(val_x_2)
y_train_pred = ridge_2.predict(train_x_2)

print("Train R^2 Score:", r2_score(train_y_2, y_train_pred))
print("Validation R^2 Score:", r2_score(val_y_2, y_val_pred))

Train R^2 Score: 0.5915374305678198
Validation R^2 Score: 0.20171966918381623
