In [2]:
import numpy as np
import pandas as pd

# Data preprocessing

### Read data and convert to NumPy arrays

In [3]:
from sklearn.preprocessing import StandardScaler


def _load_csv(csv_path):
    """Read one CSV file and return ``(raw DataFrame, values without column 0)``.

    The first line of the file is skipped and no header is inferred, so the
    columns are purely positional. Column 0 is dropped from the returned array
    (presumably it holds the sample id -- confirm against the data files).
    """
    frame = pd.read_csv(csv_path, skiprows=1, header=None)
    return frame, frame.values[:, 1:]


X_train_df, X_train_full = _load_csv('./data/X_train.csv')
y_train_df, y_train_full = _load_csv('./data/y_train.csv')
X_test_df, X_test = _load_csv('./data/X_test.csv')

# Quick sanity check of the array dimensions
print(X_train_full.shape, y_train_full.shape, X_test.shape)

(1212, 832) (1212, 1) (776, 832)


### Imputing NaN

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer

# Provisional imputation: replace every missing value with its column mean so
# that the training matrix can be passed to the feature-scoring step.
# NOTE(review): SimpleImputer drops columns that are entirely NaN by default;
# if that happens the column positions of X_train_imputed no longer match
# X_train_full -- confirm the shapes agree.
mean_imputer = SimpleImputer(strategy='mean')
mean_imputer.fit(X_train_full)
X_train_imputed = mean_imputer.transform(X_train_full)

### Feature selection

In [5]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate scoring of each feature against the target. With k='all' the
# selector keeps every column; it is used here only to obtain the per-feature
# scores and p-values.
selector = SelectKBest(score_func=f_regression, k='all').fit(
    X_train_imputed, y_train_full.ravel()
)
scores, p_values = selector.scores_, selector.pvalues_

# Column positions whose p-value is below 0.05. A NaN p-value compares as
# False and is therefore not selected.
significant_feature_indices = np.flatnonzero(p_values < 5e-2)

print(f"Number of significant features: {len(significant_feature_indices)}")

Number of significant features: 245


  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


In [6]:
# Extract the selected features from the original (not yet imputed) data.
#
# significant_feature_indices were computed on X_train_imputed, so they only
# address the intended columns of the raw matrices if mean imputation kept
# every column (SimpleImputer discards all-NaN columns by default). Fail
# loudly instead of silently selecting the wrong features.
assert X_train_imputed.shape[1] == X_train_full.shape[1] == X_test.shape[1], (
    "Column count changed during mean imputation or differs between train and "
    "test; the selected feature indices would not line up with the raw data"
)
X_train_selected = X_train_full[:, significant_feature_indices]
X_test_selected = X_test[:, significant_feature_indices]

### Final NaN imputation

In [7]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Final imputation, restricted to the selected feature columns. The imputer is
# fitted on the training rows only and then re-applied unchanged to the test
# rows, so no test information enters the fit.
iterative_imputer = IterativeImputer()
X_train_selected_imputed = iterative_imputer.fit_transform(X=X_train_selected)
X_test_selected_imputed = iterative_imputer.transform(X=X_test_selected)

### Outlier detection?

In [19]:
from sklearn.ensemble import IsolationForest

# Flag roughly 1% of the training rows as outliers. The forest is built from
# random sub-samples, so a fixed random_state is required for the same rows to
# be flagged on every run.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outlier_pred = iso_forest.fit_predict(X_train_selected_imputed)

# fit_predict labels outliers as -1; keep only the remaining (inlier) rows
inlier_mask = outlier_pred != -1
X_train_outliers_removed = X_train_selected_imputed[inlier_mask]
y_train_outliers_removed = y_train_full[inlier_mask]

# NOTE(review): the scaling and model cells below still read
# X_train_selected_imputed / y_train_full, so this filtering currently has no
# effect on the fitted models.

### Scale features

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaling statistics are learned from the
# training matrix only and then applied to both the training and test matrices.
scaler = StandardScaler().fit(X_train_selected_imputed)
X_train_scaled = scaler.transform(X_train_selected_imputed)
X_test_scaled = scaler.transform(X_test_selected_imputed)

# Training set

### Linear

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Ordinary least squares on the scaled training features
linear_regressor = LinearRegression().fit(X_train_scaled, y_train_full.ravel())

# In-sample fit quality: the predictions are made on the same rows the model
# was trained on, so this R2 is a training score, not a validation score.
train_pred_lr = linear_regressor.predict(X_train_scaled)
train_r2_lr = r2_score(y_train_full, train_pred_lr)
print(f"Linear Regression r2 score: {train_r2_lr}")

Linear Regression r2 score: 0.5073822126403644


### Ridge

In [22]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score

# Candidate regularisation strengths: 20 log-spaced values from 1e-3 to 1e3
alphas = np.logspace(-3, 3, 20)

# Ridge regression; alpha is chosen among the candidates by 5-fold
# cross-validation
ridge_regressor = RidgeCV(alphas=alphas, cv=5).fit(
    X_train_scaled, y_train_full.ravel()
)

# In-sample R2: predictions are made on the training rows themselves
train_pred_ridge = ridge_regressor.predict(X_train_scaled)
train_r2_ridge = r2_score(y_train_full, train_pred_ridge)
print(f"Ridge Regression R2 score: {train_r2_ridge}")

Ridge Regression R2 score: 0.3904320459904407


### Lasso

In [23]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score

# Lasso with the regularisation strength picked by 5-fold cross-validation;
# the iteration cap is raised to 10000.
lasso = LassoCV(cv=5, max_iter=10000).fit(X_train_scaled, y_train_full.ravel())

# In-sample R2: predictions are made on the full training set
train_pred_lasso = lasso.predict(X_train_scaled)
train_r2_lasso = r2_score(y_train_full, train_pred_lasso)
print(f"Lasso R2 score: {train_r2_lasso}")

Lasso R2 score: 0.23843736056118037


In [24]:
# Count the features the fitted lasso actually uses, i.e. those whose
# coefficient was not shrunk to exactly zero
coefficients = lasso.coef_
non_zero_coefs = np.count_nonzero(coefficients != 0)
print(f"Number of features selected by Lasso: {non_zero_coefs}")

Number of features selected by Lasso: 43


In [25]:
# Restrict both matrices to the columns with a non-zero lasso coefficient
lasso_support = coefficients != 0
X_train_lasso_selected = X_train_scaled[:, lasso_support]
X_test_lasso_selected = X_test_scaled[:, lasso_support]

# Refit a cross-validated lasso on the reduced feature set
lasso_final = LassoCV(cv=5, random_state=42)
lasso_final.fit(X_train_lasso_selected, y_train_full.ravel())

# In-sample R2 of the refitted model on the full training set
train_pred_lasso_final = lasso_final.predict(X_train_lasso_selected)
train_r2_lasso_final = r2_score(y_train_full, train_pred_lasso_final)
print(f"Final Lasso r2 score: {train_r2_lasso_final}")

Final Lasso r2 score: 0.30383685970355145


# Export to csv

In [1]:
# Exporting to required format for Kaggle: one row per test sample with a
# 0-based 'id' column and the predicted value in 'y'.
#
# This cell depends on `linear_regressor` and `X_test_scaled` from the cells
# above; running it on a fresh kernel before them raises NameError.
from pathlib import Path

y_test_pred = linear_regressor.predict(X_test_scaled)

table = pd.DataFrame({'id': np.arange(0, y_test_pred.shape[0]), 'y': y_test_pred.flatten()})
print(table.shape)

# to_csv does not create missing directories, so make sure the target exists
predictions_dir = Path('./data/predictions')
predictions_dir.mkdir(parents=True, exist_ok=True)
table.to_csv(predictions_dir / 'y_test_pred.csv', index=False)

NameError: name 'linear_regressor' is not defined