In [78]:
import numpy as np
import pandas as pd

# Data preprocessing

### Read data and save to vector

In [79]:
from sklearn.preprocessing import StandardScaler

X_train_df = pd.read_csv('./data/X_train.csv', skiprows=1, header=None)
y_train_df = pd.read_csv('./data/y_train.csv', skiprows=1, header=None)
X_test_df = pd.read_csv('./data/X_test.csv', skiprows=1, header=None)

X_train_full = X_train_df.values[:, 1:]
y_train_full = y_train_df.values[:, 1:]
X_test = X_test_df.values[:, 1:]

print(X_train_full.shape, y_train_full.shape, X_test.shape)

(1212, 832) (1212, 1) (776, 832)


### Train and validation set
Used a 70-30 split, can be changed at `testSize=0.3`

In [80]:
from sklearn.model_selection import train_test_split

testSize = 0.3
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size = testSize)

### Imputing NaN

In [34]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy='mean')
X_train_imputed = mean_imputer.fit_transform(X_train)

### Feature selection

In [48]:
from sklearn.feature_selection import SelectKBest, f_regression

selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X_train_imputed, y_train.ravel())

p_values = selector.pvalues_
scores = selector.scores_

significant_feature_indices = np.where(p_values < 5e-2)[0]

print(f"Number of significant features: {len(significant_feature_indices)}")

Number of significant features: 97


  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


In [49]:
# Extract selected features from the original data
X_selected = X_train_full[:, significant_feature_indices]
X_train_selected = X_train[:, significant_feature_indices]
X_val_selected = X_val[:, significant_feature_indices]
X_test_selected = X_test[:, significant_feature_indices]

### Final NaN imputation

In [50]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Impute missing values using the selected features
iterative_imputer = IterativeImputer()
X_train_selected_imputed = iterative_imputer.fit_transform(X_train_selected)
X_selected_imputed = iterative_imputer.transform(X_selected)
X_val_selected_imputed = iterative_imputer.transform(X_val_selected)
X_test_selected_imputed = iterative_imputer.transform(X_test_selected)



### Outlier detection?

In [76]:
# TODO

### Scale features

In [73]:
from sklearn.preprocessing import StandardScaler

## Scale features, fit_transform on train set, scale other sets accordingly
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_outliers_removed)
X_val_scaled = scaler.transform(X_val_selected_imputed)
X_scaled = scaler.transform(X_selected_imputed)
X_test_scaled = scaler.transform(X_test_selected_imputed)

# Training set

### Linear

In [74]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Train the Linear Regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train_scaled, y_train.ravel())

# Predict on the validation set
y_val_pred_lr = linear_regressor.predict(X_val_scaled)
y_pred_lr = linear_regressor.predict(X_scaled)
mse_lr = mean_squared_error(y_val, y_val_pred_lr)
mse_full_lr = mean_squared_error(y_train_full, y_pred_lr)
print(f"Linear Regression Validation Mean Squared Error: {mse_lr}")
print(f"Linear Regression Full lMean Squared Error: {mse_full_lr}")

ValueError: Found input variables with inconsistent numbers of samples: [839, 848]

### Ridge

In [77]:
from sklearn.linear_model import RidgeCV

# Define a range of alpha values to test
alphas = np.logspace(-3, 3, 20)

# Train the Ridge Regression model with cross-validation
ridge_regressor = RidgeCV(alphas=alphas, cv=5)
ridge_regressor.fit(X_train_scaled, y_train.ravel())

# Predict on the validation set
y_val_pred_ridge = ridge_regressor.predict(X_val_scaled)
y_pred_ridge = ridge_regressor.predict(X_scaled)
mse_ridge = mean_squared_error(y_val, y_val_pred_ridge)
mse_full_ridge = mean_squared_error(y_train_full, y_pred_ridge)
print(f"Ridge Regression Validation Mean Squared Error: {mse_ridge}")
print(f"Ridge Regression fulll Mean Squared Error: {mse_full_ridge}")

ValueError: Found input variables with inconsistent numbers of samples: [839, 848]

### Lasso

In [67]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

# Train lasso
lasso = LassoCV(cv=5, max_iter=10000)
lasso.fit(X_train_scaled, y_train.ravel())

# Predict valudation set, and full train set
y_val_pred = lasso.predict(X_val_scaled)
y_pred = lasso.predict(X_scaled)
mse = mean_squared_error(y_val, y_val_pred)
mse_full = mean_squared_error(y_train_full, y_pred)
print(f"Validation Mean Squared Error: {mse}")
print(f"Full Mean Squared Error: {mse_full}")

Validation Mean Squared Error: 58.14802749665964
Full Mean Squared Error: 55.02543916660746


In [68]:
# Determine the features that lasso is actually using
coefficients = lasso.coef_
non_zero_coefs = np.sum(coefficients != 0)
print(f"Number of features selected by Lasso: {non_zero_coefs}")

Number of features selected by Lasso: 50


In [69]:
# Take only those features
X_train_lasso_selected = X_train_scaled[:, coefficients != 0]
X_full_lasso_selected = X_scaled[:, coefficients != 0]
X_val_lasso_selected = X_val_scaled[:, coefficients != 0]
X_test_lasso_selected = X_test_scaled[:, coefficients != 0]

# Retrain the model to the features
lasso_final = LassoCV(cv=5, random_state=42)
lasso_final.fit(X_train_lasso_selected, y_train.ravel())

# Predict again
y_val_pred_final = lasso_final.predict(X_val_lasso_selected)
y_full_pred_final = lasso_final.predict(X_full_lasso_selected)
mse_final = mean_squared_error(y_val, y_val_pred_final)
mse_final_full = mean_squared_error(y_train_full, y_full_pred_final)
print(f"Final Validation Mean Squared Error: {mse_final}")
print(f"Final Full Mean Squared Error: {mse_final_full}")

Final Validation Mean Squared Error: 58.132456641855825
Final Full Mean Squared Error: 54.202995552486115
