### Installing packages

In [1]:
%pip install pandas
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Importing packages and loading csv files

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold


In [16]:
train_df=pd.read_csv("train.csv")
train_df.head()

Unnamed: 0,target,f1,f2,f3,f4,f5,f6
0,27.4,47.2,40.2,-16.0,13,7.9,31.7
1,15.6,40.6,21.9,-11.5,20,5.4,16.5
2,23.6,47.7,27.9,-12.6,46,6.7,22.4
3,38.9,82.7,95.5,-28.5,26,13.8,55.4
4,44.5,71.0,141.5,-23.8,85,12.0,80.9


In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  10999 non-null  float64
 1   f1      10999 non-null  float64
 2   f2      10999 non-null  float64
 3   f3      10999 non-null  float64
 4   f4      10999 non-null  int64  
 5   f5      10999 non-null  float64
 6   f6      10999 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 601.6 KB


In [18]:
test_df=pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6
0,1,129.3,663.7,-75.3,52,29.3,298.0
1,2,143.1,687.3,-82.6,63,30.7,306.2
2,3,52.3,32.0,-10.8,39,7.1,24.9
3,4,25.1,0.5,-5.6,8,3.3,0.5
4,5,112.7,701.3,-59.2,40,24.0,305.2


In [19]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5383 entries, 0 to 5382
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      5383 non-null   int64  
 1   f1      5383 non-null   float64
 2   f2      5383 non-null   float64
 3   f3      5383 non-null   float64
 4   f4      5383 non-null   int64  
 5   f5      5383 non-null   float64
 6   f6      5383 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 294.5 KB


### Preprocessing data

In [23]:
scale=StandardScaler()

X_train=train_df[['f1','f2','f3','f4','f5','f6']]
y_train=train_df[['target']].values.ravel()

X_test = test_df[['f1','f2','f3','f4','f5','f6']]

poly=PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
X_train_poly=poly.fit_transform(X_train)
X_test_poly=poly.transform(X_test)

In [24]:
X_train_scaled=scale.fit_transform(X_train_poly)
X_test_scaled=scale.transform(X_test_poly)

selector=SelectKBest(score_func=f_regression, k=8) #Selecting top 8 features
X_train_selected=selector.fit_transform(X_train_scaled,y_train)
X_test_selected=selector.transform(X_test_scaled)

### Training and choosing best model

In [25]:
cv=KFold(n_splits=5,shuffle=True,random_state=42)#to be used for GridSearchCV 'cv' parameter

In [26]:
#Linear Regression 

model1 =LinearRegression()
#print(model1.get_params())
parameters1 = {'fit_intercept':[True,False]}

linear_grid = GridSearchCV(model1 ,parameters1,scoring="r2", cv=cv)
linear_grid.fit(X_train_selected, y_train)
# print(linear_grid.best_params_)

linear_best=linear_grid.best_estimator_              

y_check1=linear_best.predict(X_train_selected)

In [27]:
#Ridge Regression

model2=Ridge()
alpha_lis1= np.logspace(-3,2,10)#10^-3 to 10^2 logarithmically spaced to get decent range of alpha
solver_lis=['auto','svd','cholesky']
parameters2={'alpha':alpha_lis1,'solver':solver_lis,'max_iter':[5000]}

ridge_grid=GridSearchCV(model2,parameters2,scoring="r2",cv=cv)
ridge_grid.fit(X_train_selected,y_train)
# print(ridge_grid.best_params_)

ridge_best=ridge_grid.best_estimator_

y_check2=ridge_best.predict(X_train_selected)

In [28]:
#Lasso Regression

model3=Lasso()

#print(model3.get_params())

alpha_lis2= np.logspace(-3,2,10)
parameters3={'alpha':alpha_lis2,'max_iter':[5000]}

lasso_grid=GridSearchCV(model3,parameters3,scoring="r2",cv=cv)
lasso_grid.fit(X_train_selected,y_train)
# print(lasso_grid.best_params_)

lasso_best=lasso_grid.best_estimator_

y_check3=lasso_best.predict(X_train_selected)

In [29]:
#Random Forest Regressor

model4 = RandomForestRegressor()

#print(model4.get_params())

parameters4 = {'n_estimators':[50, 100, 200],'max_depth':[5,10,15]} 

rf_grid=GridSearchCV(model4,parameters4,scoring="r2",cv=cv)
rf_grid.fit(X_train_selected,y_train)
# print(rf_grid.best_params_)

rf_best=rf_grid.best_estimator_

y_check4=rf_best.predict(X_train_selected)

In [30]:
linear_r2=r2_score(y_train, y_check1)
ridge_r2=r2_score(y_train, y_check2)
lasso_r2=r2_score(y_train, y_check3)
rf_r2=r2_score(y_train, y_check4)

mod_dict={'linear':linear_r2,'ridge':ridge_r2,'lasso':lasso_r2,'random forest':rf_r2}
max_r2=max(mod_dict.values())
candidates=[x for x in mod_dict if mod_dict[x]==max_r2]
final_mod=candidates[0]

match final_mod:
    case 'linear': 
        y_predict=linear_best.predict(X_test_selected)
    case 'ridge':
        y_predict=ridge_best.predict(X_test_selected)
    case 'lasso':
        y_predict=lasso_best.predict(X_test_selected)
    case 'random forest':
        y_predict=rf_best.predict(X_test_selected)

print('Max R2: ',max_r2)
print(final_mod)

Max R2:  0.8829134773928743
random forest


### Save results to csv file

In [31]:
id_lis=[x for x in test_df['id']]

target_lis=list(y_predict)
columns={'id':id_lis,'target':target_lis}

final_df=pd.DataFrame(columns)
final_df

Unnamed: 0,id,target
0,1,5.783883
1,2,3.629699
2,3,30.851608
3,4,12.042200
4,5,1.877615
...,...,...
5378,5379,40.877504
5379,5380,27.270707
5380,5381,33.914804
5381,5382,38.232634


In [32]:
final_df.to_csv('hopeful_predictions.csv',index=False)