### Installing packages

In [1]:
%pip install pandas
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Importing packages and loading csv files

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


In [2]:
train_df=pd.read_csv("train.csv")
train_df.head()

Unnamed: 0,target,f1,f2,f3,f4,f5,f6
0,27.4,47.2,40.2,-16.0,13,7.9,31.7
1,15.6,40.6,21.9,-11.5,20,5.4,16.5
2,23.6,47.7,27.9,-12.6,46,6.7,22.4
3,38.9,82.7,95.5,-28.5,26,13.8,55.4
4,44.5,71.0,141.5,-23.8,85,12.0,80.9


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  10999 non-null  float64
 1   f1      10999 non-null  float64
 2   f2      10999 non-null  float64
 3   f3      10999 non-null  float64
 4   f4      10999 non-null  int64  
 5   f5      10999 non-null  float64
 6   f6      10999 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 601.6 KB


In [4]:
test_df=pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6
0,1,129.3,663.7,-75.3,52,29.3,298.0
1,2,143.1,687.3,-82.6,63,30.7,306.2
2,3,52.3,32.0,-10.8,39,7.1,24.9
3,4,25.1,0.5,-5.6,8,3.3,0.5
4,5,112.7,701.3,-59.2,40,24.0,305.2


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5383 entries, 0 to 5382
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      5383 non-null   int64  
 1   f1      5383 non-null   float64
 2   f2      5383 non-null   float64
 3   f3      5383 non-null   float64
 4   f4      5383 non-null   int64  
 5   f5      5383 non-null   float64
 6   f6      5383 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 294.5 KB


### Preprocessing data

In [6]:
scale=StandardScaler()

X_train=train_df[['f1','f2','f3','f4','f5','f6']]

X_train_scaled=scale.fit_transform(X_train)
X_train=pd.DataFrame(X_train_scaled,columns=X_train.columns)

y_train=train_df[['target']]

y_train_shaped = y_train.values.reshape(-1,1)
y_train=pd.DataFrame(y_train_shaped,columns=y_train.columns)

X_train

Unnamed: 0,f1,f2,f3,f4,f5,f6
0,-1.182322,-0.564646,0.069845,-0.942997,-1.023810,-0.682701
1,-1.306813,-0.579054,0.079276,-0.798131,-1.215486,-0.722807
2,-1.172891,-0.574330,0.076970,-0.260058,-1.115815,-0.707239
3,-0.512707,-0.521108,0.043649,-0.673960,-0.571457,-0.620168
4,-0.733397,-0.484892,0.053498,0.547052,-0.709463,-0.552886
...,...,...,...,...,...,...
10994,0.089003,-0.055887,-0.014403,2.451003,0.264247,0.123104
10995,-0.661720,-0.515675,0.052451,-0.736046,-0.686462,-0.602490
10996,0.241788,0.327612,0.041553,-0.301448,0.172243,0.527853
10997,1.416914,0.668359,-0.100327,0.464271,1.345296,0.893289


In [7]:
X_test = test_df[['f1','f2','f3','f4','f5','f6']]

X_test_scaled=scale.transform(X_test)
X_test=pd.DataFrame(X_test_scaled,columns=X_test.columns)

X_test

Unnamed: 0,f1,f2,f3,f4,f5,f6
0,0.366279,-0.073759,-0.054431,-0.135887,0.616930,0.019938
1,0.626580,-0.055178,-0.069729,0.091759,0.724268,0.041574
2,-1.086124,-0.571102,0.080743,-0.404924,-1.085146,-0.700643
3,-1.599180,-0.595902,0.091640,-1.046473,-1.376493,-0.765023
4,0.053164,-0.044156,-0.020690,-0.384229,0.210578,0.038935
...,...,...,...,...,...,...
5378,-0.322197,-0.506385,0.040086,-0.467009,-0.433451,-0.593783
5379,-0.473096,-0.521816,0.041972,-0.860216,-0.571457,-0.620168
5380,-0.399533,-0.380179,0.043229,-0.529094,-0.433451,-0.406448
5381,-0.612678,-0.387895,0.042182,0.795393,-0.433451,-0.365023


### Training and choosing best model

In [8]:
#Linear Regression 

model1 =LinearRegression()
#print(model1.get_params())
parameters1 = {'fit_intercept':[True,False], 
'n_jobs':[1,5,10,None]}

linear_grid = GridSearchCV(model1, parameters1, cv=5)
linear_grid.fit(X_train, y_train)
# print(linear_grid.best_params_)

linear_best=linear_grid.best_estimator_              
linear_best.fit(X_train,y_train)

y_check1=linear_best.predict(X_train)

In [9]:
#Ridge Regression

model2=Ridge()

alpha_lis1= np.arange(0.0,50.0,0.1)
solver_lis=['auto','svd','cholesky']
parameters2={'alpha':alpha_lis1,'fit_intercept':[True,False],'solver':solver_lis}

ridge_grid=GridSearchCV(model2,parameters2,scoring="neg_mean_squared_error",cv=5)
ridge_grid.fit(X_train,y_train)
# print(ridge_grid.best_params_)

ridge_best=ridge_grid.best_estimator_
ridge_best.fit(X_train,y_train)

y_check2=ridge_best.predict(X_train)

In [10]:
#Lasso Regression

model3=Lasso()

#print(model3.get_params())

alpha_lis2= np.arange(0.1,50.0,0.1)
parameters3={'alpha':alpha_lis2,'fit_intercept':[True,False],'precompute':[True,False]}

lasso_grid=GridSearchCV(model3,parameters3,scoring="neg_mean_squared_error",cv=5)
lasso_grid.fit(X_train,y_train)
# print(lasso_grid.best_params_)

lasso_best=lasso_grid.best_estimator_
lasso_best.fit(X_train,y_train)

y_check3=lasso_best.predict(X_train)

In [11]:
#Random Forest Regressor

model4 = RandomForestRegressor()

#print(model4.get_params())

parameters4 = {'n_estimators':[5,10,25,50], 
'max_depth':[2,3,6,10],'max_leaf_nodes':[2,3,6,10]} 

rf_grid=GridSearchCV(model4,parameters4,scoring="neg_mean_squared_error",cv=5)
rf_grid.fit(X_train,np.ravel(y_train))
# print(rf_grid.best_params_)

rf_best=rf_grid.best_estimator_
rf_best.fit(X_train,np.ravel(y_train))

y_check4=rf_best.predict(X_train)

In [12]:
linear_r2=r2_score(y_train, y_check1)
ridge_r2=r2_score(y_train, y_check2)
lasso_r2=r2_score(y_train, y_check3)
rf_r2=r2_score(y_train, y_check4)

mod_dict={'linear':linear_r2,'ridge':ridge_r2,'lasso':lasso_r2,'random forest':rf_r2}
max_r2=max(mod_dict.values())
candidates=[x for x in mod_dict if mod_dict[x]==max_r2]
final_mod=candidates[0]

match final_mod:
    case 'linear': 
        y_predict=linear_best.predict(X_test)
    case 'ridge':
        y_predict=ridge_best.predict(X_test)
    case 'lasso':
        y_predict=lasso_best.predict(X_test)
    case 'random forest':
        y_predict=rf_best.predict(X_test)

print('Max R2: ',max_r2)
print(final_mod)

Max R2:  0.2758539653497455
random forest


### Save results to csv file

In [13]:
id_lis=[x for x in test_df['id']]

target_lis=list(y_predict)
columns={'id':id_lis,'target':target_lis}

final_df=pd.DataFrame(columns)
final_df

Unnamed: 0,id,target
0,1,10.279322
1,2,7.322690
2,3,27.848146
3,4,22.404559
4,5,18.351134
...,...,...
5378,5379,28.266862
5379,5380,22.404559
5380,5381,30.878480
5381,5382,30.692586


In [14]:
final_df.to_csv('predictions3.csv',index=False)