In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,make_scorer
from sklearn.model_selection import train_test_split,ParameterGrid,KFold,cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures

In [2]:
# Read the dataset from the CSV file and report its dimensions.
df = pd.read_csv('processed_outliers.csv')
print('Data shape:', df.shape)

Data shape: (519, 7)


In [3]:
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
print(X.shape, y.shape, sep='\n')

(519, 6)
(519,)


In [4]:
# Hold out 10% of the rows as a test split; the search below only uses the
# training portion.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Candidate PolynomialFeatures settings to evaluate.
param_grid = {
    'degree': range(2,11),
    'interaction_only': [True, False],
    'include_bias': [True, False]
}

# The fold assignment does not depend on the candidate, so build it once.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

best_params = {}
best_r2 = -1e9

for params in ParameterGrid(param_grid):
    # Expansion, scaling and regression are wrapped in a single pipeline so
    # that cross_val_score refits the scaler on each fold's training part
    # only. Fitting the scaler on the whole training split beforehand would
    # leak validation-fold statistics into the cross-validated score.
    model = make_pipeline(
        PolynomialFeatures(**params),
        StandardScaler(),
        LinearRegression(),
    )
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2')
    avg_r2 = np.mean(scores)
    # Only candidates whose mean R^2 lies strictly between 0 and 1 are
    # eligible. If none qualifies, best_params stays empty and best_r2 keeps
    # its sentinel value.
    if 0 < avg_r2 < 1 and avg_r2 > best_r2:
        best_r2 = avg_r2
        best_params = params

print(best_r2)
print(best_params)

0.7192288021787714
{'degree': 3, 'include_bias': False, 'interaction_only': False}


In [5]:
# Fit the polynomial expansion, the scaler and the regressor on the training
# split, using the PolynomialFeatures settings stored in best_params.
poly_features = PolynomialFeatures(**best_params)
sc = StandardScaler()
X_train_poly = sc.fit_transform(poly_features.fit_transform(X_train))

reg = LinearRegression().fit(X_train_poly, y_train)

# R^2 measured on the same rows the model was fitted to.
train_pred = reg.predict(X_train_poly)
print('Train r2:', r2_score(y_train, train_pred))

Train r2: 0.8730565633112294


In [6]:
# Push the held-out rows through the already-fitted expansion and scaler
# (transform only, nothing is refitted), then score the regressor on them.
X_test_poly = sc.transform(poly_features.transform(X_test))
test_pred = reg.predict(X_test_poly)
print('Test r2:', r2_score(y_test, test_pred))

Test r2: 0.8726340767825405
