In [3]:
import pandas as pd
import numpy as np


# Load the filtered dataset and split it into predictors (X) and target (y).
df = pd.read_csv('data_filtered.csv')
y = df['critical_temp']
X = df.drop('critical_temp', axis=1)

# Rank every column by the absolute value of its correlation with the target,
# strongest first. The helper column 'abs' exists only to drive the sort.
target_corr = df.corr()['critical_temp'].to_frame()
target_corr['abs'] = target_corr['critical_temp'].abs()
ranked = target_corr.sort_values(by='abs', ascending=False)
ranked = ranked.drop('abs', axis=1).dropna().reset_index()

# reset_index() moves the column names into an 'index' column; call it 'feature'.
# .loc[1:] discards row 0 of the ranking -- presumably critical_temp's own
# self-correlation (TODO confirm) -- so the remaining labels start at 1.
corr = ranked.rename(columns={'index': 'feature'}).loc[1:]

In [13]:
import time
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import Lasso,Ridge,ElasticNet, BayesianRidge, LinearRegression
from sklearn import neighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Registry of regressors to benchmark, keyed by the label used in result tables.
# Every estimator is built with its library-default hyper-parameters.
models = {
    'OLS': LinearRegression(),
    'ElasticNet': ElasticNet(),
    'BayesianRidge': BayesianRidge(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'KNN': neighbors.KNeighborsRegressor(),
    'rff': RandomForestRegressor(),
}

# Parameters of the target transform used by model_performance:
#   forward:  t = y**e - intercept      inverse:  y = (t + intercept)**(1/e)
# NOTE(review): the origin of these two values is not shown in this notebook --
# presumably fitted in an earlier analysis; confirm before changing them.
e = 0.1515151515151516
intercept = 1.5542



def model_performance(X,y,i=0):
    """Fit every regressor in ``models`` on one train/test split and score it.

    The target is transformed as ``y**e - intercept`` before fitting. Both the
    held-out targets and the predictions are mapped back with
    ``(t + intercept)**(1/e)``, so RMSE and R2 are reported on the original
    scale of ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Its column count is reported in the ``features`` column.
    y : pandas.Series
        Target values on their original scale (expected to be non-negative,
        since they are raised to the fractional power ``e``).
    i : int, optional
        ``random_state`` passed to ``train_test_split``. Defaults to 0 so the
        function can be called as ``model_performance(X, y)``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``models`` with the columns ``model``, ``RMSE``,
        ``R2 score``, ``time`` (fit + predict seconds) and ``features``.

    Notes
    -----
    The estimators stored in the module-level ``models`` dict are fitted in
    place, so after the call they hold the fit from this split.
    """
    inverse_power = 1 / e

    def to_original_scale(values):
        # The inverse transform is only defined for a non-negative base. A model
        # may predict below that domain; clip to 0 instead of letting the
        # fractional power produce NaN/complex values that break the metrics.
        base = np.asarray(values, dtype=float) + intercept
        return np.clip(base, 0, None) ** inverse_power

    # Vectorised forward transform of the target.
    y_t = y ** e - intercept
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_t, test_size=0.3, random_state=i)
    y_test0 = to_original_scale(y_test)

    n_features = len(X.columns)
    rows = []
    for name, model in models.items():
        # perf_counter is monotonic and high-resolution, unlike time.time().
        t0 = time.perf_counter()
        model.fit(X_train, y_train)
        train_time = time.perf_counter() - t0

        t1 = time.perf_counter()
        pred = model.predict(X_test)
        predict_time = time.perf_counter() - t1

        pred = to_original_scale(pred)
        rows.append({
            'model': name,
            'RMSE': np.sqrt(mean_squared_error(y_test0, pred)),
            'R2 score': r2_score(y_test0, pred),
            'time': train_time + predict_time,
            'features': n_features,
        })

    # Explicit column list keeps the schema stable even when ``models`` is empty.
    return pd.DataFrame(rows, columns=['model', 'RMSE', 'R2 score', 'time', 'features'])

In [None]:
# Baseline: score every model on the full feature set. The split seed is passed
# explicitly because model_performance takes it as its third argument.
model_performance(X, y, i=0)

In [15]:
# Sweep over feature-set sizes: for each label k in corr.index, score all models
# on the features ranked 1..k (label-based .loc[:k] is inclusive).
# NOTE(review): the split seed is tied to k (i=index), so rows for different
# feature counts are evaluated on different train/test splits.
results = []
for index in corr.index:
    features = list(corr['feature'].loc[:index])
    results.append(model_performance(df[features], y, i=index))

# Concatenate once instead of growing a DataFrame inside the loop. Each
# per-run index (one row per model) is kept, as before, so the saved CSV
# layout does not change.
tables = pd.concat(results, axis=0) if results else pd.DataFrame()
tables.to_csv('seven_regressor_results_fitted.csv')
tables.sort_values(by='RMSE',ascending=True)

Unnamed: 0,R2 score,RMSE,model,time,features
6,0.910412,10.300083,rff,1.652270,30
6,0.909193,10.353291,rff,1.324260,21
6,0.909333,10.363255,rff,1.895687,33
6,0.909493,10.385952,rff,1.181619,18
6,0.907181,10.406568,rff,1.015188,16
6,0.908327,10.419545,rff,2.053712,35
6,0.907216,10.455691,rff,1.585692,29
6,0.907215,10.457843,rff,1.564921,28
6,0.905499,10.513869,rff,1.412847,24
6,0.905109,10.526264,rff,1.905209,32
