# Experiment one
- using a simple imputer (most-frequent) to handle NaN values
- using a one-hot encoder to encode the categorical features
- using a couple of models with default parameters
- trying both with normalisation (min-max/standard) and without normalisation

In [157]:
import pandas as pd 
import numpy as np 
import time
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score , root_mean_squared_error,explained_variance_score,mean_absolute_error
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
#models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,HistGradientBoostingRegressor,BaggingRegressor,AdaBoostRegressor

In [172]:
def evaluate_model(model, model_name: str, train: tuple, test: tuple) -> dict:
    """Fit ``model`` on ``train``, predict on ``test`` and collect metrics.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Label stored under the ``"model"`` key of the returned dict.
    train : tuple
        ``(features, target)``; read as ``train[0]`` and ``train[1]``.
    test : tuple
        ``(features, target)``; read as ``test[0]`` and ``test[1]``.

    Returns
    -------
    dict
        Keys ``"model"``, ``"r2"``, ``"ev"`` (explained variance), ``"mae"``,
        ``"rmse"``, ``"training_time(ms)"`` and ``"testing_time(ms)"``. The
        two timings are the elapsed durations of ``fit`` and ``predict`` in
        milliseconds.
    """
    # perf_counter() is monotonic and high-resolution, so short predict()
    # calls are not rounded to the system clock tick and the measurement is
    # immune to wall-clock adjustments (unlike time.time()).
    fit_start = time.perf_counter()
    model.fit(train[0], train[1])
    fit_elapsed = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    preds = model.predict(test[0])
    predict_elapsed = time.perf_counter() - predict_start

    y_true = test[1]
    return {
        "model": model_name,
        "r2": r2_score(y_true, preds),
        "ev": explained_variance_score(y_true, preds),
        'mae': mean_absolute_error(y_true, preds),
        "rmse": root_mean_squared_error(y_true, preds),
        # Seconds -> milliseconds.
        'training_time(ms)': 1000 * fit_elapsed,
        'testing_time(ms)': 1000 * predict_elapsed,
    }

In [18]:
# Read the cleaned dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/cleaned_data.csv')

In [20]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,summary,precip_type,temperature_(c),humidity,wind_speed_(km/h),wind_bearing_(degrees),visibility_(km),loud_cover,pressure_(millibars)
0,Partly Cloudy,rain,9.472222,0.89,14.1197,251.0,15.8263,0.0,1015.13
1,Partly Cloudy,rain,9.355556,0.86,14.2646,259.0,15.8263,0.0,1015.63
2,Mostly Cloudy,rain,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94
3,Partly Cloudy,rain,8.288889,0.83,14.1036,269.0,15.8263,0.0,1016.41
4,Mostly Cloudy,rain,8.755556,0.83,11.0446,259.0,15.8263,0.0,1016.51


In [24]:
# Count the missing values in each column.
df.isna().sum()

summary                     0
precip_type               517
temperature_(c)             0
humidity                    0
wind_speed_(km/h)           0
wind_bearing_(degrees)      0
visibility_(km)             0
loud_cover                  0
pressure_(millibars)        0
dtype: int64

In [67]:
# Replace missing values in the column at position 1 with that column's most
# frequent value; every other column is passed through unchanged.
imputer = make_column_transformer(
    (SimpleImputer(strategy='most_frequent'), [1]),
    remainder='passthrough',
)

In [132]:
# One-hot encode the columns at positions 0 and 1 as a dense array, dropping
# the first category of each; the remaining columns are passed through.
enocoder = make_column_transformer(
    (OneHotEncoder(drop='first', sparse_output=False), [0, 1]),
    remainder='passthrough',
)

In [134]:
# Chain the two preprocessing steps: impute first, then encode.
preprocessor = make_pipeline(
    imputer,
    enocoder,
)

In [89]:
# Split the frame into the regression target and the feature columns.
y = df['temperature_(c)']
X = df.drop(columns="temperature_(c)")


In [91]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [136]:
# Fit the preprocessing on the training split only, then apply the already
# fitted preprocessing to the test split.
X_train_processed = preprocessor.fit_transform(Xtrain)
X_test_processed = preprocessor.transform(Xtest)

In [184]:
# Candidate regressors, all with their default hyper-parameters.
models = [
    LinearRegression(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    AdaBoostRegressor(),
    HistGradientBoostingRegressor(),
    BaggingRegressor(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
]

## without normalisation


In [188]:
# Fit and score every candidate model on the current feature matrices,
# storing each model's hyper-parameters alongside its metrics.
result = []
for model in models:
    model_name = type(model).__name__
    print(f">> {model_name}")
    model_result = evaluate_model(
        model=model,
        model_name=model_name,
        train=(X_train_processed, ytrain),
        test=(X_test_processed, ytest),
    )
    model_result['params'] = model.get_params()
    result.append(model_result)
    

>> LinearRegression
>> RandomForestRegressor
>> GradientBoostingRegressor
>> AdaBoostRegressor
>> HistGradientBoostingRegressor
>> BaggingRegressor
>> KNeighborsRegressor
>> DecisionTreeRegressor


In [190]:
# Display the metrics collected for the run without normalisation.
result

[{'model': 'LinearRegression',
  'r2': 0.6139488810520279,
  'ev': 0.6139491501644017,
  'mae': 4.88681120176865,
  'rmse': 5.964745703061994,
  'training_time(ms)': 120.44072151184082,
  'testing_time(ms)': 10.970830917358398,
  'params': {'copy_X': True,
   'fit_intercept': True,
   'n_jobs': None,
   'positive': False}},
 {'model': 'RandomForestRegressor',
  'r2': 0.765042730216422,
  'ev': 0.7650723618771509,
  'mae': 3.5284242320828936,
  'rmse': 4.653331245464801,
  'training_time(ms)': 53897.356033325195,
  'testing_time(ms)': 809.8082542419434,
  'params': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'criterion': 'squared_error',
   'max_depth': None,
   'max_features': 1.0,
   'max_leaf_nodes': None,
   'max_samples': None,
   'min_impurity_decrease': 0.0,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'monotonic_cst': None,
   'n_estimators': 100,
   'n_jobs': None,
   'oob_score': False,
   'random_state': None,
   'verbose': 0,
  

In [192]:
import json 


In [202]:
# Persist the metrics of the run without normalisation as indented JSON.
with open("../result/exp_1_non_normalisation.json", mode='w') as f:
    f.write(json.dumps(result, indent=4))

## with normalisation

### MinMaxScaler

In [209]:
# Min-max scale the preprocessed features. The preprocessor and the scaler
# are both fitted on the training split only; the test split is only
# transformed. Refitting the preprocessor on the test split would leak
# test-set information (imputation value, category set) into preprocessing
# and could yield columns that do not line up with the training matrix.
scaler = MinMaxScaler()
X_train_processed = scaler.fit_transform(preprocessor.fit_transform(Xtrain))
X_test_processed = scaler.transform(preprocessor.transform(Xtest))

In [211]:
# Fit and score every candidate model on the current feature matrices,
# storing each model's hyper-parameters alongside its metrics.
result = []
for model in models:
    model_name = type(model).__name__
    print(f">> {model_name}")
    model_result = evaluate_model(
        model=model,
        model_name=model_name,
        train=(X_train_processed, ytrain),
        test=(X_test_processed, ytest),
    )
    model_result['params'] = model.get_params()
    result.append(model_result)
    

>> LinearRegression
>> RandomForestRegressor
>> GradientBoostingRegressor
>> AdaBoostRegressor
>> HistGradientBoostingRegressor
>> BaggingRegressor
>> KNeighborsRegressor
>> DecisionTreeRegressor


In [212]:
# Display the metrics collected for the min-max scaled run.
result

[{'model': 'LinearRegression',
  'r2': 0.6139488810520279,
  'ev': 0.6139491501644019,
  'mae': 4.886811201768657,
  'rmse': 5.964745703061994,
  'training_time(ms)': 54.90517616271973,
  'testing_time(ms)': 0.9975433349609375,
  'params': {'copy_X': True,
   'fit_intercept': True,
   'n_jobs': None,
   'positive': False}},
 {'model': 'RandomForestRegressor',
  'r2': 0.7642967315884153,
  'ev': 0.7643244196369234,
  'mae': 3.5278622618588216,
  'rmse': 4.660712646458326,
  'training_time(ms)': 51508.06260108948,
  'testing_time(ms)': 923.039436340332,
  'params': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'criterion': 'squared_error',
   'max_depth': None,
   'max_features': 1.0,
   'max_leaf_nodes': None,
   'max_samples': None,
   'min_impurity_decrease': 0.0,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'monotonic_cst': None,
   'n_estimators': 100,
   'n_jobs': None,
   'oob_score': False,
   'random_state': None,
   'verbose': 0,
   

In [213]:
# Persist the metrics of the min-max scaled run as indented JSON.
with open("../result/exp_1_normalisation_minmax.json", mode='w') as f:
    f.write(json.dumps(result, indent=4))

### StandardScaler

In [218]:
# Standardise the preprocessed features. The preprocessor and the scaler are
# both fitted on the training split only; the test split is only
# transformed. Refitting the preprocessor on the test split would leak
# test-set information (imputation value, category set) into preprocessing
# and could yield columns that do not line up with the training matrix.
scaler = StandardScaler()
X_train_processed = scaler.fit_transform(preprocessor.fit_transform(Xtrain))
X_test_processed = scaler.transform(preprocessor.transform(Xtest))

In [220]:
# Fit and score every candidate model on the current feature matrices,
# storing each model's hyper-parameters alongside its metrics.
result = []
for model in models:
    model_name = type(model).__name__
    print(f">> {model_name}")
    model_result = evaluate_model(
        model=model,
        model_name=model_name,
        train=(X_train_processed, ytrain),
        test=(X_test_processed, ytest),
    )
    model_result['params'] = model.get_params()
    result.append(model_result)

>> LinearRegression
>> RandomForestRegressor
>> GradientBoostingRegressor
>> AdaBoostRegressor
>> HistGradientBoostingRegressor
>> BaggingRegressor
>> KNeighborsRegressor
>> DecisionTreeRegressor


In [221]:
# Persist the metrics of the standard-scaled run as indented JSON.
with open("../result/exp_1_normalisation_standard.json", mode='w') as f:
    f.write(json.dumps(result, indent=4))

In [222]:
# Display the metrics collected for the standard-scaled run.
result

[{'model': 'LinearRegression',
  'r2': 0.613948881052028,
  'ev': 0.6139491501644017,
  'mae': 4.886811201768627,
  'rmse': 5.964745703061993,
  'training_time(ms)': 69.5958137512207,
  'testing_time(ms)': 0.9837150573730469,
  'params': {'copy_X': True,
   'fit_intercept': True,
   'n_jobs': None,
   'positive': False}},
 {'model': 'RandomForestRegressor',
  'r2': 0.7653985477222763,
  'ev': 0.7654319518128799,
  'mae': 3.5249917083767714,
  'rmse': 4.649806425711224,
  'training_time(ms)': 52370.85223197937,
  'testing_time(ms)': 789.0846729278564,
  'params': {'bootstrap': True,
   'ccp_alpha': 0.0,
   'criterion': 'squared_error',
   'max_depth': None,
   'max_features': 1.0,
   'max_leaf_nodes': None,
   'max_samples': None,
   'min_impurity_decrease': 0.0,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'monotonic_cst': None,
   'n_estimators': 100,
   'n_jobs': None,
   'oob_score': False,
   'random_state': None,
   'verbose': 0,
   '