# Experiment one
- using simple imputer (most-frequent) to handle nan values 
- using one hot enocder to encode the categorical features
- using caple of models with default parameters
- try both normalisation (minmax/standard) and without normalisation 

In [15]:
import pandas as pd 
import numpy as np 
import time
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score , root_mean_squared_error,explained_variance_score,mean_absolute_error
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
#models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,HistGradientBoostingRegressor,BaggingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor

In [3]:
def evaluate_model(model,model_name:str,train:tuple,test:tuple):
    start_train=time.time()
    model.fit(train[0],train[1])
    end_train=time.time()
    start_test=time.time()
    preds = model.predict(test[0])
    end_test=time.time()
    r2=r2_score(test[1],preds)
    mae=mean_absolute_error(test[1],preds)
    ev=explained_variance_score(test[1],preds)
    rmse=root_mean_squared_error(test[1],preds)
    return {"model":model_name,
           "r2":r2,
            "ev":ev,
            'mae':mae,
            "rmse":rmse,
            'training_time(ms)':1000*(end_train-start_train),
            'testing_time(ms)':1000*(end_test-start_test)
           }

In [4]:
df= pd.read_csv('../data/cleaned_data.csv')

In [5]:
df.head()

Unnamed: 0,summary,precip_type,temperature_(c),humidity,wind_speed_(km/h),wind_bearing_(degrees),visibility_(km),loud_cover,pressure_(millibars)
0,Partly Cloudy,rain,9.472222,0.89,14.1197,251.0,15.8263,0.0,1015.13
1,Partly Cloudy,rain,9.355556,0.86,14.2646,259.0,15.8263,0.0,1015.63
2,Mostly Cloudy,rain,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94
3,Partly Cloudy,rain,8.288889,0.83,14.1036,269.0,15.8263,0.0,1016.41
4,Mostly Cloudy,rain,8.755556,0.83,11.0446,259.0,15.8263,0.0,1016.51


In [6]:
df.isna().sum()

summary                     0
precip_type               517
temperature_(c)             0
humidity                    0
wind_speed_(km/h)           0
wind_bearing_(degrees)      0
visibility_(km)             0
loud_cover                  0
pressure_(millibars)        0
dtype: int64

In [7]:
imputer=make_column_transformer((SimpleImputer(strategy='most_frequent'),[1]),remainder='passthrough')

In [8]:
enocoder=make_column_transformer((OneHotEncoder(sparse_output=False,drop='first'),[0,1]),remainder='passthrough')

In [9]:
preprocessor=make_pipeline(imputer,enocoder)

In [10]:
X=df.drop("temperature_(c)",axis='columns')
y=df['temperature_(c)']


In [11]:
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=42)

In [13]:
X_train_processed=preprocessor.fit_transform(Xtrain)
X_test_processed=preprocessor.transform(Xtest)

In [14]:
train=(X_train_processed,ytrain)
test=(X_test_processed,ytest)


In [16]:
models=[LinearRegression(),RandomForestRegressor()
        ,GradientBoostingRegressor(),AdaBoostRegressor(),
        HistGradientBoostingRegressor(),BaggingRegressor(),KNeighborsRegressor(),DecisionTreeRegressor(),
        XGBRegressor()]

## without normalisation


In [None]:
def evaluate_models(models:list,train:tuple,test:tuple)->pd.DataFrame:
    result=[]
    for model in models:
        print(f">> {model.__class__.__name__}")
        model_result=evaluate_model(model=model,model_name=model.__class__.__name__,train=train,test=test)
        result.append(model_result)
    return pd.DataFrame(result)
    

In [20]:
result= evaluate_models(models,train,test)

>> LinearRegression
>> RandomForestRegressor
>> GradientBoostingRegressor
>> AdaBoostRegressor
>> HistGradientBoostingRegressor
>> BaggingRegressor
>> KNeighborsRegressor
>> DecisionTreeRegressor
>> XGBRegressor


In [21]:
result["normalisation"]='without'

In [22]:
result

Unnamed: 0,model,r2,ev,mae,rmse,training_time(ms),testing_time(ms),normalisation
0,LinearRegression,0.613949,0.613949,4.886811,5.964746,54.577589,8.0688,without
1,RandomForestRegressor,0.765099,0.765125,3.523486,4.652779,40761.748791,597.583294,without
2,GradientBoostingRegressor,0.73385,0.73385,3.94011,4.952597,8340.829611,34.037828,without
3,AdaBoostRegressor,0.631778,0.63403,4.846372,5.825385,6173.130274,238.43956,without
4,HistGradientBoostingRegressor,0.756128,0.756128,3.689902,4.740787,786.143064,82.454443,without
5,BaggingRegressor,0.742338,0.742384,3.690536,4.872978,4073.552132,136.718512,without
6,KNeighborsRegressor,0.412597,0.414553,5.693715,7.357622,136.784077,229.502201,without
7,DecisionTreeRegressor,0.54277,0.542771,4.688695,6.491376,617.469072,12.763262,without
8,XGBRegressor,0.764159,0.764159,3.611843,4.662077,535.102844,24.642706,without


## with normalisation

### minamaxscaler

In [23]:
scaler=MinMaxScaler()
X_train_processed=scaler.fit_transform(preprocessor.fit_transform(Xtrain))
X_test_processed=scaler.transform(preprocessor.fit_transform(Xtest))

In [24]:
train=(X_train_processed,ytrain)
test=(X_test_processed,ytest)


In [25]:
result_min_max=evaluate_models(models,train,test)
    

>> LinearRegression
>> RandomForestRegressor
>> GradientBoostingRegressor
>> AdaBoostRegressor
>> HistGradientBoostingRegressor
>> BaggingRegressor
>> KNeighborsRegressor
>> DecisionTreeRegressor
>> XGBRegressor


In [34]:
result_min_max["normalisation"]='minmax'

In [35]:
result_min_max

Unnamed: 0,model,r2,ev,mae,rmse,training_time(ms),testing_time(ms),normalisation
0,LinearRegression,0.613949,0.613949,4.886811,5.964746,18.681049,0.448227,minmax
1,RandomForestRegressor,0.764817,0.764846,3.526974,4.655563,38765.964031,584.209204,minmax
2,GradientBoostingRegressor,0.733877,0.733877,3.940052,4.952346,8572.795868,29.089928,minmax
3,AdaBoostRegressor,0.641178,0.643134,4.769777,5.750547,2491.767406,26.433468,minmax
4,HistGradientBoostingRegressor,0.756553,0.756553,3.689776,4.736659,576.288223,78.017712,minmax
5,BaggingRegressor,0.744022,0.744058,3.67314,4.857029,3781.356573,67.10124,minmax
6,KNeighborsRegressor,0.642453,0.642474,4.431054,5.740317,98.927021,4011.75189,minmax
7,DecisionTreeRegressor,0.535682,0.535682,4.719469,6.541501,612.983942,8.545399,minmax
8,XGBRegressor,0.764159,0.764159,3.611843,4.662077,485.860109,16.670465,minmax


### StandardScaler

In [28]:
scaler=StandardScaler()
X_train_processed=scaler.fit_transform(preprocessor.fit_transform(Xtrain))
X_test_processed=scaler.transform(preprocessor.fit_transform(Xtest))

In [29]:
train=(X_train_processed,ytrain)
test=(X_test_processed,ytest)

In [30]:
result_standard= evaluate_models(models,train,test)

>> LinearRegression
>> RandomForestRegressor
>> GradientBoostingRegressor
>> AdaBoostRegressor
>> HistGradientBoostingRegressor
>> BaggingRegressor
>> KNeighborsRegressor
>> DecisionTreeRegressor
>> XGBRegressor


In [37]:
result_standard["normalisation"]='standard'

In [38]:
result_standard

Unnamed: 0,model,r2,ev,mae,rmse,training_time(ms),testing_time(ms),normalisation
0,LinearRegression,0.613949,0.613949,4.886811,5.964746,15.975952,0.396013,standard
1,RandomForestRegressor,0.765548,0.765578,3.525393,4.648329,38102.152824,577.699423,standard
2,GradientBoostingRegressor,0.733877,0.733877,3.94004,4.952345,8352.246046,29.322863,standard
3,AdaBoostRegressor,0.640326,0.64246,4.771294,5.757367,2304.634571,22.363424,standard
4,HistGradientBoostingRegressor,0.755727,0.755727,3.696863,4.744683,590.8494,81.080198,standard
5,BaggingRegressor,0.742675,0.742679,3.681497,4.869789,3720.239639,64.966917,standard
6,KNeighborsRegressor,0.654767,0.654922,4.34825,5.640604,101.540565,2218.598127,standard
7,DecisionTreeRegressor,0.539347,0.539347,4.70303,6.515634,591.415405,9.198666,standard
8,XGBRegressor,0.764159,0.764159,3.611843,4.662077,280.933142,27.636528,standard


In [41]:
exp1_result=pd.concat((result,result_min_max,result_standard))

In [42]:
exp1_result.to_csv("../result/exp1_result.csv",index=False)