# Experiment Two
- using simple imputer (most-frequent) to handle nan values 
- using one hot enocder to encode the categorical features
- unsing tree based models with diffrent `n_estimator` values


In [2]:
import pandas as pd 
import numpy as np 
import time
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score , root_mean_squared_error,explained_variance_score,mean_absolute_error
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
#models


from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor

In [3]:
def evaluate_model(model,model_name:str,train:tuple,test:tuple):
    start_train=time.time()
    model.fit(train[0],train[1])
    end_train=time.time()
    start_test=time.time()
    preds = model.predict(test[0])
    end_test=time.time()
    r2=r2_score(test[1],preds)
    mae=mean_absolute_error(test[1],preds)
    ev=explained_variance_score(test[1],preds)
    rmse=root_mean_squared_error(test[1],preds)
    return {"model":model_name,
           "r2":r2,
            "ev":ev,
            'mae':mae,
            "rmse":rmse,
            'training_time(ms)':1000*(end_train-start_train),
            'testing_time(ms)':1000*(end_test-start_test)
           }

In [4]:
imputer=make_column_transformer((SimpleImputer(strategy='most_frequent'),[1]),remainder='passthrough')
enocoder=make_column_transformer((OneHotEncoder(sparse_output=False,drop='first'),[0,1]),remainder='passthrough')
preprocessor=make_pipeline(imputer,enocoder)

In [5]:
df=pd.read_csv("../data/cleaned_data.csv")

In [6]:
X=df.drop("temperature_(c)",axis='columns')
y=df['temperature_(c)']


In [7]:
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=42)

In [8]:
X_train_processed=preprocessor.fit_transform(Xtrain)
X_test_processed=preprocessor.transform(Xtest)
train=(X_train_processed,ytrain)
test=(X_test_processed,ytest)

In [9]:
n_estimators=[50,100,200,500]
models=[RandomForestRegressor(),GradientBoostingRegressor(),AdaBoostRegressor(),XGBRegressor()]

In [10]:
def evaluate_models(models:list,estimator:int,train:tuple,test:tuple)->pd.DataFrame:
    result=[]
    for model in models:
        print(f">>>> {model.__class__.__name__}")
        model.n_estimators=estimator
        model_result=evaluate_model(model=model,model_name=model.__class__.__name__,train=train,test=test)
        result.append(model_result)
    result=pd.DataFrame(result)
    result["n_estimator"]=estimator
    return result
    

In [11]:
def try_n_estimators(models:list,n_estimators:list,train:tuple,test:tuple):
    results=[]
    for estim in n_estimators:
        print(f">>{estim}")
        results.append(evaluate_models(models,estim,train,test))
        
    return pd.concat(results)

In [12]:
rs=try_n_estimators(models,n_estimators=n_estimators,train=train,test=test)

>>50
>>>> RandomForestRegressor
>>>> GradientBoostingRegressor
>>>> AdaBoostRegressor
>>>> XGBRegressor
>>100
>>>> RandomForestRegressor
>>>> GradientBoostingRegressor
>>>> AdaBoostRegressor
>>>> XGBRegressor
>>200
>>>> RandomForestRegressor
>>>> GradientBoostingRegressor
>>>> AdaBoostRegressor
>>>> XGBRegressor
>>500
>>>> RandomForestRegressor
>>>> GradientBoostingRegressor
>>>> AdaBoostRegressor
>>>> XGBRegressor


In [13]:
rs

Unnamed: 0,model,r2,ev,mae,rmse,training_time(ms),testing_time(ms),n_estimator
0,RandomForestRegressor,0.762912,0.762927,3.544007,4.674385,18928.482294,298.428297,50
1,GradientBoostingRegressor,0.72205,0.72205,4.052338,5.061196,4187.723875,21.774054,50
2,AdaBoostRegressor,0.636869,0.639029,4.813248,5.784971,5234.025478,208.335876,50
3,XGBRegressor,0.76121,0.761211,3.646028,4.691127,174.039364,16.143322,50
0,RandomForestRegressor,0.766034,0.766068,3.518368,4.643502,37520.561457,585.161209,100
1,GradientBoostingRegressor,0.733877,0.733877,3.940052,4.952346,8341.814041,33.921003,100
2,AdaBoostRegressor,0.636989,0.639116,4.807844,5.784011,5106.911182,196.68746,100
3,XGBRegressor,0.764159,0.764159,3.611843,4.662077,356.902361,34.499645,100
0,RandomForestRegressor,0.766644,0.766667,3.51496,4.637446,75135.210752,1171.362638,200
1,GradientBoostingRegressor,0.743298,0.743298,3.849811,4.863896,16603.987932,55.552959,200


In [14]:
rs.to_csv("../result/exp2.csv",index=False)