In [8]:
import numpy as np 
import pandas as pd 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean



dataset = pd.read_csv("weatherHistory.csv")

# replacing null values by most often item
dataset["Precip Type"] = dataset["Precip Type"].fillna(dataset["Precip Type"].mode()[0])

# formatting column to date format
dataset["Formatted Date"] = pd.to_datetime(dataset["Formatted Date"], format = "%Y-%m-%d %H:%M:%S.%f %z") 

dataset = dataset.drop(["Loud Cover","Daily Summary"], axis=1) 

# apparent temperature is highly correlated to temperature and should be removed
dataset = dataset.drop(["Apparent Temperature (C)"], axis=1) 

dataset["year"] = dataset["Formatted Date"].apply(lambda x: x.year)
dataset["month"] = dataset["Formatted Date"].apply(lambda x: x.month)
dataset["day"] = dataset["Formatted Date"].apply(lambda x: x.day)

dataset = dataset.drop(["Formatted Date"], axis=1)

label_encoder = LabelEncoder()
# Preprocessing 'Summary' column
label_encoder.fit(dataset['Summary'])
dataset["Summary"] = label_encoder.transform(dataset["Summary"])

# Preprocessing 'Precip Type' column
label_encoder.fit(dataset['Precip Type'])
dataset["Precip Type"] = label_encoder.transform(dataset["Precip Type"])


# Target set
y = dataset["Temperature (C)"]

# Features set
X = dataset.drop(["Temperature (C)"], axis = 1)






# standardize data to optimize training
stdScaler = StandardScaler()
stdScaler.fit(X)
X = pd.DataFrame(stdScaler.transform(X), columns=X.columns)

# split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)

#  dictionary with list of models
models = { 
    
                "Linear regression": LinearRegression(),
                 "Ridge regression": Ridge(),
                 "Lasso regression": Lasso(),
           "Elastic Net regression": ElasticNet(),
   "K-nearest Neighbors regression": KNeighborsRegressor(),
         "Decision Tree regression": DecisionTreeRegressor(),
         "Random Forest regression": RandomForestRegressor()
    
            

}

# training models
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " - Trained")


predictions = {}
for name, model in models.items():
    y_pred = model.predict(X_test)  
    predictions[name] = y_pred
    
# Computing RMSE & Coefficient of Determination
index = list(predictions.keys())[0]

model_eval = pd.DataFrame(index=[index], columns=["r2 score", "RMSE"])
for key, value in predictions.items():
    model_eval.loc[key, "r2 score"] = "{:.8f}".format(r2_score(y_test, value)) 
    model_eval.loc[key, "RMSE"] = "{:.8f}".format(np.sqrt(mean_squared_error(y_test, value)))
model_eval.reset_index(inplace = True)

# The evaluation shows that the Random forest model achieves better determination score with less error
model_eval.style



Linear regression - Trained
Ridge regression - Trained
Lasso regression - Trained
Elastic Net regression - Trained
K-nearest Neighbors regression - Trained
Decision Tree regression - Trained
Random Forest regression - Trained


Unnamed: 0,index,r2 score,RMSE
0,Linear regression,0.61867399,5.91086442
1,Ridge regression,0.61867394,5.9108648
2,Lasso regression,0.56852796,6.28751611
3,Elastic Net regression,0.53347877,6.53790324
4,K-nearest Neighbors regression,0.91192243,2.84076588
5,Decision Tree regression,0.91811181,2.73913486
6,Random Forest regression,0.96249687,1.85368764


In [9]:
max_val_score = {}

# Cross validation score to accurately measure model quality
lin_model_kfold = KFold(n_splits=5)

print('-----Cross Validation Scores----')
for name, model in models.items():
    score = mean(cross_val_score(model, X, y, cv=lin_model_kfold))
    print('{:s} model: {:.5f}'.format(name, score))
    if not bool(max_val_score):
        max_val_score['model'] = name
        max_val_score['score'] = score
    elif(max_val_score['score'] < score):
        max_val_score['model'] = name
        max_val_score['score'] = score
print('\n\nThe final model that fits this regression problem best is the {:s} model with a score of {:.5f}'.format(max_val_score['model'], max_val_score['score'])) 

#  The above evaluation shows that the model that fits this regression problem best is the Random Forest regression model with a score of 0.86.


-----Cross Validation Scores----
Linear regression model: 0.61320
Ridge regression model: 0.61320
Lasso regression model: 0.56528
Elastic Net regression model: 0.53053
K-nearest Neighbors regression model: 0.77857
Decision Tree regression model: 0.77167
Random Forest regression model: 0.86053


The final model that fits this regression problem best is the Random Forest regression model with a score of 0.86053


NameError: name 'X' is not defined