### Research Question: Are decision trees and random forests good models for predicting whether it will rain the next day in Australia?

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, validation_curve
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the raw weather observations from the CSV next to the notebook.
rainData = pd.read_csv("weatherAUS.csv")

# Report how many values are missing in each column before any cleaning.
missingCounts = rainData.isnull().sum()
print(missingCounts)

# Last expression: let the notebook render the frame itself.
rainData

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


# Data Pre-processing

In [3]:
# Keep only the predictors used below plus the target, discard every row that
# has a missing value in any of those columns, and renumber the rows from 0.
selectedColumns = ["MinTemp", "MaxTemp", "Rainfall", "WindSpeed9am",
                   "Temp9am", "RainToday", "RainTomorrow"]
rainDataSelect = rainData[selectedColumns].dropna().reset_index(drop=True)

# Encode the two rain flags numerically: "Yes" -> 1, any other value -> 0.
for flagColumn in ("RainToday", "RainTomorrow"):
    rainDataSelect[flagColumn] = np.where(rainDataSelect[flagColumn] == "Yes", 1, 0)

rainDataSelect

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindSpeed9am,Temp9am,RainToday,RainTomorrow
0,13.4,22.9,0.6,20.0,16.9,0,0
1,7.4,25.1,0.0,4.0,17.2,0,0
2,12.9,25.7,0.0,19.0,21.0,0,0
3,9.2,28.0,0.0,11.0,18.1,0,0
4,17.5,32.3,1.0,7.0,17.8,0,0
...,...,...,...,...,...,...,...
138907,3.5,21.8,0.0,15.0,9.4,0,0
138908,2.8,23.4,0.0,13.0,10.1,0,0
138909,3.6,25.3,0.0,13.0,10.9,0,0
138910,5.4,26.9,0.0,9.0,12.5,0,0


In [4]:
# Columns that get rescaled; every other column is carried over unchanged.
continuousVars = ["MinTemp", "MaxTemp", "Rainfall", "WindSpeed9am", "Temp9am"]

# NOTE(review): the scaler is fit on every row of rainDataSelect, i.e. before
# the train/test split done later in the notebook, so the test rows influence
# the scaling ranges. Consider fitting on the training rows only.
scaler = MinMaxScaler()
scaledValues = scaler.fit_transform(rainDataSelect[continuousVars])
scaledFrame = pd.DataFrame(scaledValues, columns=continuousVars)

# Re-attach the columns that were not scaled (the 0/1 flags), side by side.
unscaledColumns = [col for col in rainDataSelect.columns if col not in continuousVars]
rainNorm = pd.concat([scaledFrame, rainDataSelect[unscaledColumns]], axis=1)
rainNorm.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindSpeed9am,Temp9am,RainToday,RainTomorrow
0,0.516509,0.523629,0.001617,0.153846,0.508439,0,0
1,0.375,0.565217,0.0,0.030769,0.514768,0,0
2,0.504717,0.57656,0.0,0.146154,0.594937,0,0
3,0.417453,0.620038,0.0,0.084615,0.533755,0,0
4,0.613208,0.701323,0.002695,0.053846,0.527426,0,0


# Data Splitting 

In [5]:
# Separate the predictors from the target column, then hold out 30% of the
# rows for testing. The fixed seed makes the split repeatable.
featureFrame = rainNorm.drop(columns="RainTomorrow")
targetSeries = rainNorm["RainTomorrow"]
X_train, X_test, y_train, y_test = train_test_split(
    featureFrame, targetSeries, test_size=.3, random_state=42
)

# Show the dimensions of each of the four pieces.
for piece in (X_train, X_test, y_train, y_test):
    print(piece.shape)

(97238, 6)
(41674, 6)
(97238,)
(41674,)


# Model Building and Evaluation

In [6]:
# Initial Model
# Decision tree with every hyperparameter left at its default (only the seed is set).

dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on both partitions so train and test accuracy can be compared.
y_train_pred, y_test_pred = dt.predict(X_train), dt.predict(X_test)
for label, actual, predicted in (("Train: ", y_train, y_train_pred),
                                 ("Test: ", y_test, y_test_pred)):
    print(label, metrics.accuracy_score(actual, predicted))

Train:  0.9999177276373434
Test:  0.7168018428756538


In [7]:
# Tuning the Hyperparameters
# Cross-validated grid search (8 folds) over tree depth and feature fraction.

# Candidate depths run from 1 up to, but not including, the depth reached by
# the unconstrained tree fitted above (np.arange excludes its stop value).
depthCandidates = np.arange(1, dt.tree_.max_depth)
featureFractions = [0.2, 0.4, 0.6, 0.8]
param_grid = {"max_depth": depthCandidates,
              "max_features": featureFractions}

gs = GridSearchCV(estimator=dt, param_grid=param_grid, cv=8)
gs.fit(X_train, y_train)
print("Best Estimator:", gs.best_estimator_)
# NOTE(review): the value printed is 1 - best_score_, not best_score_ itself;
# presumably the cross-validated error rate, assuming the default scorer is
# accuracy - confirm.
print("Best Score: ", 1-gs.best_score_)

Best Estimator: DecisionTreeClassifier(max_depth=8, max_features=0.8, random_state=42)
Best Score:  0.19815295101862573


In [8]:
# Final Decision Tree Model
# Refit with the settings reported by the grid search (max_depth=8, max_features=0.8).

dtFinal = DecisionTreeClassifier(max_depth=8, max_features=0.8, random_state=42)
dtFinal.fit(X_train, y_train)

# Compare accuracy on the training rows against the held-out rows.
y_train_pred, y_test_pred = dtFinal.predict(X_train), dtFinal.predict(X_test)
for label, actual, predicted in (("Train: ", y_train, y_train_pred),
                                 ("Test: ", y_test, y_test_pred)):
    print(label, metrics.accuracy_score(actual, predicted))

Train:  0.8087270408687961
Test:  0.8013629601190191


In [9]:
# Initial Random Forest Model
# Starts from the same max_depth / max_features used for the final decision tree.

rf = RandomForestClassifier(max_depth=8, max_features=0.8, random_state=42)
rf.fit(X_train, y_train)

# Compare accuracy on the training rows against the held-out rows.
y_train_pred, y_test_pred = rf.predict(X_train), rf.predict(X_test)
for label, actual, predicted in (("Train: ", y_train, y_train_pred),
                                 ("Test: ", y_test, y_test_pred)):
    print(label, metrics.accuracy_score(actual, predicted))

Train:  0.815277977745326
Test:  0.8070019676536929


In [10]:
# Tuning the RF Model
# 8-fold grid search over forest depth only (depths 1 through 14); the other
# settings come from the rf estimator defined above.

depthCandidates = np.arange(1, 15)
param_grid = {"max_depth": depthCandidates}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, cv=8)
gs.fit(X_train, y_train)
print("Best Estimator:", gs.best_estimator_)
# The value printed is 1 - best_score_, not best_score_ itself.
print("Best Score: ", 1-gs.best_score_)

Best Estimator: RandomForestClassifier(max_depth=10, max_features=0.8, random_state=42)
Best Score:  0.19129348579287586


In [11]:
# More tuning
# Depth is pinned at 10; only the number of trees varies.
# NOTE(review): this search uses cv=3 while the earlier searches use cv=8, so
# the printed values come from a different fold layout - confirm that is intended.

treeCounts = [10, 50, 100, 200]
param_grid = {"max_depth": [10], "n_estimators": treeCounts}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
gs.fit(X_train, y_train)
print("Best Estimator:", gs.best_estimator_)
# The value printed is 1 - best_score_, not best_score_ itself.
print("Best Score: ", 1-gs.best_score_)

Best Estimator: RandomForestClassifier(max_depth=10, max_features=0.8, n_estimators=200,
                       random_state=42)
Best Score:  0.19095413494054492


In [12]:
# Still More Tuning
# Same search as before (depth pinned at 10, cv=3) with larger tree counts.

treeCounts = [200, 350, 500]
param_grid = {"max_depth": [10], "n_estimators": treeCounts}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
gs.fit(X_train, y_train)
print("Best Estimator:", gs.best_estimator_)
# The value printed is 1 - best_score_, not best_score_ itself.
print("Best Score: ", 1-gs.best_score_)

Best Estimator: RandomForestClassifier(max_depth=10, max_features=0.8, n_estimators=500,
                       random_state=42)
Best Score:  0.19084101033610557


In [13]:
# Final Random Forest Model
# Refit with the settings reported by the grid searches
# (max_depth=10, max_features=0.8, n_estimators=500).

rfFinal = RandomForestClassifier(
    max_depth=10,
    max_features=0.8,
    n_estimators=500,
    random_state=42,
)
rfFinal.fit(X_train, y_train)

# Compare accuracy on the training rows against the held-out rows.
y_train_pred, y_test_pred = rfFinal.predict(X_train), rfFinal.predict(X_test)
for label, actual, predicted in (("Train: ", y_train, y_train_pred),
                                 ("Test: ", y_test, y_test_pred)):
    print(label, metrics.accuracy_score(actual, predicted))

Train:  0.8275571278718197
Test:  0.8076498536257619


### Discuss overfitting for both models, and also discuss which model is better for classification on your dataset and why.

### Decision Tree could be considered the better model because it is simpler. 

### Random Forest produced a marginally better accuracy score.

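### Both models behave similarly with respect to overfitting/underfitting, as long as we consider an accuracy of about 0.8 reasonable.  There is no strong evidence of overfitting in either final model because the train and test accuracies are close (decision tree: 0.809 vs. 0.801; random forest: 0.828 vs. 0.808).  By contrast, the untuned decision tree clearly overfit (train 0.9999 vs. test 0.717).  We could possibly say that there is underfitting if the rain prediction is expected to be better than 80% accurate.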