# Rainfall data analysis 

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
import pandas as pd

# Load the raw weather observations and report how many values are missing per column.
df = pd.read_csv('weatherAUS.csv')
print(df.isnull().sum())

print(df.shape)

# The selected features and the target all contain nulls, so rows missing any of them
# are removed before fitting. A new frame is assigned instead of using inplace=True,
# which keeps the step chainable and avoids mutating an object other code may hold.
required_columns = ['MaxTemp', 'Rainfall', 'Humidity3pm', 'RainToday', 'RainTomorrow']
df = df.dropna(subset=required_columns)
print(df.shape)

# Rain: boolean flag derived from the yes/no RainToday column.
# RainTomorrow: target converted from 'No'/'Yes' to 0/1.
# Assigning the mapped column back (rather than calling
# df.RainTomorrow.replace(..., inplace=True)) is required because an in-place call on a
# column accessed through the frame is a chained assignment: it warns in recent pandas
# and does not update df when Copy-on-Write is enabled.
# astype('int64') fails loudly if a label other than 'No'/'Yes' slipped through,
# because map() would have turned it into NaN.
df = df.assign(
    Rain=lambda frame: frame['RainToday'] == 'Yes',
    RainTomorrow=lambda frame: frame['RainTomorrow'].map({'No': 0, 'Yes': 1}).astype('int64'),
)

# Cast to float so the bool 'Rain' column does not force an object-dtype feature matrix.
X = df[['MaxTemp', 'Rainfall', 'Humidity3pm', 'Rain']].astype(float).to_numpy()
y = df['RainTomorrow'].to_numpy()

print(df.groupby('RainTomorrow').size()) #this indicates a very imbalanced dataset.

# stratify=y keeps the minority-class share the same in the train and test parts,
# which matters because the target is heavily imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 100, stratify = y)

# Baseline decision tree fitted on the training part of the hold-out split.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)



Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64
(145460, 23)
(137153, 23)
RainTomorrow
0    106884
1     30269
dtype: int64


DecisionTreeClassifier()

## Evaluating decision tree model with 5-fold cross validation 

In [49]:
#Note: the minority class is "Yes, it will rain". That is the class of interest, but because the data is very imbalanced,
#accuracy is a misleading metric here. I will focus on precision and recall instead.

#Note: I initially needed to pass pos_label = 'Yes' to the precision and recall functions, because they assume the positive
#class is labelled 1 and my target was yes/no, so they would not work as-is. Passing pos_label = 'Yes' made 'Yes' the positive class.
#In the end I decided to convert yes/no to 0/1 instead.

from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score
import numpy as np 

# 5-fold cross validation of a default decision tree, scored with precision and recall
# on the positive class (1 = rain tomorrow).
# One seed drives both the fold shuffling and the tree, so the printed metrics are
# identical on every run. Without a fixed random_state the tree permutes features at
# each split and can pick a different split among ties.
cv_seed = 20
kf = KFold(n_splits = 5, shuffle = True, random_state = cv_seed)
dt_precisionScore = []
dt_recallScore = []

for train_index, test_index in kf.split(X):
    # Slice the feature matrix and target into this fold's train/test parts.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    dt = DecisionTreeClassifier(random_state = cv_seed)
    dt.fit(X_train, y_train)
    dt_ypred = dt.predict(X_test)
    dt_precisionScore.append(precision_score(y_test, dt_ypred))
    dt_recallScore.append(recall_score(y_test, dt_ypred))

# Report the mean of the per-fold scores.
print("Decision tree")
print(50 *'-')
print("precision:", np.mean(dt_precisionScore))
print("recall:", np.mean(dt_recallScore))
print(50*'-')


Decision tree
--------------------------------------------------
precision: 0.5351799838864589
recall: 0.4127460478195607
--------------------------------------------------


# Gini and Entropy 

In [50]:
#default is gini in DecisionTreeClassifier but can set it to entropy as well
#going to compare the two

# The same seeded folds and the same tree seed are used for both criteria, so any
# difference in the scores comes from the criterion and not from random tie-breaking
# inside the tree.
criterion_seed = 35
kf = KFold(n_splits = 5, shuffle = True, random_state = criterion_seed)
for criterion in ['gini', 'entropy']:
    print("Decision Tree {}".format(criterion))
    # Per-fold scores for the current criterion only.
    dt_precisionScore = []
    dt_recallScore = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        dt = DecisionTreeClassifier(criterion = criterion, random_state = criterion_seed)
        dt.fit(X_train, y_train)
        dt_ypred = dt.predict(X_test)
        dt_precisionScore.append(precision_score(y_test, dt_ypred))
        dt_recallScore.append(recall_score(y_test, dt_ypred))

    print("precision:", np.mean(dt_precisionScore))
    print("recall:", np.mean(dt_recallScore))

#essentially the same between entropy and gini



        
    



Decision Tree gini
precision: 0.5452452923421809
recall: 0.41777731138881496
Decision Tree entropy
precision: 0.5422787014563253
recall: 0.41499696604632763


## Visualising the decision tree 

In [51]:
####IGNORE####

#Creating png image

#from sklearn.tree import export_graphviz 
#import graphviz 

#X = df[['MaxTemp', 'Rainfall', 'Humidity3pm', 'Rain']].values
#y = df['RainTomorrow'].values

#dt = DecisionTreeClassifier()
#dt.fit(X, y)

#dotfile = export_graphviz(dt, feature_names = ['MaxTemp', 'Rainfall', 'Humidity3pm', 'Rain'])
#graph = graphviz.Source(dotfile)
#graph.render(filename = 'treepic', format = 'png', cleanup = True)
#graph

## Grid searching params 

In [52]:
#GridSearchCV has 4 parameters we will look at

#model which is a decision tree classifier 
#param grid: dictionary of param names and values
#which metric to use 
#how many folds for cross validation 

from sklearn.model_selection import GridSearchCV

# Candidate values for the three tree-size hyperparameters; every combination is tried.
paramGrid = {
    'max_depth': [10,15,20],
    'min_samples_leaf': [3, 6],
    'max_leaf_nodes': [10,15,25,40]
}

# A fixed random_state makes every candidate tree deterministic, so best_params_ and
# best_score_ are the same on every run instead of depending on random tie-breaking.
dt = DecisionTreeClassifier(random_state = 100)
# scoring = 'f1' combines precision and recall on the positive class; cv = 5 requests
# 5-fold cross validation for each parameter combination.
gridSearch = GridSearchCV(dt, paramGrid, scoring = 'f1', cv = 5)

#fitting gridsearch object to the data

gridSearch.fit(X,y)

print("best parameters", gridSearch.best_params_)
print("best score", gridSearch.best_score_) #mean cross-validated f1 of the best parameter combination.



best parameters {'max_depth': 10, 'max_leaf_nodes': 40, 'min_samples_leaf': 3}
best score 0.522733207389886
