## Decision Trees and RandomForest

In [1]:
import pandas as pd
import numpy as np 

import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error

In [2]:
train = pd.read_csv('weather_train.csv')
test = pd.read_csv('weather_test.csv')

In [4]:
X_train = train.drop(columns=['Date', 'RainTomorrow'])
X_test = test.drop(columns=['Date', 'RainTomorrow'])

y_train = train['RainTomorrow']
y_test = test['RainTomorrow']

In [3]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

def scale(x):
    numeric_cols = list(x.describe().columns)
    scaler.fit(x[numeric_cols])
    scaled_num = scaler.transform(x[numeric_cols])
    return scaled_num

In [5]:
X_train = scale(X_train)
X_test = scale(X_test)

In [28]:
decision_classifier = DecisionTreeClassifier(random_state=23, max_depth= 6)

model = decision_classifier.fit(X_train,y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

In [29]:
test_score = accuracy_score(y_true = y_test, y_pred=test_pred)
train_score = accuracy_score(y_true=y_train, y_pred=train_pred)

In [30]:
print(test_score)
print(train_score)

0.8309296983846067
0.844545553938925


In [34]:
# advance metrics 

print(f'test precision: {precision_score(y_true = y_test, y_pred=test_pred, pos_label="No")}')
print(f'train precision: {precision_score(y_true = y_train, y_pred=train_pred, pos_label="No")}')

print(f'test recall: {recall_score(y_true = y_test, y_pred=test_pred, pos_label="No")}')
print(f'train recall: {recall_score(y_true = y_train, y_pred=train_pred, pos_label="No")}')

print(f'test f1_score: {f1_score(y_true = y_test, y_pred=test_pred, pos_label="No")}')
print(f'train f1_score: {f1_score(y_true = y_train, y_pred=train_pred, pos_label="No")}')

test precision: 0.8522483940042827
train precision: 0.8644726283443916
test recall: 0.9461937800203775
train recall: 0.9501462806006109
test f1_score: 0.8967673702119833
train f1_score: 0.9052869982270775


In [35]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=test_pred)

array([[19502,  1109],
       [ 3381,  2565]], dtype=int64)