In [1]:
import pandas as pd
import numpy as np

In [3]:
weather_df=pd.read_csv('..\Data Files\CSV files\weatherAUS.csv')

In [4]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [82]:
# We need to drop all rows with missing values on rain today, rain tomorrow as this can affect our model. Reason is that one is our target variable and the other is closely related to it.
weather_df.dropna(subset= ['RainTomorrow'],inplace=True)

In [84]:
#Declaring inputs and outputs
input_cols=(weather_df.columns)[1:-1]
target_col='RainTomorrow'

In [88]:
#Declaring numerical cols and categorical cols
numeric_cols=weather_df[input_cols].select_dtypes(include=np.number).columns.tolist()
categorical_cols=weather_df[input_cols].select_dtypes('object').columns.tolist()

In [90]:
# Check for missing values and Perform imputation
weather_df[numeric_cols].isna().sum()

MinTemp            468
MaxTemp            307
Rainfall             0
Evaporation      59694
Sunshine         66805
WindGustSpeed     9105
WindSpeed9am      1055
WindSpeed3pm      2531
Humidity9am       1517
Humidity3pm       3501
Pressure9am      13743
Pressure3pm      13769
Cloud9am         52625
Cloud3pm         56094
Temp9am            656
Temp3pm           2624
dtype: int64

In [91]:
#Imputing our data
from sklearn.impute import SimpleImputer

In [92]:
imputer=SimpleImputer(strategy='mean')

In [95]:
imputer.fit(weather_df[numeric_cols])

SimpleImputer()

In [96]:
# Divide/Split the data to train,validation and test df frames
year=pd.to_datetime(weather_df.Date).dt.year

train_df=weather_df[year<2015]
val_df=weather_df[year==2015]
test_df=weather_df[year>2015]


In [97]:
train_inputs=train_df[input_cols].copy()
train_targets=train_df[target_col].copy

In [98]:
val_inputs=val_df[input_cols].copy()
val_targets=val_df[target_col].copy()

In [99]:
test_inputs=test_df[input_cols].copy()
test_targets=test_df[target_col].copy()

In [100]:
#To fill the columns with imputed statistics..
train_inputs[numeric_cols]=imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols]=imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols]=imputer.transform(test_inputs[numeric_cols])

In [21]:
#To view imputer statistics
list(imputer.statistics_)

[12.18482386562048,
 23.235120301822324,
 2.349974074310839,
 5.472515506887154,
 7.630539861047281,
 39.97051988882308,
 13.990496092519967,
 18.631140782316862,
 68.82683277087672,
 51.44928834695453,
 1017.6545771543717,
 1015.2579625879797,
 4.431160817585808,
 4.499250233195188,
 16.98706638787991,
 21.69318269001107]

In [101]:
# Check for missing values
val_inputs[numeric_cols].isna().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
dtype: int64

In [102]:
#Scaling Numerical Values
from sklearn.preprocessing import MinMaxScaler

In [103]:
scaler=MinMaxScaler().fit(weather_df[numeric_cols])

In [42]:
train_inputs[numeric_cols]=scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols]=scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols]=scaler.transform(test_inputs[numeric_cols])

In [104]:
#Dealing with missing values
train_inputs[categorical_cols]=train_inputs[categorical_cols].fillna('Unknown')
val_inputs[categorical_cols]=val_inputs[categorical_cols].fillna('Unknown')
test_inputs[categorical_cols]=test_inputs[categorical_cols].fillna('Unknown')

In [105]:
#check out the no of unique values in the categorical columns 
weather_df[categorical_cols].nunique()

Location       49
WindGustDir    16
WindDir9am     16
WindDir3pm     16
RainToday       2
dtype: int64

In [106]:
#Encoding Categorical Columns
from sklearn.preprocessing import OneHotEncoder

In [107]:
encoder=OneHotEncoder(sparse=False,handle_unknown='ignore')

In [108]:
encoder.fit(weather_df[categorical_cols])

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [109]:
#We generate columns for each individual category using using get_feature_names
encoded_cols=list(encoder.get_feature_names(categorical_cols))
print(encoded_cols)

['Location_Adelaide', 'Location_Albany', 'Location_Albury', 'Location_AliceSprings', 'Location_BadgerysCreek', 'Location_Ballarat', 'Location_Bendigo', 'Location_Brisbane', 'Location_Cairns', 'Location_Canberra', 'Location_Cobar', 'Location_CoffsHarbour', 'Location_Dartmoor', 'Location_Darwin', 'Location_GoldCoast', 'Location_Hobart', 'Location_Katherine', 'Location_Launceston', 'Location_Melbourne', 'Location_MelbourneAirport', 'Location_Mildura', 'Location_Moree', 'Location_MountGambier', 'Location_MountGinini', 'Location_Newcastle', 'Location_Nhil', 'Location_NorahHead', 'Location_NorfolkIsland', 'Location_Nuriootpa', 'Location_PearceRAAF', 'Location_Penrith', 'Location_Perth', 'Location_PerthAirport', 'Location_Portland', 'Location_Richmond', 'Location_Sale', 'Location_SalmonGums', 'Location_Sydney', 'Location_SydneyAirport', 'Location_Townsville', 'Location_Tuggeranong', 'Location_Uluru', 'Location_WaggaWagga', 'Location_Walpole', 'Location_Watsonia', 'Location_Williamtown', 'Loca

In [111]:
train_inputs[encoded_cols]=encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols]=encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols]=encoder.transform(test_inputs[categorical_cols])

In [112]:
#Dropping the textualised columns so that we are left with just numeric data
x_train=train_inputs[numeric_cols+encoded_cols]
x_val=val_inputs[numeric_cols+encoded_cols]
x_test=test_inputs[numeric_cols+encoded_cols]

In [113]:
x_test

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,WindDir3pm_nan,RainToday_No,RainToday_Yes
2498,20.4,37.6,0.0,5.472516,7.63054,54.0,0.0,7.0,46.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2499,20.9,33.6,0.4,5.472516,7.63054,50.0,9.0,17.0,54.0,30.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2500,18.4,23.1,2.2,5.472516,7.63054,48.0,11.0,39.0,62.0,67.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2501,17.3,23.7,15.6,5.472516,7.63054,39.0,9.0,17.0,74.0,65.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2502,15.5,22.9,6.8,5.472516,7.63054,31.0,6.0,9.0,92.0,63.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,3.5,21.8,0.0,5.472516,7.63054,31.0,15.0,13.0,59.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
145455,2.8,23.4,0.0,5.472516,7.63054,31.0,13.0,11.0,51.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
145456,3.6,25.3,0.0,5.472516,7.63054,22.0,13.0,9.0,56.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
145457,5.4,26.9,0.0,5.472516,7.63054,37.0,9.0,9.0,53.0,24.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [114]:
#Train and visualizing decision trees
from sklearn.tree import DecisionTreeClassifier 

In [115]:
model=DecisionTreeClassifier(random_state=42)

In [None]:
%%time
model.fit(x_train,train_targets)

In [None]:
#Predicting using the model fitted
train_preds=model.predict(x_train)

In [None]:
#checking the outcome of our prediction
pd.value_counts(train_preds)

In [117]:
#Package for evaluation
from sklearn.metrics import accuracy_score,confusion_matrix

In [None]:
#checking the probabilities of our predicting outcome
train_preds_probs=model.predict_proba(x_train)

In [None]:
#Evaluating our decision tree model using train_preds
accuracy_score(train_targets,train_preds)


In [None]:
#Evaluating using our x_val data
model.score(x_val,val_targets)

this values is marginally better than the model always predicting 'no' i.e 
val_targets.value_counts()/len(val_targets)

In [None]:
#Visualizing the decision tree learned from training data
from sklearn.tree import plot_tree, export_text

In [None]:
plt.figure(figsize=(80,20))
plot_tree(model,feature_names=x_train.columns,max_depth=4,filled=True)

In [None]:
#Let's check the depth of the tree that was created.
model.tree_.max_depth

In [None]:
#Displaying tree as text
tree_text=export_text(model,manx_depth=10,feature_names=list(x_train.columns))

In [None]:
#gini index computations, a decision tree assigns an "importance" value to each feature. These values can be used to interpret the results given by a decision tree.
model.feature_importances_

In [None]:
#Let's turn this into a dataframe and visualize the most important features.
importance_df = pd.DataFrame({
    'feature': x_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [None]:
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

In [None]:
#Hyperparameter Tuning with Random Forests
model=DecisionTreeClassifier(max_depth=3,random_state=42)

In [None]:
model.fit(x_train,train_targets)

Arguments that we cnfigure manually are called hyperparameters.
   max_depth
   max_leaf_nodes

max_depth; by reducing the maximum depth of the decision tree, we can prevent the tree from memorizing all training examples, which may lead to better generalization

In [None]:
#we can compute the accuracy on training and validation sets using model.score

In [None]:
model.score(x_train,train_targets)

In [None]:
def max_depth_error(md):
    model=DecisionTreeClassifier(max_depth=md,random_state=42)
    model.fit(x_train,train_targets)
    train_error=1-model.score(x_train,train_targets)
    val_error=1-model.score(x_val,val_targets)
    return {'Max Depth':md, 'Training Error':train_error,'Valadation Error':val_acc}

In [None]:
%%time
errors_df=pd.DataFrame([max_depth_error(md) for md in range(1,21)])

In [None]:
plt.figure()
plt.plot(errors_df['Max Depth'],errors_df['Training Error'])
plt.plot(errors_df['Max Depth'],errors_df['Validation Error'])
plt.title('Training vs. Validation Error')
plt.xticks(range(0,21,2))
plt.xlabel('Max.Depth')
plt.ylabel('Prediction Error(1-Accuracy')
plt.legend(['Training','Validation'])

In [None]:
#max_leaf_nodes
model=DecisionTreeClassifier(max_leaf_nodes=128,random_state=42)
model.fit(x_train,train_targets)

In [None]:
#   cost complexities pruning?