In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('daily_weather.csv')

In [3]:
data

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,high_humidity_3pm
0,918.060000,74.822000,271.100000,2.080354,295.400000,2.863283,0.0,0.0,42.420000,1
1,917.347688,71.403843,101.935179,2.443009,140.471549,3.533324,0.0,0.0,24.328697,0
2,923.040000,60.638000,51.000000,17.067852,63.700000,22.100967,0.0,20.0,8.900000,0
3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,0
4,921.160000,44.294000,277.800000,1.856660,136.500000,2.863283,8.9,14730.0,92.410000,1
...,...,...,...,...,...,...,...,...,...,...
1090,918.900000,63.104000,192.900000,3.869906,207.300000,5.212070,0.0,0.0,26.020000,1
1091,918.710000,49.568000,241.600000,1.811921,227.400000,2.371156,0.0,0.0,90.350000,1
1092,916.600000,71.096000,189.300000,3.064608,200.800000,3.892276,0.0,0.0,45.590000,1
1093,912.600000,58.406000,172.700000,3.825167,189.100000,4.764682,0.0,0.0,64.840000,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   air_pressure_9am        1092 non-null   float64
 1   air_temp_9am            1090 non-null   float64
 2   avg_wind_direction_9am  1091 non-null   float64
 3   avg_wind_speed_9am      1092 non-null   float64
 4   max_wind_direction_9am  1092 non-null   float64
 5   max_wind_speed_9am      1091 non-null   float64
 6   rain_accumulation_9am   1089 non-null   float64
 7   rain_duration_9am       1092 non-null   float64
 8   relative_humidity_9am   1095 non-null   float64
 9   high_humidity_3pm       1095 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 85.7 KB


In [5]:
data.shape

(1095, 10)

In [6]:
data.isnull().sum()

air_pressure_9am          3
air_temp_9am              5
avg_wind_direction_9am    4
avg_wind_speed_9am        3
max_wind_direction_9am    3
max_wind_speed_9am        4
rain_accumulation_9am     6
rain_duration_9am         3
relative_humidity_9am     0
high_humidity_3pm         0
dtype: int64

In [7]:
# We found small number of missing values we found so dropping the null values
data.dropna(inplace=True) 

In [8]:
data.isnull().sum()

air_pressure_9am          0
air_temp_9am              0
avg_wind_direction_9am    0
avg_wind_speed_9am        0
max_wind_direction_9am    0
max_wind_speed_9am        0
rain_accumulation_9am     0
rain_duration_9am         0
relative_humidity_9am     0
high_humidity_3pm         0
dtype: int64

In [9]:
data.duplicated().sum()

1

In [10]:
data[data.duplicated()]

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,high_humidity_3pm
543,921.65,62.87,56.7,10.714943,77.3,13.712442,0.0,0.0,21.11,0


In [11]:
data.drop_duplicates(inplace=True)

In [12]:
data[data.duplicated()]

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,high_humidity_3pm


In [13]:
data.duplicated().sum()

0

In [14]:
data['high_humidity_3pm'].value_counts()

high_humidity_3pm
0    534
1    529
Name: count, dtype: int64

# The above method given that most balanced data we have.

## Machine Learning Process 

In [15]:
X = data.drop(columns = 'high_humidity_3pm')
y = data['high_humidity_3pm']

In [16]:
## Train_test_Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Apply Decision Tree on the Data

In [17]:
dec_tree = DecisionTreeClassifier()
dec_tree.fit(X_train, y_train)

In [18]:
y_pred = dec_tree.predict(X_test)

In [19]:
y_pred

array([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [20]:
y_test

178     1
1012    0
704     1
533     1
882     0
       ..
842     0
772     0
913     0
506     1
467     1
Name: high_humidity_3pm, Length: 213, dtype: int64

In [21]:
accuracy_score(y_test, y_pred)

0.8873239436619719

## Check for Overfitting

In [22]:
y_pred_train = dec_tree.predict(X_train)

In [24]:
accuracy_score(y_pred_train, y_train)

1.0

1. This above score should never be 100% / 1.0
2. If training score is above 80%, and difference in training and test score is more than 10% then we call this as overfitting.

1. If the difference is around 10% then we can say it as not a over fitting
2. In above case the difference is 12% (1.0 - 0.88)

## Apply Decision tree to avoid overfitting

In [27]:
dec_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 4)
dec_tree.fit(X_train, y_train)

In [28]:
y_pred = dec_tree.predict(X_test)

In [29]:
accuracy_score(y_test, y_pred)

0.9061032863849765

In [30]:
y_pred_train = dec_tree.predict(X_train)

In [31]:
accuracy_score(y_pred_train, y_train)

0.9011764705882352

## Try max depth = 5

In [34]:
dec_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 5)
dec_tree.fit(X_train, y_train)
accuracy_score(y_test, y_pred)

0.9061032863849765

In [35]:
y_pred_train = dec_tree.predict(X_train)
accuracy_score(y_pred_train, y_train)

0.9294117647058824

Try Max Depth = 6

In [36]:
dec_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 6)
dec_tree.fit(X_train, y_train)
accuracy_score(y_test, y_pred)

0.9061032863849765

In [37]:
y_pred_train = dec_tree.predict(X_train)
accuracy_score(y_pred_train, y_train)

0.9423529411764706

## Try Max Depth = 7

In [38]:
dec_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 7)
dec_tree.fit(X_train, y_train)
accuracy_score(y_test, y_pred)

0.9061032863849765

In [39]:
y_pred_train = dec_tree.predict(X_train)
accuracy_score(y_pred_train, y_train)

0.9647058823529412

# Try Max Depth = 8

In [40]:
dec_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 8)
dec_tree.fit(X_train, y_train)
accuracy_score(y_test, y_pred)

0.9061032863849765

In [41]:
y_pred_train = dec_tree.predict(X_train)
accuracy_score(y_pred_train, y_train)

0.9729411764705882

Try Max Depth = 9


In [42]:
dec_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 9)
dec_tree.fit(X_train, y_train)
accuracy_score(y_test, y_pred)

0.9061032863849765

In [43]:
y_pred_train = dec_tree.predict(X_train)
accuracy_score(y_pred_train, y_train)

0.9870588235294118

##Max_features

In [45]:
dec_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = 6, max_features= 'sqrt')
dec_tree.fit(X_train, y_train)
accuracy_score(y_test, y_pred)

0.9061032863849765

In [46]:
y_pred_train = dec_tree.predict(X_train)
accuracy_score(y_pred_train, y_train)

0.9305882352941176

##Dictionaries

In [61]:
param_grid = { 'criterion': ['gini', 'entropy'],
              'max_depth' : [4,5,6,7,8,9,10],
              'max_features' : ['sqrt', 'log2', 15, 20]
}

In [62]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(dec_tree, param_grid)