In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Load the daily weather observations from the CSV file into a DataFrame
csv_path = 'daily_weather.csv'
data = pd.read_csv(csv_path)

In [7]:
# Display the loaded DataFrame (the rendered output is truncated to the first and last rows)
data

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,0,918.060000,74.822000,271.100000,2.080354,295.400000,2.863283,0.0,0.0,42.420000,36.160000
1,1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,2,923.040000,60.638000,51.000000,17.067852,63.700000,22.100967,0.0,20.0,8.900000,14.460000
3,3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,4,921.160000,44.294000,277.800000,1.856660,136.500000,2.863283,8.9,14730.0,92.410000,76.740000
...,...,...,...,...,...,...,...,...,...,...,...
1090,1090,918.900000,63.104000,192.900000,3.869906,207.300000,5.212070,0.0,0.0,26.020000,38.180000
1091,1091,918.710000,49.568000,241.600000,1.811921,227.400000,2.371156,0.0,0.0,90.350000,73.340000
1092,1092,916.600000,71.096000,189.300000,3.064608,200.800000,3.892276,0.0,0.0,45.590000,52.310000
1093,1093,912.600000,58.406000,172.700000,3.825167,189.100000,4.764682,0.0,0.0,64.840000,58.280000


DATA DESCRIPTION:


In [11]:
# List the column names of the loaded DataFrame
data.columns

Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')

In [13]:
# Preview the first five rows of the data
data.head(n=5)

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74


In [17]:
# Shape of the data as (number of records, number of columns)
data.shape

(1095, 11)

In [21]:
# Check whether the dataset contains missing values:
# build a per-row mask that is True when any column is null, then preview those rows
rows_with_nulls = data.isna().any(axis='columns')
data[rows_with_nulls].head()

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
16,16,917.89,,169.2,2.192201,196.8,2.930391,0.0,0.0,48.99,51.19
111,111,915.29,58.82,182.6,15.613841,189.0,,0.0,0.0,21.5,29.69
177,177,915.9,,183.3,4.719943,189.9,5.346287,0.0,0.0,29.26,46.5
262,262,923.596607,58.380598,47.737753,10.636273,67.145843,13.671423,0.0,,17.990876,16.461685
277,277,920.48,62.6,194.4,2.751436,,3.869906,0.0,0.0,52.58,54.03


DATA CLEANING AND FEATURE ENGINEERING STEPS

In [26]:
# Remove the 'number' (index) column, which is not used as a feature.
# drop(..., errors='ignore') is used instead of `del` so that this cell is idempotent:
# re-running it after the column is already gone no longer raises a KeyError.
data = data.drop(columns=['number'], errors='ignore')

In [28]:
# Keep only complete records: discard every row that has a missing value in any column
data = data.dropna(axis='index', how='any')

In [30]:
# Check the shape of the data again after removing the rows that contained null values
data.shape

(1064, 10)

In [40]:
# Add a binary label column for high afternoon humidity.
# Work on a copy so the cleaned `data` frame itself is left unchanged.
clean_data = data.copy()

# Threshold in the units of 'relative_humidity_3pm' (presumably percent, not degrees).
# A day counts as "high humidity" when the 3pm reading is strictly greater than this value.
HIGH_HUMIDITY_THRESHOLD = 28

# Convert the boolean comparison to integers: 1 = high humidity, 0 = not high
clean_data['high_humidity_label'] = (
    clean_data['relative_humidity_3pm'] > HIGH_HUMIDITY_THRESHOLD
).astype(int)
clean_data['high_humidity_label'].head()

0    1
1    0
2    0
3    0
4    1
Name: high_humidity_label, dtype: int32

In [44]:
# Build the modelling target: a one-column DataFrame holding the high-humidity label
target_columns = ['high_humidity_label']
y = clean_data[target_columns].copy()
y.head()

Unnamed: 0,high_humidity_label
0,1
1,0
2,0
3,0
4,1


STORING ALL THE MORNING (9AM) FEATURES, I.E. EVERYTHING EXCEPT HUMIDITY AT 3PM, IN THE MORNING FEATURE LIST

In [47]:
# Independent (predictor) columns: every 9am measurement.
# 'relative_humidity_3pm' is deliberately left out of this list.
morning_features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
    'relative_humidity_9am',
]

In [49]:
# Build the feature frame: a copy of clean_data restricted to the morning feature columns
x = clean_data.loc[:, morning_features].copy()
x.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am'],
      dtype='object')

In [51]:
# Column name(s) of the target frame
y.columns

Index(['high_humidity_label'], dtype='object')

MODEL TRAINING

In [54]:
# Split features and target into train and test sets (33% held out for testing);
# the fixed random_state seeds the shuffling
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=324,
)

We create a Decision Tree classifier and fit it on the training data.

In [57]:
# Configure a decision tree limited to 10 leaf nodes (seeded via random_state),
# then fit it on the training split
tree_params = {'max_leaf_nodes': 10, 'random_state': 0}
humidity_classifier = DecisionTreeClassifier(**tree_params)
humidity_classifier.fit(X_train, y_train)

In [59]:
# Predict the high-humidity label for every row of the held-out test features
y_predicted = humidity_classifier.predict(X_test)

In [61]:
# Accuracy on the test set: fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=y_predicted)

0.8892045454545454

In [63]:
# Confusion matrix on the test set (rows = true labels, columns = predicted labels)
confusion_matrix(y_true=y_test, y_pred=y_predicted)

array([[166,  14],
       [ 25, 147]], dtype=int64)