In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import matplotlib.pyplot as plt


Daily Weather Data Description


The file daily_weather.csv is a comma-separated file that contains weather data. This data comes from a weather station located in San Diego, California. The weather station is equipped with sensors that capture weather-related measurements such as air temperature, air pressure, and relative humidity. Data was collected for a period of three years, from September 2011 to September 2014, to ensure that sufficient data for different seasons and weather conditions is captured.

In [2]:
# Load the weather data from a path relative to the notebook's working
# directory instead of a user-specific absolute path, so the notebook can be
# run on any machine. Place daily_weather.csv next to the notebook, or edit
# DATA_PATH to point at its location.
DATA_PATH = "daily_weather.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check that the file was parsed as expected.
data.head()


Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
0,0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16
1,1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597
2,2,923.04,60.638,51.0,17.067852,63.7,22.100967,0.0,20.0,8.9,14.46
3,3,920.502751,70.138895,198.832133,4.337363,211.203341,5.190045,0.0,0.0,12.189102,12.742547
4,4,921.16,44.294,277.8,1.85666,136.5,2.863283,8.9,14730.0,92.41,76.74


In [4]:
# (rows, columns) of the raw dataset.
data.shape

(1095, 11)

In [5]:
# Summary statistics per numeric column; a count below the total number of
# rows means that column has missing values.
data.describe()

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
count,1095.0,1092.0,1090.0,1091.0,1092.0,1092.0,1091.0,1089.0,1092.0,1095.0,1095.0
mean,547.0,918.882551,64.933001,142.235511,5.508284,148.953518,7.019514,0.203079,294.108052,34.241402,35.344727
std,316.243577,3.184161,11.175514,69.137859,4.552813,67.238013,5.598209,1.593952,1598.078779,25.472067,22.524079
min,0.0,907.99,36.752,15.5,0.693451,28.9,1.185578,0.0,0.0,6.09,5.3
25%,273.5,916.55,57.281,65.972506,2.248768,76.553003,3.067477,0.0,0.0,15.092243,17.395
50%,547.0,918.921045,65.715479,166.0,3.871333,177.3,4.943637,0.0,0.0,23.179259,24.38
75%,820.5,921.160073,73.450974,191.0,7.337163,201.233153,8.94776,0.0,0.0,45.4,52.06
max,1094.0,929.32,98.906,343.4,23.554978,312.2,29.84078,24.02,17704.0,92.62,92.25


In [6]:
# Column dtypes and non-null counts for every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   number                  1095 non-null   int64  
 1   air_pressure_9am        1092 non-null   float64
 2   air_temp_9am            1090 non-null   float64
 3   avg_wind_direction_9am  1091 non-null   float64
 4   avg_wind_speed_9am      1092 non-null   float64
 5   max_wind_direction_9am  1092 non-null   float64
 6   max_wind_speed_9am      1091 non-null   float64
 7   rain_accumulation_9am   1089 non-null   float64
 8   rain_duration_9am       1092 non-null   float64
 9   relative_humidity_9am   1095 non-null   float64
 10  relative_humidity_3pm   1095 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 94.2 KB


In [7]:
# Show the first few rows that contain at least one missing value,
# to confirm that the dataset does have nulls to deal with.
data.loc[data.isna().any(axis="columns")].head()

Unnamed: 0,number,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm
16,16,917.89,,169.2,2.192201,196.8,2.930391,0.0,0.0,48.99,51.19
111,111,915.29,58.82,182.6,15.613841,189.0,,0.0,0.0,21.5,29.69
177,177,915.9,,183.3,4.719943,189.9,5.346287,0.0,0.0,29.26,46.5
262,262,923.596607,58.380598,47.737753,10.636273,67.145843,13.671423,0.0,,17.990876,16.461685
277,277,920.48,62.6,194.4,2.751436,,3.869906,0.0,0.0,52.58,54.03



Data Cleaning Steps


Data cleaning: the number column only holds a unique identifier for each row, so it cannot help the model make any decision and is removed.

In [8]:
# Data cleaning: the 'number' column only holds a unique identifier per row,
# so it carries no information useful for making a decision.
# drop(..., errors='ignore') keeps this cell safe to re-run; the previous
# `del data['number']` raised KeyError when the cell was executed a second time.
data = data.drop(columns=['number'], errors='ignore')

In [9]:
# Discard every row that has a NaN in any column.
data = data.dropna(axis=0, how='any')

In [10]:
data.shape
# The row count dropped from 1095 to 1064: the 31 rows containing NaN were removed.

(1064, 10)

In [11]:
# Work on a copy of the cleaned data and add a binary target column:
# 1 when relative humidity at 3pm is above 20, otherwise 0.
# (This labels every row; no rows are filtered out.)
c_data = data.assign(
    h_humidity_label=lambda frame: (frame['relative_humidity_3pm'] > 20) * 1
)
c_data['h_humidity_label'].head()

0    1
1    0
2    0
3    0
4    1
Name: h_humidity_label, dtype: int32

In [12]:
# Target: the binary humidity label, kept as a one-column DataFrame
# so it can later be indexed by column name.
y = c_data.loc[:, ['h_humidity_label']].copy()
y.head()

Unnamed: 0,h_humidity_label
0,1
1,0
2,0
3,0
4,1


Using 9am Sensor Signals as Features to Predict Humidity at 3pm


Store all the 9am (morning) sensor readings in the Features list; relative humidity at 3pm is excluded because it is the quantity being predicted.

In [13]:
# Predictor columns: every 9am measurement. relative_humidity_3pm is left out
# on purpose because the target label is derived from it.
Features = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
    'relative_humidity_9am',
]

In [14]:
# Feature matrix: an independent copy of just the 9am predictor columns.
x = c_data.loc[:, Features].copy()
x.columns

Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am'],
      dtype='object')

In [15]:
# Confirm that y holds only the label column.
y.columns

Index(['h_humidity_label'], dtype='object')

Perform Test and Train split


Using train_test_split, we split the data into training and testing datasets.

In [16]:

# Hold out half of the rows for testing. The rows are shuffled before the
# split; fixing random_state at 100 makes that shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.50,
    random_state=100,
)




Fit on Train Set


We create a decision tree classifier and fit it on the training data.

In [29]:
# Fit a decision tree on the training set.
# max_leaf_nodes=20 caps the number of leaves the tree may grow;
# random_state=10 fixes the estimator's seed so the fit is reproducible.
humidity_classifier = DecisionTreeClassifier(
    max_leaf_nodes=20,
    random_state=10,
).fit(x_train, y_train)
humidity_classifier


DecisionTreeClassifier(max_leaf_nodes=20, random_state=10)

In [18]:
# Confirm the type of the fitted classifier object.
type(humidity_classifier)


sklearn.tree._classes.DecisionTreeClassifier


Predict on Test Set


Using humidity_classifier we have predicted the value for the X_test and stored it to y_predicted

In [19]:
# Predict the humidity label for every row of the held-out test features
# and keep the predictions in y_predicted.
y_predicted = humidity_classifier.predict(x_test)



In [20]:
# First 20 predicted labels, for comparison with the true labels shown next.
y_predicted[:20]

array([1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1])

In [21]:
# First 20 true labels of the test set. The index values look out of order
# because train_test_split shuffled the rows before splitting.
y_test['h_humidity_label'].head(20)

293     1
1012    0
175     1
734     1
719     1
341     1
738     0
1052    0
404     1
590     1
284     1
228     1
383     0
225     1
313     1
597     0
136     0
269     0
120     1
739     1
Name: h_humidity_label, dtype: int32

Measure Accuracy of the Classifier


Checking our accuracy of the model using accuracy_score function from sklearn metrics which in this case is with around 87% accuracy



In [22]:
# Accuracy of the model on the test set, as a percentage, computed with
# accuracy_score from sklearn.metrics (about 87% in the recorded run).
100 * accuracy_score(y_true=y_test, y_pred=y_predicted)



87.03007518796993

In [23]:
# Report the accuracy computed from the current predictions rather than a
# hard-coded number, so the message cannot go stale when the data, split or
# model parameters change.
print(f"the accuracy score is {accuracy_score(y_test, y_predicted) * 100:.4f}")

the accuracy score is 87.0300


Entropy Method

In [24]:
# Refit with the entropy criterion (same leaf cap and seed) to compare it
# against the previous model. Note: this rebinds humidity_classifier.
humidity_classifier = DecisionTreeClassifier(
    criterion='entropy',
    max_leaf_nodes=20,
    random_state=10,
).fit(x_train, y_train)
humidity_classifier

DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=20, random_state=10)

Accuracy

In [25]:
# Predict on the test features with the entropy-based tree and report its
# accuracy as a percentage.
y_predicted = humidity_classifier.predict(x_test)
100 * accuracy_score(y_true=y_test, y_pred=y_predicted)

85.15037593984962

#Comparing the gini and entropy models for the same number of leaf nodes and the same random state: the gini model performs slightly better than the entropy model at predicting humidity at 3pm (87.03% vs 85.15% accuracy on the test set).
#The most important feature in determining the humidity condition is x8, i.e. relative_humidity_9am (the relative humidity at 9am).