In [1]:
!unzip "/content/archive (25).zip"

Archive:  /content/archive (25).zip
  inflating: seattle-weather.csv     


In [4]:
# import req lib

import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('/content/seattle-weather.csv')
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [6]:
# Checking the size of the data

df.shape

(1461, 6)

In [7]:
# Checking for dtype

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [8]:
# Checking for null values

df.isnull().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [9]:
# descriptive stat

df.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [11]:
# remove the un-wanted column

df.drop('date',axis=1,inplace=True)

In [None]:
# df = df.drop('date',axis=1)

In [12]:
df

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.0,12.8,5.0,4.7,drizzle
1,10.9,10.6,2.8,4.5,rain
2,0.8,11.7,7.2,2.3,rain
3,20.3,12.2,5.6,4.7,rain
4,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...
1456,8.6,4.4,1.7,2.9,rain
1457,1.5,5.0,1.7,1.3,rain
1458,0.0,7.2,0.6,2.6,fog
1459,0.0,5.6,-1.0,3.4,sun


In [19]:
# encoding

df['weather'].unique()

array(['drizzle', 'rain', 'sun', 'snow', 'fog'], dtype=object)

In [20]:
df['weather'].value_counts()

rain       641
sun        640
fog        101
drizzle     53
snow        26
Name: weather, dtype: int64

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
le = LabelEncoder()

In [24]:
df['weather'] = le.fit_transform(df['weather'])

In [25]:
df.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.0,12.8,5.0,4.7,0
1,10.9,10.6,2.8,4.5,2
2,0.8,11.7,7.2,2.3,2
3,20.3,12.2,5.6,4.7,2
4,1.3,8.9,2.8,6.1,2


In [26]:
# spliting the dependent and independent

x = df.drop('weather',axis=1)
y = df['weather']

In [27]:
x

Unnamed: 0,precipitation,temp_max,temp_min,wind
0,0.0,12.8,5.0,4.7
1,10.9,10.6,2.8,4.5
2,0.8,11.7,7.2,2.3
3,20.3,12.2,5.6,4.7
4,1.3,8.9,2.8,6.1
...,...,...,...,...
1456,8.6,4.4,1.7,2.9
1457,1.5,5.0,1.7,1.3
1458,0.0,7.2,0.6,2.6
1459,0.0,5.6,-1.0,3.4


In [28]:
y

0       0
1       2
2       2
3       2
4       2
       ..
1456    2
1457    2
1458    1
1459    4
1460    4
Name: weather, Length: 1461, dtype: int64

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3,random_state=12)

In [31]:
print("Shape of xtrain is {}".format(xtrain.shape))
print("Shape of xtest is {}".format(xtest.shape))
print("Shape of ytrain is {}".format(ytrain.shape))
print("Shape of ytest is {}".format(ytest.shape))

Shape of xtrain is (1022, 4)
Shape of xtest is (439, 4)
Shape of ytrain is (1022,)
Shape of ytest is (439,)


In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# DT

In [33]:
dt = DecisionTreeClassifier()

In [35]:
dt.fit(xtrain,ytrain)

In [36]:
dtYPred = dt.predict(xtest)

In [37]:
from sklearn.metrics import classification_report,confusion_matrix

In [38]:
print(classification_report(ytest,dtYPred))

              precision    recall  f1-score   support

           0       0.10      0.09      0.10        11
           1       0.44      0.29      0.35        38
           2       0.92      0.92      0.92       207
           3       0.62      0.56      0.59         9
           4       0.76      0.83      0.79       174

    accuracy                           0.80       439
   macro avg       0.57      0.54      0.55       439
weighted avg       0.79      0.80      0.79       439



# RF

In [39]:
rf = RandomForestClassifier()

In [40]:
rf.fit(xtrain,ytrain)

In [41]:
rfYPred = rf.predict(xtest)

In [42]:
print(classification_report(ytest,rfYPred))

              precision    recall  f1-score   support

           0       0.20      0.09      0.13        11
           1       0.50      0.11      0.17        38
           2       0.96      0.92      0.94       207
           3       0.83      0.56      0.67         9
           4       0.75      0.95      0.84       174

    accuracy                           0.83       439
   macro avg       0.65      0.52      0.55       439
weighted avg       0.81      0.83      0.81       439



In [43]:
confusion_matrix(ytest,rfYPred)

array([[  1,   0,   1,   0,   9],
       [  1,   4,   1,   0,  32],
       [  0,   0, 191,   1,  15],
       [  0,   0,   4,   5,   0],
       [  3,   4,   2,   0, 165]])

#KNN

In [45]:
knn = KNeighborsClassifier()

In [47]:
knn.fit(xtrain,ytrain)

In [48]:
kYPred = knn.predict(xtest)

In [49]:
print(classification_report(ytest,kYPred))

              precision    recall  f1-score   support

           0       0.11      0.09      0.10        11
           1       0.45      0.13      0.20        38
           2       0.87      0.81      0.84       207
           3       0.67      0.44      0.53         9
           4       0.70      0.89      0.79       174

    accuracy                           0.76       439
   macro avg       0.56      0.47      0.49       439
weighted avg       0.75      0.76      0.74       439



In [50]:
confusion_matrix(ytest,kYPred)

array([[  1,   1,   0,   0,   9],
       [  4,   5,   6,   0,  23],
       [  3,   3, 168,   1,  32],
       [  0,   0,   4,   4,   1],
       [  1,   2,  15,   1, 155]])

In [51]:
import pickle

In [52]:
pickle.dump(rf,open('rfmodel','wb'))