In [1]:
#importing all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
#importing the data
df = pd.read_csv('seattle-weather.csv')

In [3]:
#viewing the head of the data
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [4]:
#getting the info about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [5]:
#checking for any null values
df.isnull().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [6]:
#checking all the unique data in the weather column
df['weather'].unique()

array(['drizzle', 'rain', 'sun', 'snow', 'fog'], dtype=object)

In [7]:
#performing featuring engineering on the predictions data to have numeric values to insert into the model
weather_values = {'drizzle':1,'rain':2,'sun':3,'snow':4,'fog':5}

In [8]:
#dropping the date column 
df.drop('date', inplace=True,axis=1)

In [9]:
#creating a new column from the new features
df['nweather']= df['weather'].map(weather_values)
df

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather,nweather
0,0.0,12.8,5.0,4.7,drizzle,1
1,10.9,10.6,2.8,4.5,rain,2
2,0.8,11.7,7.2,2.3,rain,2
3,20.3,12.2,5.6,4.7,rain,2
4,1.3,8.9,2.8,6.1,rain,2
...,...,...,...,...,...,...
1456,8.6,4.4,1.7,2.9,rain,2
1457,1.5,5.0,1.7,1.3,rain,2
1458,0.0,7.2,0.6,2.6,fog,5
1459,0.0,5.6,-1.0,3.4,sun,3


In [10]:
#dropping the weather column
df.drop('weather', axis=1, inplace=True)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
#Assigning the input variable to X
X = df.drop('nweather', axis=1)

In [13]:
#Assigning the output variable to y
y = df['nweather']

In [14]:
#Assigning the train and test data to the input and output variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()

In [17]:
scaler.fit_transform(X_train,y_train)

array([[ 0.46133235, -1.56004122, -1.09616752,  3.02943991],
       [-0.15455155, -1.18016631, -1.09616752, -1.70932744],
       [-0.45498272,  1.00411442,  1.00477618, -0.73369887],
       ...,
       [-0.37987493, -0.12194335, -0.21577207, -0.80338663],
       [-0.45498272, -1.47863945, -1.85650905,  0.10255419],
       [-0.45498272, -1.26156808, -0.99612258, -0.94276214]])

# Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
#creating an object for the linear regression model
model = LinearRegression()

In [20]:
#fitting the data to the model
model.fit(X_train,y_train)

In [21]:
#predicting the output of the data
pred = model.predict(X_test).round()

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

In [23]:
print(classification_report(y_test,pred))
print(confusion_matrix(y_test,pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        14
           2       0.88      0.60      0.72       192
           3       0.61      0.96      0.74       193
           4       0.00      0.00      0.00         8
           5       0.00      0.00      0.00        32

    accuracy                           0.69       439
   macro avg       0.30      0.31      0.29       439
weighted avg       0.65      0.69      0.64       439

[[  0   1  13   0   0]
 [  2 116  74   0   0]
 [  0   8 185   0   0]
 [  0   4   4   0   0]
 [  0   3  29   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
#creating an object for the logistic regression model
Lmodel = LogisticRegression()

In [26]:
#fitting the data to the model
Lmodel.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
#predicting the results of the model
Lpred=Lmodel.predict(X_test).round()

In [31]:
#printing out the classification report and confusion matrix of the model
print(classification_report(y_test,Lpred))
print(confusion_matrix(y_test,Lpred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        14
           2       0.96      0.92      0.94       192
           3       0.76      1.00      0.86       193
           4       1.00      0.12      0.22         8
           5       0.00      0.00      0.00        32

    accuracy                           0.85       439
   macro avg       0.54      0.41      0.41       439
weighted avg       0.77      0.85      0.80       439

[[  0   0  14   0   0]
 [  0 177  15   0   0]
 [  0   0 193   0   0]
 [  0   7   0   1   0]
 [  0   0  32   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Decision Tree Classifier

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
dtree = DecisionTreeClassifier()

In [34]:
dtree.fit(X_train,y_train)

In [36]:
dPred=dtree.predict(X_test)

In [38]:
print(classification_report(y_test,dPred))
print(confusion_matrix(y_test,dPred))

              precision    recall  f1-score   support

           1       0.04      0.07      0.05        14
           2       0.88      0.90      0.89       192
           3       0.79      0.72      0.75       193
           4       0.22      0.25      0.24         8
           5       0.27      0.28      0.28        32

    accuracy                           0.73       439
   macro avg       0.44      0.44      0.44       439
weighted avg       0.76      0.73      0.74       439

[[  1   1   9   0   3]
 [  0 172  11   7   2]
 [ 22  14 138   0  19]
 [  0   6   0   2   0]
 [  3   3  17   0   9]]
