# Weather Classification

## Import the relevant libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set the styles to Seaborn
sns.set()
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [4]:
# Load the data
data = pd.read_csv ('seattle-weather.csv')
data

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...,...
1456,2015-12-27,8.6,4.4,1.7,2.9,rain
1457,2015-12-28,1.5,5.0,1.7,1.3,rain
1458,2015-12-29,0.0,7.2,0.6,2.6,fog
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun


In [5]:
lDrop = ['drizzle', 'snow']
data = data[~data['weather'].isin(lDrop)]
data['targets'] = data['weather'].map({'rain':0, 'sun':1, 'fog':2})
data = data.drop(['date', 'weather'], axis=1)
data.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


precipitation    float64
temp_max         float64
temp_min         float64
wind             float64
targets            int64
dtype: object

In [6]:
freq = data['targets'].value_counts() 
print(freq) 

0    641
1    640
2    101
Name: targets, dtype: int64


In [5]:
unscaled_inputs_all = data.iloc[:,:-1]
targets_all = data.iloc[:,-1]

### Balance the Dataset

In [7]:
threeTargNum = 26
# Set a counter for the other targets
zeroTargCount = oneTargCount = twoTargCount = fourTargCount = 0

# We want to create a "balanced" dataset, so we will have to remove some input/target pairs.
# Declare a variable that will do that:
twoeTargNum = 26
# Set a counter for the other targets
zeroTargCount = oneTargCount = 0

# We want to create a "balanced" dataset, so we will have to remove some input/target pairs.
# Declare a variable that will do that:
indices_to_remove = []

# Count the number of targets that are 0. 
# Once there are as many 0s as 1s, mark entries where the target is 0.
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
            if zeroTargCount >= twoeTargNum:
                indices_to_remove.append(i)
            else:
                zeroTargCount += 1
    elif targets_all[i] ==  1:
            if oneTargCount >= twoeTargNum:
                indices_to_remove.append(i)
            else:
                oneTargCount += 1

# Create two new variables, one that will contain the inputs, and one that will contain the targets.
# We delete all indices that we marked "to remove" in the loop above.
x = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
y = np.delete(targets_all, indices_to_remove, axis=0)

In [6]:
x = data.iloc[:,:-1]
y = data.iloc[:,-1]
freq = y.value_counts()
print(freq)

0    641
1    640
2    101
Name: targets, dtype: int64


### Split the dataset into train and test

In [7]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state=365)

## Create the Decision Tree

In [8]:
weather_classifier = DecisionTreeClassifier(max_leaf_nodes = 10, random_state = 0)
weather_classifier.fit(X_train, y_train)

DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)

In [9]:
y_predicted = weather_classifier.predict(X_test)

In [10]:
accuracy_score(y_test,y_predicted)*100

89.89169675090253

In [13]:
print(classification_report(y_test, y_predicted, target_names=['rain','sun','fog']))

              precision    recall  f1-score   support

        rain       0.99      0.96      0.97       138
         sun       0.84      0.95      0.89       121
         fog       0.29      0.11      0.16        18

    accuracy                           0.90       277
   macro avg       0.71      0.67      0.68       277
weighted avg       0.88      0.90      0.89       277



In [14]:
confusion_matrix(y_test, y_predicted)

array([[132,   6,   0],
       [  1, 115,   5],
       [  0,  16,   2]], dtype=int64)