# Weather Classification

The data for this analysis has been downloades from the kaggle website though different csv files. The main purpose is to discover the relationship between some climatic variables such as temperature, pressure and humidity, in comparison with the type and severity of the weather. The data is cleaned, wrangled and then used to create different dataframes, visualizations and machine learning models. 

## Import the relevant libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import tensorflow as tf

# Set the styles to Seaborn
sns.set()

In [4]:
# Load the data
data = pd.read_csv ('seattle-weather.csv')
data

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain
...,...,...,...,...,...,...
1456,2015-12-27,8.6,4.4,1.7,2.9,rain
1457,2015-12-28,1.5,5.0,1.7,1.3,rain
1458,2015-12-29,0.0,7.2,0.6,2.6,fog
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun


In [5]:
lDrop = ['drizzle', 'snow']
data = data[~data['weather'].isin(lDrop)]
targets = data['weather'].map({'rain':0, 'sun':1, 'fog':2})
data = data.drop(['date', 'weather'], axis=1)
data['targets'] = targets
#data['targets'] = targets.astype(float)
data.dtypes

precipitation    float64
temp_max         float64
temp_min         float64
wind             float64
targets            int64
dtype: object

In [6]:
freq = data['targets'].value_counts() 
print(freq) 

0    641
1    640
2    101
Name: targets, dtype: int64


In [7]:
data.to_csv("weather_data.csv", header=False, index=False)

In [8]:
raw_csv_data = np.loadtxt('weather_data.csv',delimiter=',', usecols=range(5))
raw_csv_data

array([[10.9, 10.6,  2.8,  4.5,  0. ],
       [ 0.8, 11.7,  7.2,  2.3,  0. ],
       [20.3, 12.2,  5.6,  4.7,  0. ],
       ...,
       [ 0. ,  7.2,  0.6,  2.6,  2. ],
       [ 0. ,  5.6, -1. ,  3.4,  1. ],
       [ 0. ,  5.6, -2.1,  3.5,  1. ]])

In [9]:
unscaled_inputs_all = raw_csv_data[:,:-1]
targets_all = raw_csv_data[:,-1]

### Balance the Dataset

In [10]:
twoeTargNum = 26
# Set a counter for the other targets
zeroTargCount = oneTargCount = 0

# We want to create a "balanced" dataset, so we will have to remove some input/target pairs.
# Declare a variable that will do that:
indices_to_remove = []

# Count the number of targets that are 0. 
# Once there are as many 0s as 1s, mark entries where the target is 0.
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
            if zeroTargCount >= twoeTargNum:
                indices_to_remove.append(i)
            else:
                zeroTargCount += 1
    elif targets_all[i] ==  1:
            if oneTargCount >= twoeTargNum:
                indices_to_remove.append(i)
            else:
                oneTargCount += 1

# Create two new variables, one that will contain the inputs, and one that will contain the targets.
# We delete all indices that we marked "to remove" in the loop above.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

In [11]:
unscaled_inputs_equal_priors = unscaled_inputs_all
targets_equal_priors = targets_all

In [12]:
unique, counts = np.unique(targets_equal_priors, return_counts=True)
print (np.asarray((unique, counts)).T)

[[  0. 641.]
 [  1. 640.]
 [  2. 101.]]


### Standardize the inputs

In [13]:
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

### Shuffle the data

In [14]:
# When the data was collected it was actually arranged by date
# Shuffle the indices of the data, so the data is not arranged in any way when we feed it.
# Since we will be batching, we want the data to be as randomly spread out as possible
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

# Use the shuffled indices to shuffle the inputs and targets.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

### Split the dataset into train, validation, and test

In [15]:
# Count the total number of samples
samples_count = shuffled_inputs.shape[0]

# Count the samples in each subset, assuming we want 80-10-10 distribution of training, validation, and test.
# Naturally, the numbers are integers.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)

# The 'test' dataset contains all remaining data.
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Create variables that record the inputs and targets for training
# In our shuffled dataset, they are the first "train_samples_count" observations
train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

# Create variables that record the inputs and targets for validation.
# They are the next "validation_samples_count" observations, folllowing the "train_samples_count" we already assigned
validation_inputs = shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]

# Create variables that record the inputs and targets for test.
# They are everything that is remaining.
test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]

# We balanced our dataset to be 50-50 (for targets 0 and 1), but the training, validation, and test were 
# taken from a shuffled dataset. Check if they are balanced, too. Note that each time you rerun this code, 
# you will get different values, as each time they are shuffled randomly.
# Normally you preprocess ONCE, so you need not rerun this code once it is done.
# If you rerun this whole sheet, the npzs will be overwritten with your newly preprocessed data.

### Save the three datasets in *.npz

In [16]:
# Save the three datasets in *.npz.
# In the next lesson, you will see that it is extremely valuable to name them in such a coherent way!

np.savez('Weather_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Weather_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Weather_data_test', inputs=test_inputs, targets=test_targets)

## Create the machine learning algorithm

### Data

In [17]:
# let's create a temporary variable npz, where we will store each of the three Audiobooks datasets
npz = np.load('Weather_data_train.npz')

# we extract the inputs using the keyword under which we saved them
# to ensure that they are all floats, let's also take care of that
train_inputs = npz['inputs'].astype(np.float)
# targets must be int because of sparse_categorical_crossentropy (we want to be able to smoothly one-hot encode them)
train_targets = npz['targets'].astype(np.int)

# we load the validation data in the temporary variable
npz = np.load('Weather_data_validation.npz')
# we can load the inputs and the targets in the same line
validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

# we load the test data in the temporary variable
npz = np.load('Weather_data_test.npz')
# we create 2 variables that will contain the test inputs and the test targets
test_inputs, test_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  train_inputs = npz['inputs'].astype(np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  train_targets = npz['targets'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_inputs, test_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np

### Model
Outline, optimizers, loss, early stopping and training

In [23]:
# Set the input and output sizes
input_size = 4
output_size = 3
# Use same hidden layer size for both hidden layers. Not a necessity.
hidden_layer_size = 100
    
# define how the model will look like
model = tf.keras.Sequential([
    # tf.keras.layers.Dense is basically implementing: output = activation(dot(input, weight) + bias)
    # it takes several arguments, but the most important ones for us are the hidden_layer_size and the activation function
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    # the final layer is no different, we just make sure to activate it with softmax
    tf.keras.layers.Dense(output_size, activation='softmax') # output layer
])

# set the batch size
batch_size = 10

num_classes = 3

### Choose the optimizer and the loss function

# we define the optimizer we'd like to use, 
# the loss function, 
# and the metrics we are interested in obtaining at each iteration
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics = 'accuracy')
### Training
# That's where we train the model we have built.

# set a maximum number of training epochs
max_epochs = 100

# set an early stopping mechanism
# let's set patience=2, to be a bit tolerant against random validation loss increases
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

# fit the model
# note that this time the train, validation and test data are not iterable
model.fit(train_inputs, # train inputs
          train_targets, # train targets
          batch_size=batch_size, # batch size
          epochs=max_epochs, # epochs that we will train for (assuming early stopping doesn't kick in)
          # callbacks are functions called by a task when a task is completed
          # task here is to check if val_loss is increasing
          callbacks=[early_stopping], # early stopping
          validation_data=(validation_inputs, validation_targets), # validation data
          verbose = 1 # making sure we get enough information about the training process
          )  

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


<tensorflow.python.keras.callbacks.History at 0x29afccfd160>

## Test the model

In [24]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [25]:
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.45. Test accuracy: 86.33%
