# Task 2.2 - CNN Convulusion Neural Network - CLimate Wins

## This script contains the following:
#### 1. Import Libraries and Data
- tensorflow and keras
- Data: unscaled weather observations, pleasant weather predictions
#### 2. Data Wrangling
-Export adjusted data set to your data folder as a “Cleaned” version
#### 3. Reshaping for modeling
a) Data Split
#### 4. Create Keras Model (CNN)
#### 6. Compile and Run
#### 6. Create Confusion Matrix

## 1. Import Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import operator
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers
from numpy import unique
from numpy import reshape
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Conv1D, Conv2D, Dense, Dropout, BatchNormalization, Flatten, MaxPooling1D


In [2]:
# import path
path = r'/Users/nancykray/Desktop/CF/Machine Learning /ClimateWins/Data Sets'

In [3]:
# Import unscaled weather observations data (unscaled)

unscaled = pd.read_csv(os.path.join(path, 'DATASET weather_prediction_dataset_processed.csv'), index_col = False)

In [4]:
# Import predictions data (answers)

answers = pd.read_csv(os.path.join(path, 'Dataset-Answers-Weather_Prediction_Pleasant_Weather.csv'), index_col = False)


### UNSCALED:

In [5]:
unscaled.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [6]:
unscaled.shape

(22950, 170)

### ANSWERS:

In [7]:
answers.head()

Unnamed: 0,DATE,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,19600101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,19600102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,19600103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,19600104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,19600105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
answers.shape

(22950, 16)

## 2. Data Wrangling
Unscaled Data:

- a) drop date and month
- b) drop 3 weather stations as all observations need to be same length (already removed from answers data)
- c) Two types of observations are missing multiple years for most weather stations. Remove them.
- d) There are three individual observations that need to be filled in. Assume nearby stations have similar weather, then pick one to copy the data from (Ljubljana is near Kassel, Sonnblick is near Munchen, and Olso is close enough to Stockholm).
  
Answers Data:
- e) drop date

When complete, your X shape should be (22950, 135), and your y shape should be (22950, 15).

In [9]:
# a) drop columns, 'date' and 'month' from unscaled data
unscaled.drop(columns=['DATE', 'MONTH'], inplace=True)


In [10]:
# b) Drop 3 weather stations: all columns related to Tours, Gdansk and Rome from the unscaled dataset

unscaled = unscaled.drop(['GDANSK_cloud_cover', 'GDANSK_humidity', 'GDANSK_precipitation', 'GDANSK_snow_depth', 'GDANSK_temp_mean', 'GDANSK_temp_min', 'GDANSK_temp_max',
                        'ROMA_cloud_cover', 'ROMA_wind_speed', 'ROMA_humidity', 'ROMA_pressure', 'ROMA_sunshine', 'ROMA_temp_mean',
                        'TOURS_wind_speed', 'TOURS_humidity', 'TOURS_pressure', 'TOURS_global_radiation', 'TOURS_precipitation', 'TOURS_temp_mean', 'TOURS_temp_min', 'TOURS_temp_max'], axis=1)

In [11]:
unscaled.columns

Index(['BASEL_cloud_cover', 'BASEL_wind_speed', 'BASEL_humidity',
       'BASEL_pressure', 'BASEL_global_radiation', 'BASEL_precipitation',
       'BASEL_snow_depth', 'BASEL_sunshine', 'BASEL_temp_mean',
       'BASEL_temp_min',
       ...
       'VALENTIA_cloud_cover', 'VALENTIA_humidity', 'VALENTIA_pressure',
       'VALENTIA_global_radiation', 'VALENTIA_precipitation',
       'VALENTIA_snow_depth', 'VALENTIA_sunshine', 'VALENTIA_temp_mean',
       'VALENTIA_temp_min', 'VALENTIA_temp_max'],
      dtype='object', length=147)

In [12]:
missing_counts = unscaled.isnull().sum()
print(missing_counts)


BASEL_cloud_cover         0
BASEL_wind_speed          0
BASEL_humidity            0
BASEL_pressure            0
BASEL_global_radiation    0
                         ..
VALENTIA_snow_depth       0
VALENTIA_sunshine         0
VALENTIA_temp_mean        0
VALENTIA_temp_min         0
VALENTIA_temp_max         0
Length: 147, dtype: int64


In [13]:
# c) in order to find the two types of observations missing multiple years for most weather stations;

# Extract the different observation types

observation_types = ['cloud_cover', 'wind_speed', 'humidity', 'pressure',
                     'global_radiation', 'precipitation', 'snow_depth', 
                     'sunshine', 'temp_mean', 'temp_min', 'temp_max']

In [14]:
# c) Create a dictionary to store the count of stations for each observation type
station_counts = {}

for obs in observation_types:
    # Select columns related to the current observation type
    columns = [col for col in unscaled.columns if col.endswith(obs)]
    
    # Count the number of stations (i.e., the number of columns) for the current observation type
    station_counts[obs] = len(columns)

# Print the count of stations for each observation type
print("Number of stations covered by each observation type:")
for obs, count in station_counts.items():
    print(f"{obs}: {count} stations")

Number of stations covered by each observation type:
cloud_cover: 14 stations
wind_speed: 9 stations
humidity: 14 stations
pressure: 14 stations
global_radiation: 15 stations
precipitation: 15 stations
snow_depth: 6 stations
sunshine: 15 stations
temp_mean: 15 stations
temp_min: 15 stations
temp_max: 15 stations


### The two observations with the lowest counts are 'wind_speed' with only 9 stations recorded and 'snow_depth with only 6

In [15]:
# c) Drop columns that end with 'wind_speed' and 'snow_depth'

columns_to_drop = unscaled.filter(regex='(_wind_speed|_snow_depth)$').columns
columns_to_drop

Index(['BASEL_wind_speed', 'BASEL_snow_depth', 'DEBILT_wind_speed',
       'DUSSELDORF_wind_speed', 'DUSSELDORF_snow_depth', 'HEATHROW_snow_depth',
       'KASSEL_wind_speed', 'LJUBLJANA_wind_speed', 'MAASTRICHT_wind_speed',
       'MADRID_wind_speed', 'MUNCHENB_snow_depth', 'OSLO_wind_speed',
       'OSLO_snow_depth', 'SONNBLICK_wind_speed', 'VALENTIA_snow_depth'],
      dtype='object')

In [16]:
# c)
unscaled = unscaled.drop(columns=columns_to_drop)

In [17]:
unscaled.shape 

# c) Correctly dropped total of 15 columns,  but now I have 3 columns short of what is expected (135)

(22950, 132)

In [18]:
# Add three new columns cloud_cover, pressure and humidity for three locations and copy the values from the closest weather stations
unscaled[['KASSEL_cloud_cover', 'MUNCHENB_pressue', 'STOCKHOLM_humidity']]=unscaled[['LJUBLJANA_cloud_cover', 'SONNBLICK_pressure', 'OSLO_humidity']].copy()

In [19]:
unscaled.shape

# ok, this should do it

(22950, 135)

In [20]:
#e) drop the 'DATE' column from answers dataset

answers.drop(columns=['DATE'], inplace=True)


In [21]:
answers.shape

# this is the expected shape of this dataframe!

(22950, 15)

### Export cleaned data sets:
- unscaled_clean (x)
- answers_clean (Y)

In [22]:
# Export cleaned unscaled weather data set
unscaled.to_csv(os.path.join(path, 'Prepared Data','unscaled_cleaned.csv'))


In [23]:
# Export cleaned answers pleasant weather data set
answers.to_csv(os.path.join(path, 'Prepared Data', 'answers_cleaned.csv'))

## 3) Reshaping for Modeling

Ensure the layers can be fed to the deep learning model correctly. You’ll need to split the observations (X) into 15 groups of 9 types of observations, and your labels (y) should also be in 15 groups (it doesn’t need to be transformed or reshaped). The final shapes should be X = (22950, 15, 9) and y = (22950, 15).

When reshaping a 3-D object, you can use the following code X = X.reshape(-1,15,9), where -1 means “the shape that fits with the rest.”

In [3]:
# import clean X dataset
X = pd.read_csv(os.path.join(path, 'Prepared Data', 'unscaled_cleaned.csv'), index_col = 0)

In [4]:
X

Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,...,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max,KASSEL_cloud_cover,MUNCHENB_pressue,STOCKHOLM_humidity
0,7,0.85,1.0180,0.32,0.09,0.7,6.5,0.8,10.9,1,...,1.0003,0.45,0.34,4.7,8.5,6.0,10.9,8,1.0304,0.98
1,6,0.84,1.0180,0.36,1.05,1.1,6.1,3.3,10.1,6,...,1.0007,0.25,0.84,0.7,8.9,5.6,12.1,6,1.0292,0.62
2,8,0.90,1.0180,0.18,0.30,0.0,8.5,5.1,9.9,6,...,1.0096,0.17,0.08,0.1,10.5,8.1,12.9,8,1.0320,0.69
3,3,0.92,1.0180,0.58,0.00,4.1,6.3,3.8,10.6,8,...,1.0184,0.13,0.98,0.0,7.4,7.3,10.6,6,1.0443,0.98
4,6,0.95,1.0180,0.65,0.14,5.4,3.0,-0.7,6.0,8,...,1.0328,0.46,0.00,5.7,5.7,3.0,8.4,7,1.0430,0.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,1,0.79,1.0248,1.34,0.22,7.7,15.9,11.4,21.4,2,...,1.0142,1.13,0.41,3.4,10.7,7.9,13.5,4,1.0263,0.98
22946,6,0.77,1.0244,1.34,0.22,5.4,16.7,14.3,21.9,0,...,1.0142,1.13,0.41,3.4,10.7,7.9,13.5,3,1.0263,1.00
22947,4,0.76,1.0227,1.34,0.22,6.1,16.7,13.1,22.4,2,...,1.0142,1.13,0.41,3.4,10.7,7.9,13.5,3,1.0263,0.85
22948,5,0.80,1.0212,1.34,0.22,5.8,15.4,11.6,21.1,1,...,1.0142,1.13,0.41,3.4,10.7,7.9,13.5,3,1.0263,0.94


In [5]:
# import clean Y dataset
y = pd.read_csv(os.path.join(path, 'Prepared Data', 'answers_cleaned.csv'), index_col = 0)

In [6]:
y

Unnamed: 0,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22946,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22947,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
X = np.array(X)
y = np.array(y)

In [8]:
 # re-shape 3-D object using the below code, where -1 means "the shape that fits with the rest"

X = X.reshape(-1,15,9)

In [9]:
# check the shape
X

array([[[ 7.0000e+00,  8.5000e-01,  1.0180e+00, ...,  6.5000e+00,
          8.0000e-01,  1.0900e+01],
        [ 1.0000e+00,  8.1000e-01,  1.0195e+00, ...,  3.7000e+00,
         -9.0000e-01,  7.9000e+00],
        [ 4.0000e+00,  6.7000e-01,  1.0170e+00, ...,  2.4000e+00,
         -4.0000e-01,  5.1000e+00],
        ...,
        [ 1.0304e+00,  4.8000e-01,  1.0000e-02, ..., -3.2000e+00,
          5.0000e+00,  1.0114e+00],
        [ 5.0000e-02,  3.2000e-01,  0.0000e+00, ...,  5.0000e+00,
          8.8000e-01,  1.0003e+00],
        [ 4.5000e-01,  3.4000e-01,  4.7000e+00, ...,  8.0000e+00,
          1.0304e+00,  9.8000e-01]],

       [[ 6.0000e+00,  8.4000e-01,  1.0180e+00, ...,  6.1000e+00,
          3.3000e+00,  1.0100e+01],
        [ 6.0000e+00,  8.4000e-01,  1.0172e+00, ...,  2.9000e+00,
          2.2000e+00,  4.4000e+00],
        [ 4.0000e+00,  6.7000e-01,  1.0170e+00, ...,  2.3000e+00,
          1.4000e+00,  3.1000e+00],
        ...,
        [ 1.0292e+00,  2.1000e-01,  6.1000e-01, ..., -

## 3a) Split the data
Remember: The final shapes should be X = (22950, 15, 9) and y = (22950, 15).


In [10]:
X.shape

(22950, 15, 9)

In [11]:
y.shape

(22950, 15)

In [12]:
# Split data into train and test sets (picked random number as examples varied from .2 to 42)

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 2)


In [13]:
#  Print the overall shapes of the training and test sets
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(17212, 15, 9) (17212, 15)
(5738, 15, 9) (5738, 15)


## 4. Create Keras Model (CNN)

In [49]:
epochs = 20
batch_size = 15
n_hidden = 100

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

model = Sequential() #specify the model you want to use
model.add(Conv1D(n_hidden, kernel_size=3, activation='relu', input_shape=(timesteps, input_dim)))    
model.add(Dense(16, activation='relu'))  #should we change this 16 to 1 ?             
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='softmax')) # Options: sigmoid, tanh, softmax, relu
                                                  # for the above softmax, you need a min of 3 for model to work

#note: 2 is the smallest kernel_size you can go, although 3 & 5 are more popular (I used 3)

In [50]:
# to see the model summary
model.summary()

## 6. Compile and Run

In [51]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [52]:
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=2)

Epoch 1/20
1148/1148 - 1s - 556us/step - accuracy: 0.1044 - loss: 36423.1562
Epoch 2/20
1148/1148 - 0s - 335us/step - accuracy: 0.1129 - loss: 391211.3438
Epoch 3/20
1148/1148 - 0s - 333us/step - accuracy: 0.1145 - loss: 1290038.7500
Epoch 4/20
1148/1148 - 0s - 334us/step - accuracy: 0.1154 - loss: 2756290.2500
Epoch 5/20
1148/1148 - 0s - 333us/step - accuracy: 0.1120 - loss: 4814095.0000
Epoch 6/20
1148/1148 - 0s - 335us/step - accuracy: 0.1136 - loss: 7543730.0000
Epoch 7/20
1148/1148 - 0s - 333us/step - accuracy: 0.1193 - loss: 10926854.0000
Epoch 8/20
1148/1148 - 0s - 337us/step - accuracy: 0.1160 - loss: 14668865.0000
Epoch 9/20
1148/1148 - 0s - 335us/step - accuracy: 0.1182 - loss: 19490108.0000
Epoch 10/20
1148/1148 - 0s - 336us/step - accuracy: 0.1200 - loss: 24794404.0000
Epoch 11/20
1148/1148 - 0s - 336us/step - accuracy: 0.1158 - loss: 31289754.0000
Epoch 12/20
1148/1148 - 0s - 336us/step - accuracy: 0.1187 - loss: 38690840.0000
Epoch 13/20
1148/1148 - 0s - 363us/step - accu

<keras.src.callbacks.history.History at 0x340c36bd0>

In [60]:
# Define list of stations names

stations = {
0: 'BASEL',
1: 'BELGRADE',
2: 'BUDAPEST',
3: 'DEBILT',
4: 'DUSSELDORF',
5: 'HEATHROW',
6: 'KASSEL',
7: 'LJUBLJANA',
8: 'MAASTRICHT',
9: 'MADRID',
10: 'MUNCHENB',
11: 'OSLO',
12: 'SONNBLICK',
13: 'STOCKHOLM',
14: 'VALENTIA'

}

## 6. Create Confusion Matrix

In [62]:
def confusion_matrix(Y_true, Y_pred):
    Y_true = pd.Series([stations[y] for y in np.argmax(Y_true, axis=1)])
    Y_pred = pd.Series([stations[y] for y in np.argmax(Y_pred, axis=1)])

    return pd.crosstab(Y_true, Y_pred, rownames=['True'], colnames=['Pred'])

In [64]:
# Evaluate

print(confusion_matrix(y_test, model.predict(X_test)))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 292us/step
Pred        BASEL  BELGRADE  BUDAPEST  DUSSELDORF  HEATHROW  KASSEL  \
True                                                                  
BASEL        1257        24      1491         397         1     154   
BELGRADE       94        20       896          41         0       3   
BUDAPEST       19         2       129          17         0       0   
DEBILT         11         0        50          14         0       0   
DUSSELDORF      7         0        17          10         0       0   
HEATHROW       23         0        38          22         0       0   
KASSEL          0         0        12           3         0       0   
LJUBLJANA      12         0        37           4         0       0   
MAASTRICHT      3         0         3           2         0       0   
MADRID        144         0       154          57         0       1   
MUNCHENB        2         0         9           1         0       0   


### The model is only recognizing 12 of the 15 stations