# TASK 2.2 Keras CNN

#### Importing libraries and data
#### Import unscaled data and pleasant weather sets
#### Data wrangling
#### Reshaping for modeling
#### Data Split
#### Creating Keras model
#### Compiling and running
#### Creating Confusion Matrix

## Importing libraries

In [4]:
#Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from numpy import unique
from numpy import reshape
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Conv1D, Conv2D, Dense, BatchNormalization, Flatten, MaxPooling1D, Dropout
from keras.utils import to_categorical

In [5]:
#Define path
path = r'C:\Users\melan\OneDrive\Career Foundry\Machine Learning with Python\Data Sets'

In [6]:
path

'C:\\Users\\melan\\OneDrive\\Career Foundry\\Machine Learning with Python\\Data Sets'

In [7]:
#Import unscaled data
unscaled = pd.read_csv(os.path.join(path, 'Dataset-weather-prediction-dataset-processed.csv'))

In [8]:
# Import pleasant weather answers csv file
pleasant_weather = pd.read_csv(os.path.join(path,'Dataset-Answers-Weather_Prediction_Pleasant_Weather.csv'), index_col = False)

In [9]:
unscaled.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [10]:
unscaled.shape

(22950, 170)

In [11]:
pleasant_weather.head()

Unnamed: 0,DATE,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,19600101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,19600102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,19600103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,19600104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,19600105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
pleasant_weather.shape

(22950, 16)

## Data Wrangling

In [14]:
#Ensure the weather data is in a structured format with the correct shape to feed to the deep learning model
#Drop 3 weather stations not included in answers (GDANSK, ROMA, TOURS)
unscaled = unscaled.drop(['GDANSK_cloud_cover', 'GDANSK_humidity', 'GDANSK_precipitation', 'GDANSK_snow_depth', 'GDANSK_temp_mean', 'GDANSK_temp_min', 'GDANSK_temp_max',
                        'ROMA_cloud_cover', 'ROMA_wind_speed', 'ROMA_humidity', 'ROMA_pressure', 'ROMA_sunshine', 'ROMA_temp_mean',
                        'TOURS_wind_speed', 'TOURS_humidity', 'TOURS_pressure', 'TOURS_global_radiation', 'TOURS_precipitation', 'TOURS_temp_mean', 'TOURS_temp_min', 'TOURS_temp_max'], axis=1)

In [15]:
unscaled.columns

Index(['DATE', 'MONTH', 'BASEL_cloud_cover', 'BASEL_wind_speed',
       'BASEL_humidity', 'BASEL_pressure', 'BASEL_global_radiation',
       'BASEL_precipitation', 'BASEL_snow_depth', 'BASEL_sunshine',
       ...
       'VALENTIA_cloud_cover', 'VALENTIA_humidity', 'VALENTIA_pressure',
       'VALENTIA_global_radiation', 'VALENTIA_precipitation',
       'VALENTIA_snow_depth', 'VALENTIA_sunshine', 'VALENTIA_temp_mean',
       'VALENTIA_temp_min', 'VALENTIA_temp_max'],
      dtype='object', length=149)

In [16]:
#We should be showing 21 fewer columns now
unscaled.shape

(22950, 149)

In [17]:
# Extract observation types so we can see where there is missing data
observation_types = ['cloud_cover', 'wind_speed', 'humidity', 'pressure', 'global_radiation', 'precipitation', 'snow_depth', 'sunshine', 'temp_mean', 'temp_min', 'temp_max']

In [18]:
# Create a dictionary to store the count of stations for each observation type
station_counts = {}

for obs in observation_types:
    # Select columns related to the current observation type
    columns = [col for col in unscaled.columns if col.endswith(obs)]
    
    # Count the number of stations (i.e., the number of columns) for the current observation type
    station_counts[obs] = len(columns)

# Print the count of stations for each observation type
print("Number of stations covered by each observation type:")
for obs, count in station_counts.items():
    print(f"{obs}: {count} stations")

Number of stations covered by each observation type:
cloud_cover: 14 stations
wind_speed: 9 stations
humidity: 14 stations
pressure: 14 stations
global_radiation: 15 stations
precipitation: 15 stations
snow_depth: 6 stations
sunshine: 15 stations
temp_mean: 15 stations
temp_min: 15 stations
temp_max: 15 stations


In [19]:
#Above we can see wind_speed only has 9 stations and snow_depth only has 6!
#Let's drop those columns using the 'or' operator as there are multiple entries missing here
#View the dropped columns
dropped_columns = unscaled.filter(regex = '(_wind_speed|_snow_depth)$').columns
dropped_columns

Index(['BASEL_wind_speed', 'BASEL_snow_depth', 'DEBILT_wind_speed',
       'DUSSELDORF_wind_speed', 'DUSSELDORF_snow_depth', 'HEATHROW_snow_depth',
       'KASSEL_wind_speed', 'LJUBLJANA_wind_speed', 'MAASTRICHT_wind_speed',
       'MADRID_wind_speed', 'MUNCHENB_snow_depth', 'OSLO_wind_speed',
       'OSLO_snow_depth', 'SONNBLICK_wind_speed', 'VALENTIA_snow_depth'],
      dtype='object')

In [20]:
#Now let's actually drop those columns from the unscaled data set
unscaled = unscaled.drop(columns = dropped_columns)

In [21]:
#there should be 15 fewer columns as we dropped 9 wind_speed and 6 snow_depth
unscaled.shape

(22950, 134)

In [22]:
#Some observations were showing only 14 instead of 15 stations - cloud_cover, humidity & pressure
#let's list JUST the station names
all_stations = set([col.split('_')[0] for col in unscaled.columns if '_' in col])
all_stations

{'BASEL',
 'BELGRADE',
 'BUDAPEST',
 'DEBILT',
 'DUSSELDORF',
 'HEATHROW',
 'KASSEL',
 'LJUBLJANA',
 'MAASTRICHT',
 'MADRID',
 'MUNCHENB',
 'OSLO',
 'SONNBLICK',
 'STOCKHOLM',
 'VALENTIA'}

In [23]:
#let's deal with those 3 observation types that only have 14 instead of 15 stations
observation_types = ['cloud_cover', 'humidity', 'pressure']

missing_stations_by_observation = {}

for obs in observation_types:
    # Select columns related to the current observation type
    columns = [col for col in unscaled.columns if col.endswith(obs)]
    
    # Extract station names by removing the observation type from the column names
    station_names = set([col.replace(f'_{obs}', '') for col in columns])
    
    # Identify stations that are in all_stations but missing from the current observation type
    missing_stations = all_stations - station_names
    
    # Store the missing station names in the dictionary
    missing_stations_by_observation[obs] = missing_stations

# Print the missing station names for each observation type
for obs, missing_stations in missing_stations_by_observation.items():
    print(f"\nStations missing from {obs}:")
    if missing_stations:
        for station in missing_stations:
            print(station)
    else:
        print("None")


Stations missing from cloud_cover:
KASSEL

Stations missing from humidity:
STOCKHOLM

Stations missing from pressure:
MUNCHENB


In [24]:
#There are 3 observation stations that need to be filled in, we'll use the closest locations to each as recommended in the Task
#Kassel - Ljubljana
#Stockholm - Oslo
#Munchenb - Sonnblick
unscaled.columns.get_loc('LJUBLJANA_cloud_cover')

64

In [25]:
#for Stockholm - Oslo
unscaled.columns.get_loc('OSLO_humidity')

100

In [26]:
#for Munchenb - Sonnblick
unscaled.columns.get_loc('SONNBLICK_pressure')

110

In [27]:
# Now insert at the specified location
unscaled.insert(113, 'MUNCHENB_pressure', unscaled['SONNBLICK_pressure'])

# Now insert at specified location
unscaled.insert(102, 'STOCKHOLM_humidity', unscaled['OSLO_humidity'])

# Now insert at specified locaiton
unscaled.insert(65, 'KASSEL_cloud_cover', unscaled['LJUBLJANA_cloud_cover']) 

In [28]:
#We need THIS for 2.3... The dropped columns, but WITH the DATE! So let's export a version so we can use it
unscaled.to_csv(os.path.join(path, 'unscaled_withdate.csv'), index = False)

In [29]:
unscaled.shape

(22950, 137)

In [30]:
#Drop Date and Month columns
unscaled.drop(['DATE', 'MONTH'], axis = 1, inplace = True)

In [31]:
#we'll make sure this has the correct shape of 22950, 135 (this is the X shape)
unscaled.shape

(22950, 135)

In [32]:
pleasant_weather.drop(columns = 'DATE', inplace = True)

In [33]:
#we'll make sure the pleasant weather (predictions) (answers) dataset has a shape of 22950, 15 (this is the y shape)
pleasant_weather.shape

(22950, 15)

In [34]:
#Excellent! Let's export it!
unscaled.to_csv(os.path.join(path, 'unscaled_CNN.csv'), index = False)

In [35]:
#and the Y shape/pleasant weather /answers/predictions
pleasant_weather.to_csv(os.path.join(path, 'pleasant_weather_CNN.csv'), index = False)

## Reshaping for modeling

##### Ensure the layers can be fed to the deep learning model correctly.
##### You’ll need to split the observations (X) into 15 groups of 9 types of observations,
##### and your labels (y) should also be in 15 groups (it doesn’t need to be transformed or reshaped).
##### The final shapes should be X = (22950, 15, 9) and y = (22950, 15).

In [38]:
X = pd.read_csv(os.path.join(path, 'unscaled_CNN.csv'), index_col = False)

In [39]:
X

Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,...,STOCKHOLM_temp_max,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,7,0.85,1.0180,0.32,0.09,0.7,6.5,0.8,10.9,1,...,4.9,5,0.88,1.0003,0.45,0.34,4.7,8.5,6.0,10.9
1,6,0.84,1.0180,0.36,1.05,1.1,6.1,3.3,10.1,6,...,5.0,7,0.91,1.0007,0.25,0.84,0.7,8.9,5.6,12.1
2,8,0.90,1.0180,0.18,0.30,0.0,8.5,5.1,9.9,6,...,4.1,7,0.91,1.0096,0.17,0.08,0.1,10.5,8.1,12.9
3,3,0.92,1.0180,0.58,0.00,4.1,6.3,3.8,10.6,8,...,2.3,7,0.86,1.0184,0.13,0.98,0.0,7.4,7.3,10.6
4,6,0.95,1.0180,0.65,0.14,5.4,3.0,-0.7,6.0,8,...,4.3,3,0.80,1.0328,0.46,0.00,5.7,5.7,3.0,8.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,1,0.79,1.0248,1.34,0.22,7.7,15.9,11.4,21.4,2,...,14.2,5,0.82,1.0142,1.13,0.41,3.4,10.7,7.9,13.5
22946,6,0.77,1.0244,1.34,0.22,5.4,16.7,14.3,21.9,0,...,14.3,5,0.82,1.0142,1.13,0.41,3.4,10.7,7.9,13.5
22947,4,0.76,1.0227,1.34,0.22,6.1,16.7,13.1,22.4,2,...,14.4,5,0.82,1.0142,1.13,0.41,3.4,10.7,7.9,13.5
22948,5,0.80,1.0212,1.34,0.22,5.8,15.4,11.6,21.1,1,...,12.4,5,0.82,1.0142,1.13,0.41,3.4,10.7,7.9,13.5


In [40]:
y = pd.read_csv(os.path.join(path, 'pleasant_weather_CNN.csv'), index_col = False)

In [41]:
y

Unnamed: 0,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22945,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22946,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22947,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
# Turn X and y from a dataframe to arrays
X = np.array(X)
y = np.array(y)

In [43]:
#When reshaping a 3-D object, you can use the following code X = X.reshape(-1,15,9), where -1 means “the shape that fits with the rest.”
X = X.reshape(-1,15,9)

In [44]:
# Check the shape/array
X

array([[[  7.    ,   0.85  ,   1.018 , ...,   6.5   ,   0.8   ,
          10.9   ],
        [  1.    ,   0.81  ,   1.0195, ...,   3.7   ,  -0.9   ,
           7.9   ],
        [  4.    ,   0.67  ,   1.017 , ...,   2.4   ,  -0.4   ,
           5.1   ],
        ...,
        [  4.    ,   0.73  ,   1.0304, ...,   2.3   ,  -5.9   ,
          -8.5   ],
        [ -3.2   ,   5.    ,   1.0114, ...,   4.2   ,   2.2   ,
           4.9   ],
        [  5.    ,   0.88  ,   1.0003, ...,   8.5   ,   6.    ,
          10.9   ]],

       [[  6.    ,   0.84  ,   1.018 , ...,   6.1   ,   3.3   ,
          10.1   ],
        [  6.    ,   0.84  ,   1.0172, ...,   2.9   ,   2.2   ,
           4.4   ],
        [  4.    ,   0.67  ,   1.017 , ...,   2.3   ,   1.4   ,
           3.1   ],
        ...,
        [  6.    ,   0.97  ,   1.0292, ...,   0.    ,  -9.5   ,
         -10.5   ],
        [ -8.5   ,   5.    ,   1.0114, ...,   4.    ,   3.    ,
           5.    ],
        [  7.    ,   0.91  ,   1.0007, ...,   8.

### Data Split

In [46]:
# Split data in training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [47]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(17212, 15, 9) (17212, 15)
(5738, 15, 9) (5738, 15)


### Create Keras Model

In [49]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten, Input

epochs = 10
batch_size = 32
n_hidden = 32

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

model = Sequential()
model.add(Input(shape=(timesteps, input_dim)))  # Define input shape here
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='softmax'))  # use 'softmax' for multiclass classification

In [50]:
model.summary()

### Compiling and Running

In [52]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [53]:
model.fit(X_train, y_train, batch_size = batch_size, epochs = epochs, verbose = 2)

Epoch 1/10
538/538 - 4s - 7ms/step - accuracy: 0.1077 - loss: 1095.0583
Epoch 2/10
538/538 - 2s - 3ms/step - accuracy: 0.1391 - loss: 10852.1816
Epoch 3/10
538/538 - 2s - 3ms/step - accuracy: 0.1385 - loss: 37417.6328
Epoch 4/10
538/538 - 2s - 3ms/step - accuracy: 0.1358 - loss: 84776.1797
Epoch 5/10
538/538 - 2s - 3ms/step - accuracy: 0.1375 - loss: 156598.0625
Epoch 6/10
538/538 - 2s - 3ms/step - accuracy: 0.1326 - loss: 250270.2812
Epoch 7/10
538/538 - 2s - 3ms/step - accuracy: 0.1311 - loss: 367640.2812
Epoch 8/10
538/538 - 2s - 3ms/step - accuracy: 0.1284 - loss: 533665.4375
Epoch 9/10
538/538 - 2s - 3ms/step - accuracy: 0.1240 - loss: 697336.0000
Epoch 10/10
538/538 - 2s - 4ms/step - accuracy: 0.1238 - loss: 905740.1250


<keras.src.callbacks.history.History at 0x118341f2450>

### Create Confusion Matrix

In [55]:
#Evaluate
print(classification_report(y_test, (model.predict(X_test) > 0.5).astype(int)))

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.02      0.00      0.00      1400
           1       0.06      0.00      0.00      1962
           2       0.44      0.44      0.44      1838
           3       0.17      0.04      0.06      1101
           4       0.00      0.00      0.00      1231
           5       0.00      0.00      0.00      1168
           6       0.00      0.00      0.00       923
           7       0.30      0.20      0.24      1543
           8       0.03      0.01      0.01      1176
           9       0.68      0.37      0.48      2570
          10       0.00      0.00      0.00      1192
          11       0.12      0.08      0.09       859
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00       972
          14       0.00      0.00      0.00       276

   micro avg       0.38      0.12      0.18     18211
   ma

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
