Data set based on weather observations from 18 different weather stations across Europe, which contain data ranging from the late 1800s to 2022. This data is collected by the European Climate Assessment & Data Set project.

## 1. Importing data and libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.preprocessing import StandardScaler

In [2]:
path = '/Users/mariazaremba/Documents/CareerFoundry/Machine Learning with Python'

In [3]:
df = pd.read_csv(os.path.join(path, 'Data', 'Original Data', 'Dataset-weather-prediction-dataset-processed.csv'), index_col = False)

In [4]:
df.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [5]:
df.reset_index(inplace=True)
df.rename(columns={'index':'id'}, inplace=True)
df.head()

Unnamed: 0,id,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22950 entries, 0 to 22949
Columns: 171 entries, id to VALENTIA_temp_max
dtypes: float64(145), int64(26)
memory usage: 29.9 MB


In [7]:
df.shape

(22950, 171)

# 2 Scaling

### 2.1 Subset data

In [8]:
# Identify columns to scale
columns = [col for col in df.columns if col not in ['id','DATE','MONTH']]

In [9]:
# Create a DataFrame (example data)
data = {col: range(1, 22951) for col in columns}
df_new = pd.DataFrame(data)

In [10]:
df_new.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
3,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
4,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5


In [11]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22950 entries, 0 to 22949
Columns: 168 entries, BASEL_cloud_cover to VALENTIA_temp_max
dtypes: int64(168)
memory usage: 29.4 MB


In [12]:
df_new.shape

(22950, 168)

### 2.2 Check for (and handle) missing values

In [13]:
missing_values = df_new.isnull().sum()

In [14]:
# Check if there are any missing values
if missing_values.sum() == 0:
    print("No missing data")
else:
    # Print the columns with missing values
    print("Columns with missing values:")
    print(missing_values[missing_values > 0])

No missing data


### 2.3 Scaling

In [15]:
# Initialize the scaler
scaler = StandardScaler()

In [16]:
# Fit and transform the data
scaled_data = scaler.fit_transform(df_new)

In [17]:
# Create DataFrame with scaled data
df_scaled = pd.DataFrame(scaled_data)

# Set the original column names
df_scaled.columns = df_new.columns
df_scaled.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,...,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975,-1.731975
1,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,...,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824,-1.731824
2,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,...,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673,-1.731673
3,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,...,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523,-1.731523
4,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,...,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372,-1.731372


In [18]:
# Compare the original vs. scaled data

df_new.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
3,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
4,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5


In [19]:
# Extract the DATE column from the original dataframe
date_column = df['DATE']

# Add the DATE column to the scaled dataframe
df_new['DATE'] = date_column

# Reorder the columns to have DATE as the first column (if needed)
df_new = df_new[['DATE'] + [col for col in df_new.columns if col != 'DATE']]

# Display the first few rows to verify
df_new.head()

Unnamed: 0,DATE,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,19600102,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,19600103,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
3,19600104,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
4,19600105,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5


In [22]:
df_new.shape

(22950, 169)

In [20]:
df_scaled.to_csv(os.path.join(path, 'Data','Prepared Data', 'Dataset-weather-prediction-dataset-processed_scaled.csv'))

In [21]:
df_new.to_csv(os.path.join(path, 'Data','Prepared Data', 'Dataset-weather-prediction-dataset-processed_subset.csv'))