Importing the libraries used to load, process, and visualise the data

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

Loading the Meteo and Ozone datasets

In [None]:
# Read the meteorological observations (wind speed and temperature) from CSV
weather_data = pd.read_csv('meteo_data.csv')

# Read the ozone concentration measurements from CSV
ozone_data = pd.read_csv('1988_Ozone.csv')

Data Processing steps


In [None]:
# Parse the ozone 'Date' column into datetime values so the .dt accessor is available
ozone_data['Date'] = pd.to_datetime(ozone_data['Date'])
# Keep only ozone rows whose month is June (6) through August (8), inclusive
ozone_month = ozone_data['Date'].dt.month
summer_ozone_data = ozone_data[ozone_month.between(6, 8)]

# Parse the weather 'DATE' column into datetime values (note the upper-case column name)
weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])
# Keep only weather rows whose month is June (6) through August (8), inclusive
weather_month = weather_data['DATE'].dt.month
summer_weather_data = weather_data[weather_month.between(6, 8)]

In [None]:
# Join the summer ozone and weather rows on their date columns. The inner join
# keeps only dates present in both subsets; rows containing a NaN in any column
# are then dropped. Chaining .dropna() (rather than inplace=True) builds
# combined_data in one expression without mutating an intermediate frame.
# NOTE(review): dropna() inspects every column of the merged frame, not just
# the three used below — consider dropna(subset=[...]) if unrelated columns
# contain gaps.
combined_data = pd.merge(
    summer_ozone_data,
    summer_weather_data,
    left_on='Date',
    right_on='DATE',
    how='inner',
).dropna()

# Observed range of temperature and wind speed in the merged data
min_temp = combined_data['TEMP'].min()
max_temp = combined_data['TEMP'].max()
min_windspeed = combined_data['WDSP'].min()
max_windspeed = combined_data['WDSP'].max()

# Min-max scaling: map each feature onto [0, 1] using its observed range.
# NOTE(review): the ranges are computed over all rows of combined_data, i.e.
# before any train/test split, so held-out rows influence the scaling.
combined_data['TEMP_normalized'] = (combined_data['TEMP'] - min_temp) / (max_temp - min_temp)
combined_data['WDSP_normalized'] = (combined_data['WDSP'] - min_windspeed) / (max_windspeed - min_windspeed)

# Feature matrix: scaled wind speed and scaled temperature
X = combined_data[['WDSP_normalized', 'TEMP_normalized']]

# Target: ozone concentration
y = combined_data['Daily Max 8-hour Ozone Concentration']

In [None]:
# Hold out 10% of the rows as a test set; random_state=42 fixes the shuffle
# so the same split is produced on every run
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Wrap the target Series in single-column DataFrames. pd.DataFrame(...) is used
# instead of Series.to_frame() so that re-running this cell is harmless: on a
# second run train_y/test_y are already DataFrames, which have no .to_frame()
# method, whereas pd.DataFrame accepts both a Series and a DataFrame.
train_y = pd.DataFrame(train_y)
test_y = pd.DataFrame(test_y)

Injecting 10% outliers into the response variable of the training set

In [None]:
# Fraction of training rows whose response value is replaced by an outlier
outlier_proportion = 0.10

# Dedicated, seeded generator: without a seed the injected outliers (values and
# positions) would change on every run, making downstream results
# irreproducible.
outlier_rng = np.random.default_rng(42)

# Number of rows to corrupt (the fractional part is truncated by int())
num_outliers = int(len(train_y) * outlier_proportion)

# Outlier values drawn uniformly from [2, 6), generated directly as a
# (num_outliers, 1) column so they can be assigned into the 2-D copy below.
# NOTE(review): presumably chosen to lie far outside the observed ozone
# range — confirm against the units of the target column.
extreme_outliers_float = outlier_rng.uniform(low=2, high=6, size=(num_outliers, 1))

# Distinct row positions (sampling without replacement) that receive an outlier
random_indices = outlier_rng.choice(len(train_y), size=num_outliers, replace=False)

# np.copy returns an independent ndarray, so train_y itself stays untouched.
# Assumes train_y is a single-column frame, giving an (n, 1) array here.
train_y_outliers = np.copy(train_y)
train_y_outliers[random_indices] = extreme_outliers_float