# 1. Import Libraries


In [None]:
import pandas as pd
import numpy as np 

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# 2. Import data

In [None]:
# Read the train and test CSV files from the local playground-series-s3e20 folder.
train_path = "./playground-series-s3e20/train.csv"
test_path = "./playground-series-s3e20/test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Report (rows, columns) for the train frame first, then for the test frame.
for frame in (train_df, test_df):
    print(frame.shape)


# 3. EDA

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
train_df.info()

The data contains information about 7 main features: SO2, CO, NO2, HCHO, UV_Aerosol_Index, O3 and Cloud, plus additional features tied to the main ones.

## 3.1 Missing data identification

In [None]:
# Count the missing entries in every column of the training frame.
values = train_df.isnull().sum()

# One bar per column; the count is printed above each bar in SI-prefixed form.
fig = px.bar(
    values,
    title="Number of missing values in columns",
    text_auto='.2s',
    height=800,
    width=1300,
)
fig.update_traces(
    textposition="outside",
    textangle=0,
    textfont_size=12,
    cliponaxis=False,
)
fig.show()

Data is missing from almost all columns. In some columns only a small portion of the data is missing, whereas in others (e.g. the columns containing UV data) over 90% of the values are missing.

## 3.2 Emission data time series at a particular location

In [None]:
def date_from_week_year(row):
    """Map a record's ``year`` / ``week_no`` attributes to a calendar date.

    The anchor is the first date on or after 1 January of ``row.year`` whose
    ``datetime.weekday()`` equals 1 (Tuesday in Python's Monday=0 numbering).
    Week 1 maps to that anchor and each further week adds seven days, so
    week 0 lands seven days before the anchor.

    NOTE(review): the Tuesday anchor is kept exactly as found; whether it
    matches the dataset's own week convention should be confirmed.

    Parameters
    ----------
    row : object
        Anything exposing ``year`` and ``week_no`` attributes (e.g. a
        DataFrame row); both values are passed through ``int()``.

    Returns
    -------
    datetime.datetime
        The date assigned to the given week of the given year.
    """
    jan_first = datetime(int(row.year), 1, 1)

    # Days from 1 January forward to the anchor weekday (0..6).
    offset_to_anchor = (1 - jan_first.weekday()) % 7

    # Week 1 is the anchor itself, so only the weeks after it are added.
    weeks_after_anchor = int(row.week_no) - 1

    return jan_first + timedelta(days=offset_to_anchor, weeks=weeks_after_anchor)

In [None]:
# Build a per-row date column for plotting. .assign() and .drop() return new
# frames, so nothing is written into a slice of train_df (the former
# `df['date'] = ...` on a column subset triggers SettingWithCopyWarning) and
# re-running the cell always starts again from train_df.
df = (
    train_df[['year', 'week_no', 'emission', 'latitude', 'longitude']]
    .assign(date=lambda frame: frame.apply(date_from_week_year, axis=1))
    .drop(columns=['year', 'week_no'])
)

# Keep the rows of a single location. Coordinates are floats, so they are
# compared with a small absolute tolerance instead of exact equality.
at_location = (
    np.isclose(df['latitude'], -0.510, rtol=0, atol=1e-6)
    & np.isclose(df['longitude'], 29.290, rtol=0, atol=1e-6)
)
chart_df = df[at_location]

fig = px.line(chart_df, x='date', y='emission', title='Emission data over the time period from: (-0.51, 29.29) location')
fig.update_layout(
    template='plotly_dark'
)
fig.show()

## 3.3 Date features

In [None]:
# Number of non-null emission values recorded for each year.
data_count_by_year = train_df.groupby('year')['emission'].count()

# Plot from the grouped result itself so each bar's label and height come
# from the same row. Pairing `train_df['year'].unique()` (order of first
# appearance) with the groupby counts (sorted by key) can mislabel the bars
# whenever the rows are not already ordered by year.
fig = px.bar(
    data_count_by_year.reset_index(name='count'),
    x='year',
    y='count',
    title="Emission data per year distribution",
)
fig.update_layout(
    template='plotly_dark',
    xaxis=dict(
        tickmode='linear',
    )
)
fig.show()


In [None]:
# Number of non-null emission values recorded for each week number.
data_count_by_week = train_df.groupby('week_no')['emission'].count()

# Plot from the grouped result itself so each bar's label and height come
# from the same row. Pairing `train_df['week_no'].unique()` (order of first
# appearance) with the groupby counts (sorted by key) can mislabel the bars
# whenever the rows are not already ordered by week. The title previously
# said "per year" (copied from the yearly chart).
fig = px.bar(
    data_count_by_week.reset_index(name='count'),
    x='week_no',
    y='count',
    title="Emission data per week distribution",
)
fig.update_layout(
    template='plotly_dark',
    xaxis=dict(
        tickmode='linear',
    )
)
fig.show()

## 3.4 Correlation

In [None]:
# Top 25 correlated features 
# Top 20 columns by absolute correlation with the target. The list normally
# starts with 'emission' itself, since a column correlates perfectly with itself.
# numeric_only=True skips non-numeric columns (presumably the
# 'ID_LAT_LON_YEAR_WEEK' identifier); without it pandas >= 2.0 raises on them.
# The variable keeps its historical name `top25` because later cells use it.
top25 = (
    train_df.corr(numeric_only=True)['emission']
    .abs()
    .sort_values(ascending=False)
    .head(20)
)
top25

In [None]:
# Pairwise correlations restricted to the columns selected in `top25`.
corr = train_df[top25.index.tolist()].corr()

# Annotated heatmap with the colour scale centred on zero.
plt.figure(figsize=(13, 8))
heatmap_ax = sns.heatmap(corr, annot=True, center=0, cmap=plt.cm.CMRmap_r)
heatmap_ax.set_title('Correlation matrix', fontsize=15)
plt.show()

In [None]:
from unicodedata import numeric


def correlation_threshold(dataset, threshold=0.85):
    """Return the columns that are strongly correlated with an earlier column.

    The correlation matrix is computed over the numeric columns only. A column
    is reported when the absolute correlation between it and at least one
    column positioned before it in the matrix is strictly greater than
    ``threshold``; the first column of each such pair is never reported on
    account of that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose numeric columns are compared pairwise.
    threshold : float, default 0.85
        Cut-off applied to the absolute correlation coefficient.

    Returns
    -------
    set
        Names of the columns flagged as highly correlated.
    """
    abs_corr = dataset.corr(numeric_only=True).abs()

    # Row `pos`, columns [0, pos) is the strictly-lower-triangle part of that
    # row, i.e. the correlations with every column that comes before it.
    return {
        name
        for pos, name in enumerate(abs_corr.columns)
        if (abs_corr.iloc[pos, :pos] > threshold).any()
    }

In [None]:
# Keep the cut-off in one place so the call and the printed message cannot
# drift apart (the value was previously repeated inside the message text).
corr_cutoff = 0.9
correlated_features = correlation_threshold(train_df, corr_cutoff)
print(correlated_features)
print(f'There are {len(correlated_features)} correlated features with correlation coefficient over {corr_cutoff}')


There are 20 features that are highly correlated (absolute correlation coefficient above 0.9) with at least one other feature.

# 4. Modelling

In [None]:
# Features: every column except the row identifier and the target; missing
# values are replaced with 0.
X = train_df.drop(['ID_LAT_LON_YEAR_WEEK', 'emission'], axis=1).fillna(0)
y = train_df['emission']

# Random 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=41)

model = CatBoostRegressor(silent=True, random_state=41)
model.fit(X_train, y_train)

model_rf = RandomForestRegressor(random_state=41)
model_rf.fit(X_train, y_train)


# Making predictions
y_pred = model.predict(X_val)
y_pred_rf = model_rf.predict(X_val)

# Measuring the accuracy of the model.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
print(f'RMSE Score: {np.sqrt(mean_squared_error(y_val, y_pred))}')
print(f'RMSE Score RF: {np.sqrt(mean_squared_error(y_val, y_pred_rf))}')


# 5. Predictions and submission

In [None]:
# Make prediction on the test set
# Make prediction on the test set.
# Select exactly the columns (and column order) the models were trained on,
# so a missing or extra test column fails loudly here instead of inside
# predict(); missing values get the same zero-fill as the training data.
X_test = test_df[X.columns].fillna(0)
predictions = model_rf.predict(X_test)

# Create a submission file: one row per test ID with its predicted emission.
sub_file = pd.DataFrame({
    'ID_LAT_LON_YEAR_WEEK': test_df['ID_LAT_LON_YEAR_WEEK'],
    'emission': predictions,
})
sub_file.head()

In [None]:
# Write the submission frame to disk, leaving out the DataFrame index.
sub_file.to_csv(path_or_buf='submission.csv', index=False)