In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the rwanda.src helpers) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from rwanda.src.utils import get_repo_root
from rwanda.src.impute import impute_data


In [None]:
# Load the train and test splits from <repo root>/data
repo_root = get_repo_root()
train = pd.read_csv(f'{repo_root}/data/train.csv')
test = pd.read_csv(f'{repo_root}/data/test.csv')

In [None]:
# Split the composite ID_LAT_LON_YEAR_WEEK key on '_' for both frames:
#   - the first three tokens are re-joined into ID_LAT_LON
#   - the fourth and fifth tokens are re-joined into year_week
for frame in (train, test):
    key_tokens = [key.split('_') for key in frame['ID_LAT_LON_YEAR_WEEK']]
    frame['ID_LAT_LON'] = ["_".join(tokens[:3]) for tokens in key_tokens]
    frame["year_week"] = ["_".join(tokens[3:5]) for tokens in key_tokens]

Imputation

In [None]:
# impute_data takes both splits and returns them as a (train, test) pair;
# presumably it fills missing values -- see rwanda.src.impute to confirm.
imputed_train, imputed_test = impute_data(train, test)
train, test = imputed_train, imputed_test

In [None]:
# Tag each row with its origin (1 = train, 0 = test) so the two splits can be
# separated again after being processed together.
for frame, flag in ((train, 1), (test, 0)):
    frame['is_train'] = flag

# Stack train on top of test, row-wise
concat_df = pd.concat((train, test), axis=0)

Sulphur

In [None]:
# Plot the correlations between all variables starting with Sulphur (train and test combined) -- currently disabled
#sns.heatmap(concat_df.filter(regex='^Sulphur').corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1)

In [None]:
# Replace the columns matching regex X with their first `comps` principal components
X = '^Sulphur'
comps = 6

# Standardize the matching columns before running PCA on them
matched = concat_df.filter(regex=X)
scaled = StandardScaler().fit_transform(matched)

# Fit the PCA model and project the standardized data onto its components
pca = PCA(n_components=comps)
pca.fit(scaled)
# Print the share of variance explained by each retained component
print(pca.explained_variance_ratio_)
components = pca.transform(scaled)

# Drop the original columns and append the components as Sulphur_1 .. Sulphur_6
concat_df = concat_df.drop(columns=matched.columns)
prefix = X[1:]  # strip the leading '^' regex anchor
for rank, scores in enumerate(components.T, start=1):
    concat_df[f'{prefix}_{rank}'] = scores



Nitrogen

In [None]:
# Plot the correlations between all variables starting with Nitrogen (train and test combined) -- currently disabled
# sns.heatmap(concat_df.filter(regex='^Nitrogen').corr(), cmap='coolwarm', annot=True, vmin=-1, vmax=1)

In [None]:
# Replace the columns matching regex X with their first `comps` principal components
X = '^Nitrogen'
comps = 4

# Standardize the matching columns before running PCA on them
matched = concat_df.filter(regex=X)
scaled = StandardScaler().fit_transform(matched)

# Fit the PCA model and project the standardized data onto its components
pca = PCA(n_components=comps)
pca.fit(scaled)
# Print the share of variance explained by each retained component
print(pca.explained_variance_ratio_)
components = pca.transform(scaled)

# Drop the original columns and append the components as Nitrogen_1 .. Nitrogen_4
concat_df = concat_df.drop(columns=matched.columns)
prefix = X[1:]  # strip the leading '^' regex anchor
for rank, scores in enumerate(components.T, start=1):
    concat_df[f'{prefix}_{rank}'] = scores



Formaldehyde

In [None]:
# Correlation heatmap of all Formaldehyde* columns (train and test combined)
formaldehyde_corr = concat_df.filter(regex='^Formaldehyde').corr()
sns.heatmap(formaldehyde_corr, vmin=-1, vmax=1, annot=True, cmap='coolwarm')

In [None]:
# Replace the columns matching regex X with their first `comps` principal components
X = '^Form'
comps = 5

# Standardize the matching columns before running PCA on them
matched = concat_df.filter(regex=X)
scaled = StandardScaler().fit_transform(matched)

# Fit the PCA model and project the standardized data onto its components
pca = PCA(n_components=comps)
pca.fit(scaled)
# Print the share of variance explained by each retained component
print(pca.explained_variance_ratio_)
components = pca.transform(scaled)

# Drop the original columns and append the components as Form_1 .. Form_5
concat_df = concat_df.drop(columns=matched.columns)
prefix = X[1:]  # strip the leading '^' regex anchor
for rank, scores in enumerate(components.T, start=1):
    concat_df[f'{prefix}_{rank}'] = scores

CarbonMonoxide

In [None]:
# Correlation heatmap of all Carbon* columns (train and test combined)
carbon_corr = concat_df.filter(regex='^Carbon').corr()
sns.heatmap(carbon_corr, vmin=-1, vmax=1, annot=True, cmap='coolwarm')

In [None]:
# Replace the columns matching regex X with their first `comps` principal components
X = '^Carbon'
comps = 4

# Standardize the matching columns before running PCA on them
matched = concat_df.filter(regex=X)
scaled = StandardScaler().fit_transform(matched)

# Fit the PCA model and project the standardized data onto its components
pca = PCA(n_components=comps)
pca.fit(scaled)
# Print the share of variance explained by each retained component
print(pca.explained_variance_ratio_)
components = pca.transform(scaled)

# Drop the original columns and append the components as Carbon_1 .. Carbon_4
concat_df = concat_df.drop(columns=matched.columns)
prefix = X[1:]  # strip the leading '^' regex anchor
for rank, scores in enumerate(components.T, start=1):
    concat_df[f'{prefix}_{rank}'] = scores

UvAerosolIndex

In [None]:
# Correlation heatmap of all Uv* columns (train and test combined)
uv_corr = concat_df.filter(regex='^Uv').corr()
sns.heatmap(uv_corr, vmin=-1, vmax=1, annot=True, cmap='coolwarm')

In [None]:
# Replace the columns matching regex X with their first `comps` principal components
X = '^Uv'
comps = 3

# Standardize the matching columns before running PCA on them
matched = concat_df.filter(regex=X)
scaled = StandardScaler().fit_transform(matched)

# Fit the PCA model and project the standardized data onto its components
pca = PCA(n_components=comps)
pca.fit(scaled)
# Print the share of variance explained by each retained component
print(pca.explained_variance_ratio_)
components = pca.transform(scaled)

# Drop the original columns and append the components as Uv_1 .. Uv_3
concat_df = concat_df.drop(columns=matched.columns)
prefix = X[1:]  # strip the leading '^' regex anchor
for rank, scores in enumerate(components.T, start=1):
    concat_df[f'{prefix}_{rank}'] = scores

Ozone

In [None]:
# Correlation heatmap of all Ozone* columns (train and test combined)
ozone_corr = concat_df.filter(regex='^Ozone').corr()
sns.heatmap(ozone_corr, vmin=-1, vmax=1, annot=True, cmap='coolwarm')

In [None]:
# Replace the columns matching regex X with their first `comps` principal components
X = '^Ozone'
comps = 4

# Standardize the matching columns before running PCA on them
matched = concat_df.filter(regex=X)
scaled = StandardScaler().fit_transform(matched)

# Fit the PCA model and project the standardized data onto its components
pca = PCA(n_components=comps)
pca.fit(scaled)
# Print the share of variance explained by each retained component
print(pca.explained_variance_ratio_)
components = pca.transform(scaled)

# Drop the original columns and append the components as Ozone_1 .. Ozone_4
concat_df = concat_df.drop(columns=matched.columns)
prefix = X[1:]  # strip the leading '^' regex anchor
for rank, scores in enumerate(components.T, start=1):
    concat_df[f'{prefix}_{rank}'] = scores

Cloud

In [None]:
# Correlation heatmap of all Cloud* columns (train and test combined)
cloud_corr = concat_df.filter(regex='^Cloud').corr()
sns.heatmap(cloud_corr, vmin=-1, vmax=1, annot=True, cmap='coolwarm')

In [None]:
# Replace the columns matching regex X with their first `comps` principal components
X = '^Cloud'
comps = 3

# Standardize the matching columns before running PCA on them
matched = concat_df.filter(regex=X)
scaled = StandardScaler().fit_transform(matched)

# Fit the PCA model and project the standardized data onto its components
pca = PCA(n_components=comps)
pca.fit(scaled)
# Print the share of variance explained by each retained component
print(pca.explained_variance_ratio_)
components = pca.transform(scaled)

# Drop the original columns and append the components as Cloud_1 .. Cloud_3
concat_df = concat_df.drop(columns=matched.columns)
prefix = X[1:]  # strip the leading '^' regex anchor
for rank, scores in enumerate(components.T, start=1):
    concat_df[f'{prefix}_{rank}'] = scores