In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('../data/final/wrangled/normalized.csv', index_col='year')
df_diff = df.copy()
df = df[[c for c in df.columns if c[-5:] != '_diff']]

If Overfitting caused by extra _diff variables is the hypothesis for worse performance, what if we perform PCA on both _diff and no diff datasets and see how the same model architecture performs on both?

# df

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import altair as alt

# all X in df is already normalized

# Apply PCA
pca = PCA(n_components=2)  # You can change the number of components
df_pca = pca.fit_transform(df)

# Convert the result back to a DataFrame
df_pca = pd.DataFrame(df_pca, columns=['PC1', 'PC2'], index=df.index)
df_pca['instability'] = df_diff['instability']

# Create a scatter plot of the PCA results
scatter = alt.Chart(df_pca.reset_index()).mark_circle().encode(
    x='PC1',
    y='PC2',
    color=alt.Color('instability', scale=alt.Scale(scheme='redblue', reverse=True), label='Instability'),
    tooltip=['year', 'PC1', 'PC2', 'instability']
)

scatter

In [89]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


X = df_pca[['PC1', 'PC2']]
y = df_pca['instability']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the neural network model on the training set
model = MLPRegressor(hidden_layer_sizes=(8,16,8),max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predict the values for the training set
y_train_pred = model.predict(X_train)

# Predict the values for the test set
y_test_pred = model.predict(X_test)

# Calculate R² and MSE for the training set
train_r_squared = model.score(X_train, y_train)
train_mse = mean_squared_error(y_train, y_train_pred)

# Calculate R² and MSE for the training set
train_r_squared = model.score(X_train, y_train)
train_mse = mean_squared_error(y_train, y_train_pred)

# Calculate R² and MSE for the test set
test_r_squared = model.score(X_test, y_test)
test_mse = mean_squared_error(y_test, y_test_pred)

print(f'Neural Network Train R²: {train_r_squared}')
print(f'Neural Network Train MSE: {train_mse}')
print(f'Neural Network Test R²: {test_r_squared}')
print(f'Neural Network Test MSE: {test_mse}')

# Calculate R² and MSE for the entire dataset
y_pred = model.predict(X)
r_squared_full = r2_score(y, y_pred)
mse_full = mean_squared_error(y, y_pred)

print(f'Full Dataset R²: {r_squared_full}')
print(f'Full Dataset MSE: {mse_full}')


Neural Network Train R²: 0.9213866822153132
Neural Network Train MSE: 0.17760461125033844
Neural Network Test R²: 0.9110625297601492
Neural Network Test MSE: 0.1577296269378104
Full Dataset R²: 0.9196842829547742
Full Dataset MSE: 0.17360380271989448


In [90]:
df_pca.to_csv('../data/final/wrangled/pca.csv')

# df_diff

In [91]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import altair as alt

# all X in df is already normalized

# Apply PCA
pca = PCA(n_components=2)  # You can change the number of components
df_pca = pca.fit_transform(df_diff)

# Convert the result back to a DataFrame
df_pca = pd.DataFrame(df_pca, columns=['PC1', 'PC2'], index=df.index)
df_pca['instability'] = df_diff['instability']

# Create a scatter plot of the PCA results
scatter = alt.Chart(df_pca.reset_index()).mark_circle().encode(
    x='PC1',
    y='PC2',
    color=alt.Color('instability', scale=alt.Scale(scheme='redblue', reverse=True)),
    tooltip=['year', 'PC1', 'PC2', 'instability']
)

scatter

In [92]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


X = df_pca[['PC1', 'PC2']]
y = df_pca['instability']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the neural network model on the training set
model = MLPRegressor(hidden_layer_sizes=(8,16,8),max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predict the values for the training set
y_train_pred = model.predict(X_train)

# Predict the values for the test set
y_test_pred = model.predict(X_test)

# Calculate R² and MSE for the training set
train_r_squared = model.score(X_train, y_train)
train_mse = mean_squared_error(y_train, y_train_pred)

# Calculate R² and MSE for the training set
train_r_squared = model.score(X_train, y_train)
train_mse = mean_squared_error(y_train, y_train_pred)

# Calculate R² and MSE for the test set
test_r_squared = model.score(X_test, y_test)
test_mse = mean_squared_error(y_test, y_test_pred)

print(f'Neural Network Train R²: {train_r_squared}')
print(f'Neural Network Train MSE: {train_mse}')
print(f'Neural Network Test R²: {test_r_squared}')
print(f'Neural Network Test MSE: {test_mse}')

# Calculate R² and MSE for the entire dataset
y_pred = model.predict(X)
r_squared_full = r2_score(y, y_pred)
mse_full = mean_squared_error(y, y_pred)

print(f'Full Dataset R²: {r_squared_full}')
print(f'Full Dataset MSE: {mse_full}')


Neural Network Train R²: 0.9380004861781104
Neural Network Train MSE: 0.14007040868324228
Neural Network Test R²: 0.8659575842147939
Neural Network Test MSE: 0.23772275261063122
Full Dataset R²: 0.9261038960457608
Full Dataset MSE: 0.15972769869459977


It seems feature engineering is unnecessary for models that can handle non-linearities. From now on I will use the dataset without diff.