In [None]:
#Imports
import glob
import os
import pandas as pd

In [None]:
#Global path
#Directory that holds the cleaned CSV inputs. Every read/write in this notebook is built as
#`pre_path + <file name>`, so the value has to end with a path separator.
#Set the EXTERNAL_INDICES_DATA_DIR environment variable to run the notebook on another machine;
#when it is unset (or empty) the original author's location is used, so existing behaviour is unchanged.
pre_path = os.environ.get("EXTERNAL_INDICES_DATA_DIR") or "C:\\Users\\millen11\\Dropbox\\PC\\Documents\\academia\\rpi\\classes\\fall23\\introToML\\introToMLapps\\project\\dataset\\cleaned\\"
if not pre_path.endswith(("\\", "/")):
  pre_path += os.sep #file names are appended by plain string concatenation

In [None]:
#Load the four cleaned external-facing index tables (read in the order ECPI, CCPI, EPSI, GSCPI)
ecpi_df, ccpi_df, epsi_df, gscpi_df = (
  pd.read_csv(pre_path + file_name)
  for file_name in ("ecpi_df.csv", "ccpi_df.csv", "epsi_df.csv", "gscpi_df.csv")
)

#ECPI and CCPI: keep only the (Date, Country) pairs present in both tables
ecpi_ccpi_df = ecpi_df.merge(ccpi_df, how = "inner", on = ["Date", "Country"])
display(ecpi_ccpi_df)

#Add EPSI with an outer join, so pairs found in only one of the two tables survive with NaNs
epsi_ecpi_ccpi_df = epsi_df.merge(ecpi_ccpi_df, how = "outer", on = ["Date", "Country"])

#GSCPI has no Country key, so it is attached by Date alone; dates without a GSCPI row are dropped
external_factors_df = epsi_ecpi_ccpi_df.merge(gscpi_df, how = "inner", on = ["Date"])
display(external_factors_df)
#Optional sanity check that GSCPI has no NaNs:
# display(external_factors_df.loc[external_factors_df["GSCPI"].isna()])

In [None]:
#Sidenote: Potential contextualization of missing data points
#Same outer join as above, rebuilt here so the rows that lack a CCPI value can be inspected on their own
external_indices = epsi_df.merge(ecpi_ccpi_df, how = "outer", on = ["Date", "Country"])
null_df = external_indices[external_indices["CCPI"].isnull()]
display(null_df)
#Distinct countries that have at least one row without CCPI
print(set(null_df["Country"]))

'''
Based on the above data, there are 7 countries that do not have ECPIs or CCPIs in 2010. Given this,
I believe a potential reason is that these countries wanted to withhold these consumer-facing economic indices
post-recession, potentially to save face. Additionally, many of these countries are often viewed as extremely strict
about how they're perceived and what sort of information they release. Again, there is no guarantee that either of these
is the reason for these NaNs; it is just a thought.
'''

In [None]:
'''
The prediction plan --
Process and thinking for dealing with missing values in EPSI, ECPI, and CCPI. Since I need these features to contextualize
the market, and I need data from 2010 to 2023 for as many countries as possible to maximize scope and generality, I must find a way to
deal with their NaNs to answer our original question (see README.md). So here's the plan:
  1. Deal with EPSI NaNs (O1)
    a. Fill CCPI and ECPI with future datapoints
    b. One-hot encode
    c. Predict EPSI
  2. Use the newly found EPSIs to predict the missing CCPI and ECPI values to try to get the most accurate results (O2)
    a. One-hot encode
    b. Predict CCPI
    c. Predict ECPI (May benefit from CCPI being filled first, since ECPI derives from CCPI)
  3. Repredict the EPSI values now that the roughly 1100 rows with missing CCPI/ECPI, about 6.4 percent of our data, are better represented (O3)
    a. One-hot encode
    b. Predict EPSI
    
Although this approach is admittedly unusual, I feel it is a good solution, since
  1. The 1139/17625 rows with missing CCPI/ECPI values are, by and large, a small share. However, I would like to minimize their effect on the EPSI,
  since the EPSI values from 2021 to 2023 have more insights to bring.
  2. Using a temporary EPSI that has minimal effect on these decade-old CCPI and ECPI values gives a more accurate version of what the actual
  EPSI would be like than the general mean, since environmental policy has been changing rapidly in recent years

NOTE:
  I did not realize that EPSI also has nulls before 2021. However, we will continue with the approach, since the EPSI from that period will not be AS important
'''

In [None]:
#Testing data: the rows whose EPSI is missing, i.e. the rows we want predicted
predict_df = external_factors_df[external_factors_df["EPSI"].isna()]
display(predict_df)
#Distinct countries that have at least one row without EPSI
print(set(predict_df["Country"]))

In [None]:
#Extrapolate 2021 to 2023 EPSI values
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

#Target: the EPSI column (still contains NaNs at this point)
y = external_factors_df["EPSI"]
#Features: every remaining column of the merged frame
X = external_factors_df.drop("EPSI", axis = 1)

In [None]:
#Find what model is best for task
#Temporarily fill CCPI and ECPI NaN's by forward fill: each NaN takes the value of the closest
#non-NaN row ABOVE it (the previous datapoint in row order, not the next one).
#NOTE(review): the prediction plan speaks of filling with "future datapoints", which would be a
#backward fill (bfill) -- confirm which direction is intended.
X_ffill = X.ffill(axis = 0) #DataFrame.ffill replaces the deprecated fillna(method = "ffill")
X_train, X_test, y_train, y_test = train_test_split(X_ffill, y, train_size = .7, random_state = 1)

#Perform one-hot encoding
X_train_encoded = pd.get_dummies(X_train, columns = ["Country", "Date"])
display(X_train_encoded)
#The two splits are encoded separately, so a Country/Date that lands in only one split would give the
#test frame a different column set than the one the models are fitted on. Re-indexing on the training
#columns adds any missing dummy as 0, drops dummies unseen in training and fixes the column order.
X_test_encoded = pd.get_dummies(X_test, columns = ["Country", "Date"]).reindex(columns = X_train_encoded.columns, fill_value = 0)


#Temporarily fill NaN EPSI values with the mean EPSI values
#The training mean is used for both splits so no information from the test targets leaks into the fill value
epsi_train_mean = y_train.mean()
y_train_fill = y_train.fillna(epsi_train_mean)
display(y_train_fill)
y_test_fill = y_test.fillna(epsi_train_mean)

In [None]:
#Linear regression
#Fit on the encoded training split (fit returns the estimator itself, so it can be chained)
lin_reg = LinearRegression().fit(X_train_encoded, y_train_fill)
display(X_train_encoded)
y_pred = lin_reg.predict(X_test_encoded)

#Score against the mean-filled test targets: mean squared error first, then R^2
mse = mean_squared_error(y_true = y_test_fill, y_pred = y_pred)
print(mse)

print(r2_score(y_true = y_test_fill, y_pred = y_pred))

In [None]:
#Viewing the autocorrelation of EPSI to view the seasonality
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf

#The autocorrelation has to be computed on the EPSI series itself. The previous version plotted
#`y_pred`, i.e. model predictions for a randomly shuffled test split, which carries no time ordering
#and is not EPSI, while the figure was titled "Autocorrelation of EPSI".
#EPSI is collapsed to one value per date (mean over all countries) to obtain a single time series.
#NOTE(review): assumes the "Date" values sort chronologically -- confirm.
Z = external_factors_df.set_index("Date")
epsi_by_date = Z["EPSI"].groupby(level = 0).mean().dropna() #dates without any observed EPSI are left out
figure, axis = plt.subplots(figsize = (12, 6))
#Never request more lags than the series can provide
plot_acf(epsi_by_date, lags = min(36, len(epsi_by_date) - 1), ax = axis)
plt.title("Autocorrelation of EPSI")
plt.show()


In [None]:
#Random Forest regressor
#100 trees, fixed seed so the comparison between models is repeatable
rfg = RandomForestRegressor(random_state = 1, n_estimators = 100)
rfg = rfg.fit(X_train_encoded, y_train_fill) #fit returns the same estimator

y_pred = rfg.predict(X_test_encoded)

#Score against the mean-filled test targets: mean squared error first, then R^2
mse = mean_squared_error(y_true = y_test_fill, y_pred = y_pred)
print(mse)

print(r2_score(y_true = y_test_fill, y_pred = y_pred))

In [None]:
#Gradient Boosting regressor
#100 boosting stages with a 0.1 learning rate, fixed seed so the comparison between models is repeatable
gbr = GradientBoostingRegressor(random_state = 1, learning_rate = 0.1, n_estimators = 100)
gbr = gbr.fit(X_train_encoded, y_train_fill) #fit returns the same estimator

y_pred = gbr.predict(X_test_encoded)

#Score against the mean-filled test targets: mean squared error first, then R^2
mse = mean_squared_error(y_true = y_test_fill, y_pred = y_pred)
print(mse)

print(r2_score(y_true = y_test_fill, y_pred = y_pred))

In [None]:
#MLP regressor
#Training is capped at 100 iterations, fixed seed so the comparison between models is repeatable
mlp = MLPRegressor(random_state = 1, max_iter = 100)
mlp = mlp.fit(X_train_encoded, y_train_fill) #fit returns the same estimator

y_pred = mlp.predict(X_test_encoded)

#Score against the mean-filled test targets: mean squared error first, then R^2
mse = mean_squared_error(y_true = y_test_fill, y_pred = y_pred)
print(mse)

print(r2_score(y_true = y_test_fill, y_pred = y_pred))

In [None]:
'''
Since Linear Regression has performed the best, we will use it as our NaN prediction model
'''

In [None]:
def predict_NaN(column_name:str, NaN_df: pd.DataFrame, model = None, feature_columns = None) -> pd.DataFrame: #Predicts NaN values of a given feature
  '''
  Return a copy of NaN_df in which the NaN entries of column_name are replaced by model predictions.

  The frame is one-hot encoded on "Country" and "Date", column_name is removed, and the result is
  aligned to feature_columns (missing columns are added as 0, extra columns are dropped, order is
  taken from feature_columns) before it is handed to model.predict.

  Parameters:
    column_name: column whose NaN entries are to be filled.
    NaN_df: rows to predict for; must contain "Country", "Date" and column_name.
    model: fitted estimator exposing predict(); defaults to the notebook-level lin_reg.
    feature_columns: columns, in order, that model was fitted on; defaults to X_train_encoded.columns.

  Returns:
    A new DataFrame with the same rows and columns as NaN_df; NaN_df itself is not modified.
    Entries of column_name that already hold a value are kept as they are.

  NOTE: the default lin_reg is fitted with EPSI as its target. Its output is always an EPSI estimate,
  whichever column_name is passed; to fill another column, pass a model fitted for that column.
  '''
  import warnings #only needed for the misuse warning below

  predicted_df = NaN_df.copy()
  if predicted_df.empty: #nothing to predict, and predict() would reject an empty input
    return predicted_df

  if model is None:
    model = lin_reg
  if feature_columns is None:
    feature_columns = X_train_encoded.columns
  feature_columns = list(feature_columns)

  if column_name in feature_columns:
    #The column to fill is itself an input of the model: it would be fed in as a constant 0 and the
    #model would still answer with its own target.
    warnings.warn(
      f"'{column_name}' is an input feature of the model; it is fed as a constant 0 and the returned "
      f"values are predictions of the model's own target, not of '{column_name}'. "
      f"Pass a model fitted for '{column_name}' through `model` and `feature_columns`.",
      stacklevel = 2,
    )

  encoded_predict_df = pd.get_dummies(NaN_df, columns = ["Country", "Date"])
  #A single reindex adds the dummies that are absent from this subset as 0 and selects/orders the training columns
  X_predict = encoded_predict_df.drop(columns = [column_name]).reindex(columns = feature_columns, fill_value = 0)
  y_pred = model.predict(X_predict)
  #One prediction exists per row; only those belonging to rows that are actually NaN are written back
  missing_mask = predicted_df[column_name].isna().to_numpy()
  predicted_df.loc[missing_mask, column_name] = y_pred[missing_mask]
  return predicted_df

In [None]:
def fill_main_df(column_name: str, predicted_df: pd.DataFrame, filling_df: pd.DataFrame) -> pd.DataFrame: #Fills primary dataframe with respective features missing values
  '''
  Write predicted values into the NaN entries of filling_df[column_name], by position.

  The i-th NaN entry of filling_df[column_name] (in row order) receives the i-th value of
  predicted_df[column_name]; index labels of predicted_df are ignored. Surplus rows of
  predicted_df are ignored.

  Parameters:
    column_name: column to fill; must exist in both frames.
    predicted_df: frame holding the replacement values in column_name.
    filling_df: frame to fill. It is modified IN PLACE and also returned.

  Returns:
    filling_df (the same object that was passed in).

  Raises:
    ValueError: if predicted_df has fewer rows than filling_df has NaN entries in column_name.
      The check runs before anything is written, so filling_df is left untouched in that case.
  '''
  #A positional boolean mask keeps the write correct even when index labels repeat
  missing_mask = filling_df[column_name].isna().to_numpy()
  n_missing = int(missing_mask.sum())
  predicted_values = predicted_df[column_name].to_numpy()
  if len(predicted_values) < n_missing:
    raise ValueError(
      f"predicted_df provides {len(predicted_values)} value(s) for '{column_name}' "
      f"but filling_df has {n_missing} missing value(s)"
    )
  if n_missing:
    filling_df.loc[missing_mask, column_name] = predicted_values[:n_missing]
  return filling_df
    

In [None]:
#O1
#Predict the missing EPSI values from TEMPORARILY forward-filled features (step 1a of the prediction plan).
#The raw rows in predict_df can still hold NaN CCPI/ECPI values, and scikit-learn's LinearRegression
#rejects NaN inputs, so the forward-filled features are used for the prediction only.
epsi_missing = external_factors_df["EPSI"].isna()
features_ffilled = external_factors_df.drop(columns = ["EPSI"]).ffill()
#Rows with missing EPSI, joined by index with their forward-filled feature values
predict_filled_df = external_factors_df.loc[epsi_missing, ["EPSI"]].join(features_ffilled)
predicted_epsi1_df = predict_NaN("EPSI", predict_filled_df) #Predicted EPSI based on temporary ECPI and CCPI values
# display(predicted_epsi1_df)

#The predictions are written into a copy of the ORIGINAL frame, so CCPI/ECPI keep their NaNs for O2
op_df = fill_main_df("EPSI", predicted_epsi1_df, external_factors_df.copy()) #Temporary version of predicted EPSI based on the prediction plan
# display(op_df)

In [None]:
#O2
#Put the original (still partly NaN) CCPI and ECPI columns next to the EPSI values predicted in O1
op2_df = external_factors_df[["CCPI", "ECPI"]]
# display(op2_df)
restored_df = op_df.drop(columns = ["CCPI", "ECPI"])
restored_df = pd.concat([restored_df, op2_df], axis = 1) #Dataframe with null CCPI and ECPI values and predicted EPSI values
# display(restored_df)

#Temporarily forward fill every column except CCPI, so ECPI can serve as an input feature
temp_df = restored_df.drop(columns = ["CCPI"])
temp_ecpi_df = temp_df.ffill(axis = 0) #DataFrame.ffill replaces the deprecated fillna(method = "ffill")
temp_ecpi_df = pd.concat([temp_ecpi_df, restored_df["CCPI"]], axis = 1) #the Series keeps its "CCPI" name, no rename needed
# display(temp_ecpi_df)

nan_CPI = temp_ecpi_df.loc[temp_ecpi_df["CCPI"].isna()]
display(nan_CPI)

#Fit a regression whose TARGET is CCPI. The shared lin_reg was fitted with EPSI as its target and CCPI
#among its inputs, so sending CCPI through predict_NaN stored EPSI estimates in the CCPI column.
#Training rows: those with a known CCPI. Inputs: predicted EPSI, temporary ECPI, the remaining columns
#and the Country/Date dummies. Encoding the whole frame once keeps the dummy columns identical for
#the training rows and the rows to predict.
ccpi_features = pd.get_dummies(temp_ecpi_df.drop(columns = ["CCPI"]), columns = ["Country", "Date"])
ccpi_known = temp_ecpi_df["CCPI"].notna()
predicted_cpi_df = nan_CPI.copy() #Predicted CPI values
if not nan_CPI.empty:
  ccpi_reg = LinearRegression()
  ccpi_reg.fit(ccpi_features.loc[ccpi_known], temp_ecpi_df.loc[ccpi_known, "CCPI"])
  predicted_cpi_df["CCPI"] = ccpi_reg.predict(ccpi_features.loc[~ccpi_known])

op_df = fill_main_df("CCPI", predicted_cpi_df, op_df)
display(op_df)

In [None]:
#O2 (continued): predict the missing ECPI values now that CCPI has been filled
op2_df = op_df.drop(columns = ["ECPI"])
restored_df = pd.concat([op2_df, op_df["ECPI"]], axis = 1)

nan_ECPI = restored_df.loc[restored_df["ECPI"].isna()]
# display(nan_ECPI)

#Fit a regression whose TARGET is ECPI. The shared lin_reg was fitted with EPSI as its target and ECPI
#among its inputs, so sending ECPI through predict_NaN stored EPSI estimates in the ECPI column.
#Training rows: those with a known ECPI. Inputs: every other column of op_df (EPSI and CCPI as filled
#so far) plus the Country/Date dummies, encoded once so both row sets share the same columns.
ecpi_features = pd.get_dummies(op2_df, columns = ["Country", "Date"])
ecpi_known = restored_df["ECPI"].notna()
predicted_ecpi_df = nan_ECPI.copy()
if not nan_ECPI.empty:
  ecpi_reg = LinearRegression()
  ecpi_reg.fit(ecpi_features.loc[ecpi_known], restored_df.loc[ecpi_known, "ECPI"])
  predicted_ecpi_df["ECPI"] = ecpi_reg.predict(ecpi_features.loc[~ecpi_known])

op_df = fill_main_df("ECPI", predicted_ecpi_df, op_df)
display(op_df)

In [None]:
#O3
#Swap the EPSI column of op_df for the EPSI column of external_factors_df, so the rows that
#are missing EPSI there are NaN again (the column ends up last)
op_df = pd.concat([op_df.drop(columns = ["EPSI"]), external_factors_df["EPSI"]], axis = 1)
nan_EPSI = op_df[op_df["EPSI"].isna()]
display(nan_EPSI)
#Predict EPSI again, this time from the CCPI/ECPI values filled in O2
predicted_epsi2_df = predict_NaN(column_name = "EPSI", NaN_df = nan_EPSI)

#The filled frame replaces external_factors_df from here on
external_factors_df = fill_main_df(column_name = "EPSI", predicted_df = predicted_epsi2_df, filling_df = op_df)
display(external_factors_df)

In [None]:
#Persist the completed frame next to the cleaned inputs (the index is written too, pandas' default)
external_factors_df.to_csv(path_or_buf = pre_path + "external_indic.csv")