# Starting Machine Learning Model

## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pycaret as pc
from pycaret.regression import setup, compare_models
import numpy as np
import seaborn as sns

## Reading the Data

In [None]:
def resetIndex(data):
    """Return a preview of ``data`` with its index turned into a column.

    ``reset_index`` is called without ``inplace``, so the frame passed in by
    the caller keeps its original index; only the returned preview (the
    first five rows, via ``head()``) carries the index as a regular column.
    """
    preview = data.reset_index().head()
    return preview

In [None]:
# Load FedFundsRate.xlsx; the first spreadsheet column becomes the index.
FedFundsRate = pd.read_excel(
    '../data/FedFundsRate.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(FedFundsRate)

In [None]:
# Load M2.xlsx; the first spreadsheet column becomes the index.
M2 = pd.read_excel(
    '../data/M2.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(M2)

In [None]:
# Load Median_CPI.xlsx; the first spreadsheet column becomes the index.
Median_CPI = pd.read_excel(
    '../data/Median_CPI.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(Median_CPI)

In [None]:
# Load StickyCPI_Less_Food_And_Energy.xlsx; the first spreadsheet column
# becomes the index.
Stick_CPI = pd.read_excel(
    '../data/StickyCPI_Less_Food_And_Energy.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(Stick_CPI)

In [None]:
# Load UnemploymentRate_25-34Y.xlsx; the first spreadsheet column becomes
# the index.
UnemploymentRate_25_34Y = pd.read_excel(
    '../data/UnemploymentRate_25-34Y.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(UnemploymentRate_25_34Y)

In [None]:
# Load UnemploymentRate_45-54Y.xlsx; the first spreadsheet column becomes
# the index.
UnemploymentRate_45_54Y = pd.read_excel(
    '../data/UnemploymentRate_45-54Y.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(UnemploymentRate_45_54Y)

In [None]:
# Load UnemploymentRate_55-64Y.xlsx; the first spreadsheet column becomes
# the index.
UnemploymentRate_55_64Y = pd.read_excel(
    '../data/UnemploymentRate_55-64Y.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(UnemploymentRate_55_64Y)

In [None]:
# Load UnemploymentRate_65-OverY.xlsx; the first spreadsheet column becomes
# the index.
UnemploymentRate_65_OverY = pd.read_excel(
    '../data/UnemploymentRate_65-OverY.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(UnemploymentRate_65_OverY)

In [None]:
# Load the commercial and industrial loans spreadsheet; the first
# spreadsheet column becomes the index.
ComercialAndIndustrialLoans = pd.read_excel(
    '../data/Commercial and Industrial Loans, All Commercial Banks, Percent Change at Annual Rate.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(ComercialAndIndustrialLoans)

In [None]:
# Load the federal debt spreadsheet; the first spreadsheet column becomes
# the index.
FederalDebt = pd.read_excel(
    '../data/Federal Debt Total Public Debt as Percent of Gross Domestic Product, Percent of GDP.xlsx',
    engine='openpyxl',
    index_col=0,
)
# Cell output: five-row preview with the index shown as a column.
resetIndex(FederalDebt)

In [None]:
# Upsample to one row per calendar day, carrying each observation forward
# until the next one, then move the date index into a regular column.
FederalDebt = FederalDebt.resample('D').ffill().reset_index()

# Cell output: first rows of the daily, forward-filled frame.
FederalDebt.head()


## Merge Data

In [None]:
def merge(*dfs, on='observation_date', how='inner'):
    """Merge two or more DataFrames pairwise, left to right, on a shared key.

    The first frame is merged with the second, that result with the third,
    and so on, so with the default inner join only key values present in
    every frame survive.

    Args:
        *dfs: The DataFrames to combine, in merge order. At least two.
        on: Column or index level name(s) to join on, forwarded to
            ``pd.merge``. Defaults to ``'observation_date'``.
        how: Join type forwarded to ``pd.merge``. Defaults to ``'inner'``.

    Returns:
        The DataFrame produced by the final pairwise merge.

    Raises:
        ValueError: If fewer than two DataFrames are supplied.
    """
    if len(dfs) < 2:
        raise ValueError("Insufficient number of DataFrames to merge. Please provide at least 2 DataFrames.")

    first, *remaining = dfs
    merged_df = first
    for df in remaining:
        merged_df = pd.merge(merged_df, df, on=on, how=how)

    return merged_df

In [None]:
# Combine every loaded series into one frame keyed on the observation date.
merged_df = merge(
    FedFundsRate,
    M2,
    Median_CPI,
    Stick_CPI,
    UnemploymentRate_25_34Y,
    UnemploymentRate_45_54Y,
    UnemploymentRate_55_64Y,
    UnemploymentRate_65_OverY,
    ComercialAndIndustrialLoans,
    FederalDebt,
)
# Cell output: five-row preview of the merged frame.
resetIndex(merged_df)

## Testing Models

In [None]:
# Persist the merged frame so later cells can reload it from disk.
merged_df.to_excel(excel_writer='../data/merged_df.xlsx')

In [None]:
# Reload the saved frame, then discard the date column and the column that
# reset_index() creates from the stored index, leaving only model inputs.
merged_df = (
    pd.read_excel('../data/merged_df.xlsx', index_col=0, engine='openpyxl')
    .reset_index()
    .drop(columns=['observation_date', 'index'])
)
merged_df.head()

### Finding the best model

In [None]:
# Initialise the PyCaret regression experiment with 'Median CPI' as target.
setup(data=merged_df, target='Median CPI')

# Keep whatever compare_models() returns for the candidate comparison.
best_model = compare_models()

### Importing scikit-learn Libraries

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel

### BayesianRidge

In [None]:
# Mean-impute every numeric column (the target included) before modelling.
num_cols = merged_df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(
    imputer.fit_transform(merged_df[num_cols]),
    columns=num_cols,
)

# Target is 'Median CPI'; every other numeric column is a feature.
y = df_imputed['Median CPI']
X = df_imputed.drop(columns='Median CPI')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.45, random_state=42
)

# The same three candidate values are tried for each of the four
# BayesianRidge hyperparameters.
param_grid = {
    name: [1e-6, 1e-5, 1e-4]
    for name in ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')
}

bayesian_ridge = BayesianRidge()
grid_search = GridSearchCV(
    estimator=bayesian_ridge,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Params: ", best_params)

In [None]:
# Evaluate BayesianRidge with the hyperparameters found by the grid search
# in the previous cell.

# Mean-impute every numeric column (the target included) before modelling.
imputer = SimpleImputer(strategy='mean')
num_cols = merged_df.select_dtypes(include=[np.number]).columns
df_imputed = pd.DataFrame(imputer.fit_transform(merged_df[num_cols]), columns=num_cols)

X = df_imputed.drop('Median CPI', axis=1)
y = df_imputed['Median CPI']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply the tuned hyperparameters; a bare BayesianRidge() here would discard
# the grid-search result. `best_params` must still hold the BayesianRidge
# search result, so run this cell right after the grid-search cell.
bayesian_ridge = BayesianRidge(**best_params)

bayesian_ridge.fit(X_train, y_train)

# For regressors the default score is R^2 (see scikit-learn docs), even
# though the printed label says "accuracy".
scores = cross_val_score(bayesian_ridge, X_train, y_train, cv=5)
print(f"Cross-validation accuracy: {scores.mean()} (+/- {scores.std() * 2})")

test_score = bayesian_ridge.score(X_test, y_test)
print(f"Test data accuracy: {test_score}")

# Prediction for the last row of the held-out split.
final_decision = bayesian_ridge.predict(X_test[-1:])
print("Final Decision: ", final_decision)

### Ridge

In [None]:
# Mean-impute every numeric column (the target included) before modelling.
num_cols = merged_df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(
    imputer.fit_transform(merged_df[num_cols]),
    columns=num_cols,
)

# Target is 'Median CPI'; every other numeric column is a feature.
y = df_imputed['Median CPI']
X = df_imputed.drop(columns='Median CPI')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42
)

# Regularisation strengths and solvers to try.
param_grid = {
    'alpha': [0.1, 1.0, 10.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}

ridge = Ridge()
grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Params: ", best_params)

In [None]:
# Ridge configured with the hyperparameters picked by the grid search.
ridge = Ridge(random_state=42, **best_params)

# Choose features with SelectFromModel on the training split, then apply
# the same column selection to both splits.
selector = SelectFromModel(estimator=ridge)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

ridge.fit(X_train_selected, y_train)

# Five-fold cross-validation on the reduced training features.
scores = cross_val_score(ridge, X_train_selected, y_train, cv=5)
print(f"Cross-validation accuracy: {scores.mean()} (+/- {scores.std() * 2})")

y_pred = ridge.predict(X_test_selected)

test_score = ridge.score(X_test_selected, y_test)
print(f"Test data accuracy: {test_score}")

# Prediction for the last row of the held-out split.
final_decision = ridge.predict(X_test_selected[-1:])
print("Final Decision: ", final_decision)

### Linear Regression

In [None]:
# Mean-impute every numeric column (the target included) before modelling.
num_cols = merged_df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(
    imputer.fit_transform(merged_df[num_cols]),
    columns=num_cols,
)

# Target is 'Median CPI'; every other numeric column is a feature.
y = df_imputed['Median CPI']
X = df_imputed.drop(columns='Median CPI')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42
)

# Each boolean option of LinearRegression is tried both ways.
param_grid = {
    option: [True, False]
    for option in ('fit_intercept', 'positive', 'copy_X')
}

linear_reg = LinearRegression()
grid_search = GridSearchCV(
    estimator=linear_reg,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Params: ", best_params)

In [None]:
# Evaluate LinearRegression with the options found by the grid search in
# the previous cell.

# Mean-impute every numeric column (the target included) before modelling.
imputer = SimpleImputer(strategy='mean')
num_cols = merged_df.select_dtypes(include=[np.number]).columns
df_imputed = pd.DataFrame(imputer.fit_transform(merged_df[num_cols]), columns=num_cols)

X = df_imputed.drop('Median CPI', axis=1)
y = df_imputed['Median CPI']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Apply the tuned options; a bare LinearRegression() here would discard the
# grid-search result. `best_params` must still hold the LinearRegression
# search result, so run this cell right after the grid-search cell.
linear_reg = LinearRegression(**best_params)

linear_reg.fit(X_train, y_train)

# For regressors the default score is R^2 (see scikit-learn docs), even
# though the printed label says "accuracy".
scores = cross_val_score(linear_reg, X_train, y_train, cv=5)
print(f"Cross-validation accuracy: {scores.mean()} (+/- {scores.std() * 2})")

test_score = linear_reg.score(X_test, y_test)
print(f"Test data accuracy: {test_score}")

# Prediction for the last row of the held-out split.
final_decision = linear_reg.predict(X_test[-1:])
print("Final Decision: ", final_decision)

### K Neighbors Regressor

In [None]:
# Mean-impute every numeric column (the target included) before modelling.
num_cols = merged_df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(
    imputer.fit_transform(merged_df[num_cols]),
    columns=num_cols,
)

# Target is 'Median CPI'; every other numeric column is a feature.
y = df_imputed['Median CPI']
X = df_imputed.drop(columns='Median CPI')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42
)

# Neighbour count, weighting, search algorithm, leaf size and Minkowski
# power to try.
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'p': [1, 2],
}

knn = KNeighborsRegressor()
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Params: ", best_params)

In [None]:
# Mean-impute every numeric column (the target included) before modelling.
num_cols = merged_df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(
    imputer.fit_transform(merged_df[num_cols]),
    columns=num_cols,
)

# Target is 'Median CPI'; every other numeric column is a feature.
y = df_imputed['Median CPI']
X = df_imputed.drop(columns='Median CPI')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# KNN configured with the hyperparameters picked by the grid search.
knn = KNeighborsRegressor(**best_params)
knn.fit(X_train, y_train)

# Five-fold cross-validation on the training split.
scores = cross_val_score(knn, X_train, y_train, cv=5)
print(f"Cross-validation accuracy: {scores.mean()} (+/- {scores.std() * 2})")

test_score = knn.score(X_test, y_test)
print(f"Test data accuracy: {test_score}")

# Prediction for the last row of the held-out split.
final_decision = knn.predict(X_test[-1:])
print("Final Decision: ", final_decision)

In [None]:
# Plot the held-out target values and mark the model's prediction for the
# last test row.
plt.figure(figsize=(10, 6))

# With a Series as `data`, seaborn uses the Series index as the x axis.
sns.lineplot(data=y_test, label='True Values', color='blue')
# `final_decision` is the prediction for the last test row, so place the
# marker at that row's index label. The positional count len(y_test) - 1 is
# not a label on this axis, because y_test keeps the shuffled row labels of
# the original frame.
plt.plot(y_test.index[-1], final_decision, 'ro', markersize=10, label='Final Decision')

plt.xlabel('Index')
plt.ylabel('Values')
plt.title('Final Decision vs True Values')

plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()

In [None]:
# Pairwise correlations between all columns of the merged frame.
correlation_matrix = merged_df.corr()

# Draw the annotated heatmap on an explicitly created 10x8 axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
)
ax.set_title('Correlation Matrix')

plt.show()

### Extra Trees Regressor

In [None]:
# Mean-impute every numeric column (the target included) before modelling.
num_cols = merged_df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(
    imputer.fit_transform(merged_df[num_cols]),
    columns=num_cols,
)

# Target is 'Median CPI'; every other numeric column is a feature.
y = df_imputed['Median CPI']
X = df_imputed.drop(columns='Median CPI')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Forest size, tree depth, split/leaf minimums and bootstrap setting to try.
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

et = ExtraTreesRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=et,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Params: ", best_params)

In [None]:
# Extra-trees model configured with the grid search's hyperparameters.
etr = ExtraTreesRegressor(random_state=42, **best_params)

# Choose features with SelectFromModel on the training split, then apply
# the same column selection to both splits.
selector = SelectFromModel(estimator=etr)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

etr.fit(X_train_selected, y_train)

# Five-fold cross-validation on the reduced training features.
scores = cross_val_score(etr, X_train_selected, y_train, cv=5)
print(f"Cross-validation accuracy: {scores.mean()} (+/- {scores.std() * 2})")

y_pred = etr.predict(X_test_selected)

test_score = etr.score(X_test_selected, y_test)
print(f"Test data accuracy: {test_score}")

# Prediction for the last row of the held-out split.
final_decision = etr.predict(X_test_selected[-1:])
print("Final Decision: ", final_decision)

In [None]:
# Plot the held-out target values and mark the model's prediction for the
# last test row.
plt.figure(figsize=(10, 6))

# With a Series as `data`, seaborn uses the Series index as the x axis.
sns.lineplot(data=y_test, label='True Values', color='blue')
# `final_decision` is the prediction for the last test row, so place the
# marker at that row's index label. The positional count len(y_test) - 1 is
# not a label on this axis, because y_test keeps the shuffled row labels of
# the original frame.
plt.plot(y_test.index[-1], final_decision, 'ro', markersize=10, label='Final Decision')

plt.xlabel('Index')
plt.ylabel('Values')
plt.title('Final Decision vs True Values')

plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()

In [None]:
# Pairwise correlations between all columns of the merged frame.
correlation_matrix = merged_df.corr()

# Draw the annotated heatmap on an explicitly created 10x8 axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
)
ax.set_title('Correlation Matrix')

plt.show()