In [2]:
# Dependencies
import pandas as pd
import numpy as np

In [3]:
# Load data (FRED-MD)
df = pd.read_csv(
    '/home/js/macroeconvue/nowcasting/current.csv',
    index_col='sasdate'
    )
# Drop target variable (CPIAUCSL)
target = df['CPIAUCSL']
df = df.drop(columns=['CPIAUCSL'])

In [4]:
# Make the series stationary
def transform_series(series, code):
    if code == 1:
        return series  # No transformation
    elif code == 2:
        return series.diff().dropna()  # First difference
    elif code == 3:
        return series.diff().diff().dropna()  # Second difference
    elif code == 4:
        return np.log(series).dropna()  # Logarithm
    elif code == 5:
        return np.log(series).diff().dropna()  # First difference of logarithm
    elif code == 6:
        return np.log(series).diff().diff().dropna()  # Second difference of logarithm
    elif code == 7:
        return series.pct_change().dropna()  # Percentage change
    else:
        raise ValueError(f"Unknown transformation code: {code}")

transformed_data = {}
transformation_codes = df.iloc[0]  # Assuming the first row contains the codes
data = df.iloc[1:]  # The actual data starts from the second row

for column in data.columns:
    code = transformation_codes[column]
    transformed_data[column] = transform_series(data[column], code)

df = pd.DataFrame(transformed_data).dropna(how='all')  # Drop rows with all NaN values

In [5]:
from sklearn.preprocessing import StandardScaler

# Standardize the data
df = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns, index=df.index)
df = df.dropna()

In [6]:
# Apply PCA fto look for the number of components to retain
from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# pca = PCA()
# pca.fit(df)

# cumulative_variance = pca.explained_variance_ratio_.cumsum()

# # Plot cumulative explained variance
# plt.figure(figsize=(10, 6))
# plt.plot(cumulative_variance, marker='o', linestyle='--')
# plt.xlabel('Number of Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.title('Explained Variance by Number of Components')
# plt.grid(True)
# plt.show()


In [7]:
# Apply PCA to keep 90% of variance
pca = PCA(n_components=0.90)
pca.fit(df)
data = pca.transform(df)
print(f"PCA len: {len(data)}, Original: {len(df)}")

# Look at relationship between original features nad cleanred_df
loadings = pca.components_
loadings_df = pd.DataFrame(loadings.T, index=df.columns, columns=[f'PC{i+1}' for i in range(loadings.shape[0])])
display(loadings_df)

PCA len: 390, Original: 390


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40
RPI,0.034481,-0.081739,-0.068953,-0.092153,0.009781,-0.273636,-0.289818,0.398063,-0.132497,-0.177616,...,-0.071687,-0.005041,-0.003556,-0.051049,0.008884,0.072719,-0.054323,0.024239,0.049147,-0.039312
W875RX1,0.043448,-0.117144,-0.074420,-0.049564,-0.085013,-0.106425,-0.033869,0.130573,-0.013163,0.036071,...,0.309834,-0.047977,0.351154,-0.098933,0.179590,-0.024453,0.067097,-0.105211,-0.143090,-0.060521
DPCERA3M086SBEA,0.122792,-0.063226,-0.098465,0.045354,0.023704,0.010595,-0.070860,0.054086,-0.045489,0.000375,...,-0.099707,-0.034419,-0.024622,-0.045716,-0.109427,0.004947,0.148523,-0.014004,-0.093088,0.080141
CMRMTSPLx,0.096000,-0.072448,-0.100212,0.037263,0.001539,-0.036803,-0.002458,0.029059,-0.040573,-0.023109,...,-0.081770,-0.010774,-0.014119,-0.009034,-0.097109,0.098720,0.032700,-0.078907,0.003814,0.065193
RETAILx,0.140828,-0.029037,-0.079120,0.006845,0.015920,-0.001387,-0.103307,0.101696,-0.048474,-0.019287,...,-0.087943,-0.050115,-0.028114,-0.079790,-0.083041,0.074133,0.169641,-0.062955,-0.105989,0.124353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UMCSENTx,0.031067,-0.006650,-0.044416,0.028852,-0.058411,0.114273,-0.151772,0.091813,0.027816,0.097119,...,0.362074,-0.027771,-0.287264,0.412850,-0.305325,0.155843,-0.113642,0.073663,0.144311,0.235216
DTCOLNVHFNM,0.016934,0.005823,-0.016231,0.074108,0.210359,0.232328,0.152782,0.341158,0.016444,-0.081195,...,0.063754,-0.017200,0.029651,0.112404,0.019135,-0.217275,0.207222,-0.080242,0.396246,-0.266723
DTCTHFNM,0.016773,0.009038,-0.007257,0.021044,0.276215,0.248237,0.246983,0.447468,0.124558,-0.032936,...,-0.039265,0.031558,0.001580,-0.064861,0.029045,0.085820,-0.091886,0.086665,-0.098595,0.124691
INVEST,0.011748,0.018594,0.004939,-0.006219,0.003708,-0.179617,0.026435,-0.033200,0.013989,0.133150,...,-0.138504,-0.079306,0.125632,-0.197202,-0.239922,-0.208509,0.214371,0.063920,0.002685,-0.044721


In [8]:
# Prepare data for the LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Convert data into sequences
def create_sequences(X, y, time_steps=10):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:i+time_steps])
        ys.append(y[i+time_steps])
    return np.array(Xs), np.array(ys)

time_steps = 10  # Choose based on your data
X, y = create_sequences(data, target.values, time_steps)

# Split into train and test
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]


2025-03-21 12:52:20.733394: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-21 12:52:20.763201: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 12:52:20.955514: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-21 12:52:21.153398: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742575941.318813     821 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742575941.37

In [None]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2

# Build the LSTM model
model = Sequential([
    LSTM(50, activation='relu', return_sequences=True, kernel_regularizer=l2(0.01), input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(50, activation='relu', return_sequences=False, kernel_regularizer=l2(0.01)),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)
])

# Compile and train
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Evaluate the model
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

NameError: name 'l2' is not defined

In [13]:
# Evaluate performance
from sklearn.metrics import mean_squared_error
train_mse = mean_squared_error(y_train, train_predict)
test_mse = mean_squared_error(y_test, test_predict)
print(f"Train MSE: {train_mse}")
print(f"Test MSE: {test_mse}")
# Inverse transform to get actual values
# train_predict = target.iloc[:split].values + train_predict.flatten()
# test_predict = target.iloc[split:].values + test_predict.flatten()
# # Plot the results
# import matplotlib.pyplot as plt
# plt.figure(figsize=(14, 7)) 
# plt.plot(target.index[:split], target.iloc[:split], label='Train Actual', color='blue')
# plt.plot(target.index[split:], target.iloc[split:], label='Test Actual', color='orange')
# plt.plot(target.index[:split], train_predict, label='Train Predicted', color='green')
# plt.plot(target.index[split:], test_predict, label='Test Predicted', color='red')
# plt.title('LSTM Model Predictions vs Actual')
# plt.xlabel('Date')
# plt.ylabel('CPIAUCSL')
# plt.legend()
# plt.show()

Train MSE: 861.3079092023606
Test MSE: 7619.428151310371
