# SVR for CPI prediction

In [140]:
#Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


In [165]:
def preprocess_data(data, lag):
    """Preprocesses the CPI data by removing unwanted columns, handling missing values, creating lag features, 
    and scaling the features. Additionally, returns the scaler used for future data transformation.
    
    Parameters:
        data (DataFrame): The CPI data.
        lag (int): The number of months to lag in the feature creation.
        
    Returns:
        DataFrame: The preprocessed and feature-engineered data.
        StandardScaler: The scaler used for feature scaling.
    """
    # Remove unwanted columns
    data.drop('12 Month MA / Current Inflation', axis=1, inplace=True)
    
    # Remove rows with missing values
    data.dropna(inplace=True)
    
    # Create a lagged CPI feature
    column_name = f'CPI {lag} Month{"s" if lag > 1 else ""} ahead'
    data[column_name] = data['CPI'].shift(-lag)
    
    # Remove the 'Year and Month' column
    data.drop('Year and Month', axis=1, inplace=True)
    
    # Scale the features
    scaler = StandardScaler()
    data.iloc[:, :-1] = scaler.fit_transform(data.iloc[:, :-1])
    
    return data, scaler

# Load the dataset
data = pd.read_csv('Monthly_data.csv')

#Lag - how many months ahead we want to predict
lag = 3

# Preprocess the data and save the scaler for later use
data_processed, scaler = preprocess_data(data, lag)

Train test split

In [161]:
# Split the data into training and testing sets randomly
X_train, X_test, y_train, y_test = train_test_split(
    data_processed.iloc[:-lag, :-1], data_processed.iloc[:-lag, -1], test_size=0.2)

# Initialize the SVR model with specific parameters
model = SVR(C=20, epsilon=0.05)

# Train the model using the training dataset
model.fit(X_train, y_train)

# Predict the CPI values for the training dataset
y_pred_train = model.predict(X_train)

# Predict the CPI values for the test dataset
y_pred_test = model.predict(X_test)

# Calculate and print the Mean Squared Error (MSE) for the training dataset
mse_train = mean_squared_error(y_train, y_pred_train)
print("Train Mean Squared Error (MSE):", mse_train)

# Calculate and print the Mean Squared Error (MSE) for the test dataset
mse_test = mean_squared_error(y_test, y_pred_test)
print("Test Mean Squared Error (MSE):", mse_test)


Train Mean Squared Error (MSE): 0.04759297366944496
Test Mean Squared Error (MSE): 0.2166349882897096


Monte Carlo Cross validation

In [None]:
# Initialize variables to accumulate MSEs
total_mse_train = 0
total_mse_test = 0

# Number of Monte Carlo simulations
n_simulations = 1000

for _ in range(n_simulations):
    # Split the data into training and testing sets randomly
    X_train, X_test, y_train, y_test = train_test_split(
        data_processed.iloc[:-lag, :-1], data_processed.iloc[:-lag, -1], test_size=0.2)
    
    # Initialize and train the SVR model
    model = SVR(C=20, epsilon=0.05)
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate MSE for the current split
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    
    # Accumulate MSEs
    total_mse_train += mse_train
    total_mse_test += mse_test

# Calculate the average MSE over all simulations
average_mse_train = total_mse_train / n_simulations
average_mse_test = total_mse_test / n_simulations

print("Average Train Mean Squared Error (MSE):", average_mse_train)
print("Average Test Mean Squared Error (MSE):", average_mse_test)


10-fold Cross validation

In [None]:
# Number of folds and repeats
n_folds = 10
n_repeats = 500  

# Creating the SVR model
svr_model = SVR(C=20, epsilon=0.05)

# Preparing the data
X = data_processed.iloc[:-lag, :-1]  # input features
y = data_processed.iloc[:-lag, -1]   # target variable

# Creating a RepeatedKFold object
rkf = RepeatedKFold(n_splits=n_folds, n_repeats=n_repeats)

# Calculating cross-validation scores using neg_mean_squared_error to maximize the score
cv_scores = cross_val_score(svr_model, X, y, cv=rkf, scoring='neg_mean_squared_error')

# Converting scores to positive MSE values
mse_scores = -cv_scores

# Calculating the average MSE and the 10th and 90th percentiles
average_mse = np.mean(mse_scores)
percentile_10 = np.percentile(mse_scores, 10)
percentile_90 = np.percentile(mse_scores, 90)

print(f"Average MSE: {average_mse:.4f}")
print(f"10th Percentile of MSE: {percentile_10:.4f}")
print(f"90th Percentile of MSE: {percentile_90:.4f}")


Train and Test Errors for Different C Values

In [None]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(
        data_processed.iloc[:-lag, :-1], data_processed.iloc[:-lag, -1], test_size=0.2)

# Range of C values
C_values = np.linspace(1, 50, 100)

# Initialize lists to store the errors
train_errors = []
test_errors = []

# Loop over the range of C values
for C in C_values:
    model = SVR(C=C, epsilon=0.05)
    model.fit(X_train, y_train)
    
    # Predict and calculate MSE for training and testing sets
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    train_errors.append(mean_squared_error(y_train, y_train_pred))
    test_errors.append(mean_squared_error(y_test, y_test_pred))

# Plotting the results
plt.figure(figsize=(10, 6))
plt.plot(C_values, train_errors, label='Train Error')
plt.plot(C_values, test_errors, label='Test Error')
plt.xlabel('C value')
plt.ylabel('Mean Squared Error')
plt.title('Train and Test MSE for Different C Values in SVR')
plt.legend()
plt.show()

Train and test errors for different C and epsilon values

In [None]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(
        data_processed.iloc[:-lag, :-1], data_processed.iloc[:-lag, -1], test_size=0.2)

# Define ranges for C and epsilon
C_range = np.linspace(1, 50, 10)
epsilon_range = np.linspace(0.01, 0.5, 10)
test_errors = np.zeros((len(C_range), len(epsilon_range)))

# Loop over the ranges of C and epsilon to train models and record the test error
for i, C in enumerate(C_range):
    for j, epsilon in enumerate(epsilon_range):
        model = SVR(C=C, epsilon=epsilon)
        model.fit(X_train, y_train)  # Using X_train without scaling as per your instruction
        y_pred = model.predict(X_test)
        test_errors[i, j] = mean_squared_error(y_test, y_pred)

# Generate the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(test_errors, xticklabels=np.round(epsilon_range, 2), yticklabels=np.round(C_range, 2), annot=True, fmt=".2f", cmap='viridis')
plt.title('Test MSE for Different Levels of C and Epsilon in SVR')
plt.xlabel('Epsilon')
plt.ylabel('C')
plt.show()

In [173]:
prediction_data = data_processed.iloc[-lag:, :-1]

# Initialize the SVR model with specific parameters
model = SVR(C=20, epsilon=0.05)

# Preparing the data
X = data_processed.iloc[:-lag, :-1]  # input features
y = data_processed.iloc[:-lag, -1]   # target variable

# Train the model using the training dataset
model.fit(X, y)

predicted_cpi = model.predict(prediction_data)

predicted_cpi_inverse = scaler.inverse_transform(predicted_cpi)

predicted_cpi_inverse




ValueError: Expected 2D array, got 1D array instead:
array=[1.90016639 1.55525751 1.90747421].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.