In [78]:
import numpy as np
import pandas as pd

In [79]:
# Read the per-person table from disk; the path is relative to the notebook's
# working directory.
predictions_path = "Data/predictions.csv"
data_cleaned = pd.read_csv(predictions_path)

# Preview the first five rows (the default for head()).
data_cleaned.head(5)

Unnamed: 0.1,Unnamed: 0,Person,Current Age,Gender,State,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,TotalAmount,NumberOfTransactions,AverageTransactionAmount
0,0,Hazel Robinson,53,Female,CA,$29278,$59696,$127613,787,5,1620374.41,19936,81.278813
1,1,Sasha Sadr,53,Female,NY,$37891,$77254,$191349,701,5,716298.26,8886,80.609752
2,2,Saanvi Lee,81,Female,CA,$22681,$33483,$196,698,5,1471585.6,41934,35.092898
3,3,Everlee Clark,63,Female,NY,$163145,$249925,$202328,722,4,1183826.92,10097,117.245412
4,4,Kyle Peterson,43,Male,CA,$53797,$109687,$183855,675,1,1798746.67,18540,97.019777


In [80]:
# Build the modelling frame from the loaded table.
# "Person" is a free-text name. In the preview rows, AverageTransactionAmount
# equals TotalAmount / NumberOfTransactions, so keeping those two columns
# would hand the target to any model trained on this frame.
df = data_cleaned.drop(["TotalAmount", "NumberOfTransactions", "Person"], axis=1)

# The money columns arrive as strings such as "$29278". Strip the dollar sign
# as a literal character: regex=False is passed explicitly because "$" is a
# regex anchor and the default value of `regex` differs between pandas
# versions.
currency_columns = ['Total Debt', 'Per Capita Income - Zipcode', 'Yearly Income - Person']
for column in currency_columns:
    df[column] = df[column].str.replace('$', '', regex=False).astype(float)

# NOTE(review): the "Unnamed: 0.1" / "Unnamed: 0" columns are still present
# here; they look like saved row indices -- confirm and drop at load time if so.
df

Unnamed: 0.1,Unnamed: 0,Current Age,Gender,State,Per Capita Income - Zipcode,Yearly Income - Person,Total Debt,FICO Score,Num Credit Cards,AverageTransactionAmount
0,0,53,Female,CA,29278.0,59696.0,127613.0,787,5,81.278813
1,1,53,Female,NY,37891.0,77254.0,191349.0,701,5,80.609752
2,2,81,Female,CA,22681.0,33483.0,196.0,698,5,35.092898
3,3,63,Female,NY,163145.0,249925.0,202328.0,722,4,117.245412
4,4,43,Male,CA,53797.0,109687.0,183855.0,675,1,97.019777
...,...,...,...,...,...,...,...,...,...,...
1995,1995,32,Male,NY,23550.0,48010.0,87837.0,703,3,23.655819
1996,1996,62,Female,KY,24218.0,49378.0,104480.0,740,4,40.897092
1997,1997,47,Female,NJ,15175.0,30942.0,71066.0,779,3,32.546150
1998,1998,66,Male,PA,25336.0,54654.0,27241.0,618,1,58.956214


# 2. Decision Trees

In [81]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Baseline decision-tree regressor for AverageTransactionAmount:
# preprocess -> fit on an 80/20 split -> report MSE / RMSE / MAPE on the test set.

# Features and target variable
X = df.drop('AverageTransactionAmount', axis=1)
y = df['AverageTransactionAmount']

# List of numeric and categorical features. Columns of X that are not listed
# here are not passed to the model (ColumnTransformer defaults to
# remainder='drop').
numeric_features = ['Current Age', 'Per Capita Income - Zipcode', 'Yearly Income - Person', 'Total Debt', 'FICO Score', 'Num Credit Cards']
categorical_features = ['Gender', 'State']

# Preprocessing pipeline
numeric_transformer = StandardScaler()
# handle_unknown='ignore' keeps predict() from raising when the test split
# contains a category (e.g. a rare State) that never appeared in the training
# split; such rows are encoded as all zeros for that feature. This is the same
# handle_unknown setting the grid-search cells use.
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the decision tree model as part of a pipeline so the preprocessing
# is fitted on the training rows only.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# MAPE divides by the actual value, so it is undefined if any y_test is 0.
# NOTE(review): assumes strictly positive targets -- confirm.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f'Test Mean Squared Error (MSE): {mse:.2f}')
print(f'Test Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'Test Mean Absolute Percentage Error (MAPE): {mape:.2f}%')

Test Mean Squared Error (MSE): 409.90
Test Root Mean Squared Error (RMSE): 20.25
Test Mean Absolute Percentage Error (MAPE): 39.68%


In [82]:
# Show the first five test rows side by side: model output vs. ground truth.
print("\nSample Predictions:")
for row in range(5):
    predicted, actual = y_pred[row], y_test.iloc[row]
    print(f"Predicted: {predicted:.2f}, Actual: {actual:.2f}")

# Importance scores come from the fitted regressor at the end of the pipeline.
feature_importance = model.named_steps['regressor'].feature_importances_

# Column labels in the order the preprocessor emits them: the numeric columns
# first, then the one-hot columns generated by the fitted 'cat' encoder.
fitted_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
encoded_names = fitted_encoder.get_feature_names_out(categorical_features).tolist()
feature_names = [*numeric_features, *encoded_names]

# Pair every label with its score and order the pairs from most to least important.
feature_importance_sorted = list(zip(feature_names, feature_importance))
feature_importance_sorted.sort(key=lambda pair: pair[1], reverse=True)

print("\nFeature Importance:")
for name, score in feature_importance_sorted:
    print(f"{name}: {score:.4f}")


Sample Predictions:
Predicted: 44.19, Actual: 29.54
Predicted: 42.70, Actual: 25.14
Predicted: 20.16, Actual: 71.89
Predicted: 44.64, Actual: 81.82
Predicted: 77.78, Actual: 32.21

Feature Importance:
Per Capita Income - Zipcode: 0.3555
Total Debt: 0.1197
FICO Score: 0.1142
Yearly Income - Person: 0.1125
Current Age: 0.0842
Num Credit Cards: 0.0377
Gender_Male: 0.0151
State_NC: 0.0133
State_FL: 0.0106
State_MI: 0.0103
State_CA: 0.0086
State_PA: 0.0075
State_GA: 0.0070
State_OH: 0.0068
State_AL: 0.0062
State_AZ: 0.0062
State_TN: 0.0055
State_KS: 0.0053
State_NJ: 0.0053
State_IL: 0.0047
State_MD: 0.0047
State_WI: 0.0043
State_TX: 0.0043
State_CO: 0.0043
State_NY: 0.0034
State_IN: 0.0032
State_LA: 0.0031
State_AR: 0.0030
State_MA: 0.0028
State_ID: 0.0028
State_MS: 0.0026
State_KY: 0.0026
State_VA: 0.0024
State_WA: 0.0022
State_OR: 0.0020
State_MN: 0.0020
State_IA: 0.0019
State_SC: 0.0019
State_OK: 0.0019
State_DE: 0.0016
State_SD: 0.0014
State_DC: 0.0009
State_HI: 0.0009
State_WV: 0.0008

In [83]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Hyper-parameter search for the decision-tree regressor:
# 5-fold grid search on the training split, then a single evaluation of the
# best pipeline on the held-out test split.

# Features and target variable
X = df.drop('AverageTransactionAmount', axis=1)
y = df['AverageTransactionAmount']

# List of numeric and categorical features. Columns of X that are not listed
# here are not passed to the model (ColumnTransformer defaults to
# remainder='drop').
numeric_features = ['Current Age', 'Per Capita Income - Zipcode', 'Yearly Income - Person', 'Total Debt', 'FICO Score', 'Num Credit Cards']
categorical_features = ['Gender', 'State']

# Preprocessing pipeline
numeric_transformer = StandardScaler()
# handle_unknown='ignore' matters here: inside cross-validation a rare State
# can be missing from a training fold but present in its validation fold.
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')  # Ensure output is dense

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the decision tree model as part of a pipeline.
# random_state is fixed (same seed as the baseline tree) so that repeated runs
# of the search select the same parameters and report the same scores.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Define the parameter grid for GridSearchCV (4 * 3 * 3 = 36 candidates)
param_grid = {
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 10, 20],
    'regressor__min_samples_leaf': [1, 5, 10]
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search with cross-validation on the training split only
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score from the grid search.
# The scorer is negated MSE, so flip the sign to report a plain MSE.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score (MSE):", -grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
# MAPE divides by the actual value, so it is undefined if any y_test is 0.
# NOTE(review): assumes strictly positive targets -- confirm.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f'Test Mean Squared Error (MSE): {mse:.2f}')
print(f'Test Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'Test Mean Absolute Percentage Error (MAPE): {mape:.2f}%')


Fitting 5 folds for each of 36 candidates, totalling 180 fits




Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 10, 'regressor__min_samples_split': 10}
Best CV Score (MSE): 265.9896190459667
Test Mean Squared Error (MSE): 239.23
Test Root Mean Squared Error (RMSE): 15.47
Test Mean Absolute Percentage Error (MAPE): 33.50%




# 3. Random Forest

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Baseline random-forest regressor for AverageTransactionAmount:
# preprocess -> fit on an 80/20 split -> report MSE / RMSE / MAPE on the test set.

# Features and target variable
X = df.drop('AverageTransactionAmount', axis=1)
y = df['AverageTransactionAmount']

# List of numeric and categorical features. Columns of X that are not listed
# here are not passed to the model (ColumnTransformer defaults to
# remainder='drop').
numeric_features = ['Current Age', 'Per Capita Income - Zipcode', 'Yearly Income - Person', 'Total Debt', 'FICO Score', 'Num Credit Cards']
categorical_features = ['Gender', 'State']

# Preprocessing pipeline
numeric_transformer = StandardScaler()
# handle_unknown='ignore' keeps predict() from raising when the test split
# contains a category (e.g. a rare State) that never appeared in the training
# split; such rows are encoded as all zeros for that feature. This is the same
# handle_unknown setting the grid-search cells use.
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the random forest model as part of a pipeline so the preprocessing
# is fitted on the training rows only.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# MAPE divides by the actual value, so it is undefined if any y_test is 0.
# NOTE(review): assumes strictly positive targets -- confirm.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f'Test Mean Squared Error (MSE): {mse:.2f}')
print(f'Test Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'Test Mean Absolute Percentage Error (MAPE): {mape:.2f}%')



Test Mean Squared Error (MSE): 208.15
Test Root Mean Squared Error (RMSE): 14.43
Test Mean Absolute Percentage Error (MAPE): 31.90%


In [85]:
# Show the first five test rows side by side: model output vs. ground truth.
print("\nSample Predictions:")
for row in range(5):
    predicted, actual = y_pred[row], y_test.iloc[row]
    print(f"Predicted: {predicted:.2f}, Actual: {actual:.2f}")

# Importance scores come from the fitted regressor at the end of the pipeline.
feature_importance = model.named_steps['regressor'].feature_importances_

# Column labels in the order the preprocessor emits them: the numeric columns
# first, then the one-hot columns generated by the fitted 'cat' encoder.
fitted_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
encoded_names = fitted_encoder.get_feature_names_out(categorical_features).tolist()
feature_names = [*numeric_features, *encoded_names]

# Pair every label with its score and order the pairs from most to least important.
feature_importance_sorted = list(zip(feature_names, feature_importance))
feature_importance_sorted.sort(key=lambda pair: pair[1], reverse=True)

print("\nFeature Importance:")
for name, score in feature_importance_sorted:
    print(f"{name}: {score:.4f}")


Sample Predictions:
Predicted: 36.70, Actual: 29.54
Predicted: 37.31, Actual: 25.14
Predicted: 61.91, Actual: 71.89
Predicted: 49.94, Actual: 81.82
Predicted: 38.94, Actual: 32.21

Feature Importance:
Per Capita Income - Zipcode: 0.3537
Yearly Income - Person: 0.1219
FICO Score: 0.1144
Total Debt: 0.1032
Current Age: 0.0906
Num Credit Cards: 0.0424
Gender_Male: 0.0129
State_CA: 0.0108
State_MI: 0.0099
State_PA: 0.0094
State_NY: 0.0091
State_FL: 0.0080
State_TX: 0.0079
State_IL: 0.0076
State_NC: 0.0075
State_AZ: 0.0065
State_GA: 0.0064
State_WA: 0.0057
State_OH: 0.0054
State_CO: 0.0047
State_NJ: 0.0046
State_AL: 0.0044
State_VA: 0.0037
State_KS: 0.0035
State_MD: 0.0031
State_ID: 0.0028
State_IN: 0.0028
State_MO: 0.0026
State_TN: 0.0025
State_MN: 0.0024
State_NV: 0.0024
State_MA: 0.0024
State_WI: 0.0023
State_SC: 0.0021
State_LA: 0.0020
State_OR: 0.0018
State_SD: 0.0018
State_HI: 0.0016
State_AR: 0.0015
State_CT: 0.0014
State_KY: 0.0012
State_VT: 0.0012
State_MS: 0.0011
State_OK: 0.0011

In [86]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hyper-parameter search for the random-forest regressor:
# 5-fold grid search on the training split, then a single evaluation of the
# best pipeline on the held-out test split.

# Features and target variable
X = df.drop('AverageTransactionAmount', axis=1)
y = df['AverageTransactionAmount']

# List of numeric and categorical features. Columns of X that are not listed
# here are not passed to the model (ColumnTransformer defaults to
# remainder='drop').
numeric_features = ['Current Age', 'Per Capita Income - Zipcode', 'Yearly Income - Person', 'Total Debt', 'FICO Score', 'Num Credit Cards']
categorical_features = ['Gender', 'State']

# Preprocessing pipeline
numeric_transformer = StandardScaler()
# handle_unknown='ignore' matters here: inside cross-validation a rare State
# can be missing from a training fold but present in its validation fold.
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')  # Ensure output is dense

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the random forest model as part of a pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Define the parameter grid for GridSearchCV (2 * 4 * 3 * 3 = 72 candidates)
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 10, 20],
    'regressor__min_samples_leaf': [1, 5, 10]
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search with cross-validation on the training split only
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score from the grid search.
# The scorer is negated MSE, so flip the sign to report a plain MSE.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score (MSE):", -grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
# MAPE divides by the actual value, so it is undefined if any y_test is 0.
# NOTE(review): assumes strictly positive targets -- confirm.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f'Test Mean Squared Error (MSE): {mse:.2f}')
print(f'Test Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'Test Mean Absolute Percentage Error (MAPE): {mape:.2f}%')


Fitting 5 folds for each of 72 candidates, totalling 360 fits




Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 5, 'regressor__min_samples_split': 20, 'regressor__n_estimators': 200}
Best CV Score (MSE): 229.85019405442003
Test Mean Squared Error (MSE): 206.21
Test Root Mean Squared Error (RMSE): 14.36
Test Mean Absolute Percentage Error (MAPE): 31.49%


