In [None]:
import sys
# Print the path of the Python interpreter this kernel is running on
# (a quick check that the expected environment is active).
print(sys.executable)

In [None]:
# dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import random
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# Shared colour scheme: every plot in this notebook draws from this palette.
palette = [
    '#C66F80',
    '#F4C7D0',
    '#FCEBF1',
    '#4A6644',
    '#9FAA74',
    '#D7DAB3',
    '#ECE3D2',
]
# Discrete colormap built from the palette, passed as `cmap`/`colormap` to the plots below.
customCmap = ListedColormap(palette)

In [None]:
# Load the personal-finance dataset from the working directory and preview the first rows.
data = pd.read_csv('personalFinanceDataset.csv')
data.head()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Drop the leftover index column written out by an earlier CSV export.
# errors='ignore' keeps this cell safe to re-run: once the column is gone a second
# execution is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the frame again after dropping the index column.
data.head()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# (rows, columns)
data.shape

In [None]:
# One-hot encode every categorical feature so it becomes numerical
# (needed for the correlation table and the models).
# Each dummy column is prefixed with its source column name. Without a prefix,
# two features that share a category label would produce identically named
# columns once the frames are concatenated onto `data`.
ohe_gender = pd.get_dummies(data.gender, prefix='gender')
ohe_raceEthnicity = pd.get_dummies(data.raceEthnicity, prefix='raceEthnicity')
ohe_education = pd.get_dummies(data.education, prefix='education')
ohe_region = pd.get_dummies(data.region, prefix='region')
ohe_householdType = pd.get_dummies(data.householdType, prefix='householdType')
ohe_career = pd.get_dummies(data.career, prefix='career')
ohe_workArrangement = pd.get_dummies(data.workArrangement, prefix='workArrangement')
ohe_housingStatus = pd.get_dummies(data.housingStatus, prefix='housingStatus')

In [None]:
# Author's note: this one-hot encoding clashes with the one applied later together
# with the synthetic data; the notebook will probably be reworked so the two align
# and are easier to go back and read.
oheFrames = [ohe_gender, ohe_career, ohe_education, ohe_householdType,
             ohe_housingStatus, ohe_raceEthnicity, ohe_region, ohe_workArrangement]

# Remove dummy columns left over from a previous run of this cell before appending
# them, so re-running does not keep stacking duplicate copies onto `data`.
# On a first run nothing matches and the drop is a no-op.
oheColumns = [col for frame in oheFrames for col in frame.columns]
data = pd.concat([data.drop(columns=oheColumns, errors='ignore'), *oheFrames], axis='columns')
data.head()

Writing down some things I want to explore:
 1. Correlations between the different categories
 - a more specific one: debt & discretionary spending
 2. Feature importance (which feature has the biggest impact on financial status)
 3. I think I wanted to learn the XGBoost model - I definitely want to build a model anyway, to simulate real people
 - I want to experiment with training on fewer features, because not everyone would have all these numbers ready to go
 4. Are there any instances that break the original proportions?
In [None]:
# Debt and discretionary-spending columns, in the order they should appear in the heatmap.
debtDiscretionaryCols = [
    'studentLoans',
    'entertainment',
    'hobbies',
    'travel',
    'gifts',
    'donations',
    'ccPayment',
    'personalLoans',
    'medicalDebt',
]
# Work on an independent copy so later edits never touch `data`.
df = data.loc[:, debtDiscretionaryCols].copy()
df.head()

In [None]:
# Pairwise correlations between the debt and discretionary-spending columns.
correlationMatrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlationMatrix,
    annot=True,
    fmt='.1f',
    cmap=customCmap,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix between Debt & Discretionary Spending')
fig.tight_layout()
plt.show()

In [None]:
# Frequency tables between pairs of categorical columns (rows = index, columns = columns).
crosstab01 = pd.crosstab(index=data['career'], columns=data['financialHealth'])
crosstab02 = pd.crosstab(index=data['financialHealth'], columns=data['householdType'])
crosstab03 = pd.crosstab(index=data['ageCategory'], columns=data['career'])
# Same pairing as crosstab01 with rows and columns swapped.
crosstab04 = pd.crosstab(index=data['financialHealth'], columns=data['career'])
crosstab04

In [None]:
# Stacked bars: one bar per financial-health class, segmented by household type.
# A title and axis labels are set so the figure can be read on its own, and
# plt.show() stops the Axes repr from being printed under the cell.
ax = crosstab02.plot(kind='bar', stacked=True, colormap=customCmap, figsize=(12, 6))
ax.set_title('Household Type by Financial Health')
ax.set_xlabel('financialHealth')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Class balance of the target variable.
ax = sns.countplot(x='financialHealth', data=data)
ax.set_title('Distribution of Financial Health')
# Rotate the class names so long labels do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

MODELING
1. XGBoost 2. KNN 3. GAN

In [None]:
# XGBoost is used here mainly to obtain feature-importance data.
# The target has to be label encoded first; LabelEncoder is meant for the
# target variable, not for the features.
le = LabelEncoder().fit(data['financialHealth'])
y = le.transform(data['financialHealth'])

In [None]:
# Summary of the columns that are still stored as strings (object dtype).
data.select_dtypes(include='object').info()

In [None]:
# Train/test split for the XGBoost model.
# Raw categorical columns and the target are removed from the feature matrix.
droppedCols = [
    'gender', 'raceEthnicity', 'education', 'region', 'householdType',
    'career', 'workArrangement', 'housingStatus', 'financialHealth',
    'ageCategory',
]
x = data.drop(columns=droppedCols)
# Keep the column labels: the arrays used for training below carry no names.
featureNames = x.columns
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Preview the feature matrix; showing only the first rows avoids dumping the whole frame.
x.head()

In [None]:
# Collapse the training labels into a one-dimensional array and confirm the resulting shape.
yTrain = yTrain.flatten()
yTrain.shape

In [None]:
# Convert the feature frames to float32 NumPy arrays and the labels to integers.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be re-run
# safely (the previous `.values` call failed once xTrain was already an array).
xTrain = np.asarray(xTrain, dtype='float32')
xTest = np.asarray(xTest, dtype='float32')
yTrain = yTrain.astype('int')

In [None]:
# Report how many samples ended up in each side of the split.
nTrain, nTest = len(xTrain), len(xTest)
print('Original dataset: %d training samples & %d testing samples\n' % (nTrain, nTest))

In [None]:
# XGBoost classifier with a fixed seed so results are repeatable.
xgbParams = {
    'random_state': 42,
    'n_estimators': 100,
    'max_depth': 4,
}
model = XGBClassifier(**xgbParams)
model.fit(xTrain, yTrain)

In [None]:
# Score the XGBoost model on the held-out test set.
yPred = model.predict(xTest)
xgbReport = classification_report(yTest, yPred)
print(xgbReport)

In [None]:
# Class names in encoded order: position i is the label that the encoder maps to integer i.
print(le.classes_)

In [None]:
# Confusion matrix (raw counts) for the XGBoost predictions.
cm = confusion_matrix(yTest, yPred)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap=customCmap,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title('Confusion Matrix', fontsize=16, fontweight='bold')
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
fig.tight_layout()

In [None]:
# Confusion matrix for the XGBoost predictions, each cell shown as a share of ALL test samples.
cm = confusion_matrix(yTest, yPred)
cmShare = cm / np.sum(cm)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cmShare,
    annot=True,
    fmt='.2%',
    cmap=customCmap,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title('Confusion Matrix', fontsize=16, fontweight='bold')
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
fig.tight_layout()

In [None]:
# feature importance: plot the ten highest-ranked features.
# NOTE(review): the model was fitted on bare NumPy arrays, so the axis labels are
# presumably XGBoost's generic f0, f1, ... names; the next cell maps them back to columns.
xgb.plot_importance(model, max_num_features=10)
plt.show()

In [None]:
# Feature importances keyed by the booster's own feature names (requested as 'weight').
importances = model.get_booster().get_score(importance_type='weight')

# The model was trained on unnamed arrays, so match "f<i>" back to the i-th training column.
feature_map = {f"f{i}": col for i, col in enumerate(featureNames)}

# Build the result as a Series rather than a dict keyed by column name: a dict
# silently keeps only one entry when two training columns share a label.
# (pandas is already imported at the top of the notebook; the repeated import was removed.)
named_importances = pd.Series(
    list(importances.values()),
    index=[feature_map.get(k, k) for k in importances],
    dtype='float64',
)

# Sort and view
named_importances.sort_values(ascending=False)

I want to build a KNN model and compare my synthetic data made with a Python script against synthetic data made with a GAN, looking at the similarities & differences in the XGBoost & KNN models.

In [None]:
# Train/test split for KNN.
# The number of features had to be reduced because of KNN dimensionality issues.
knnFeatures = [
    'monthsSaved', 'housingRatio', 'savingsInvestmentsRate', 'monthlyCashFlow',
    'debtToIncome', 'totalSavingsInvestments', 'carPaymentRatio', 'homeInsurance',
    'streaming', 'clothing', 'healthInsurance', 'alcohol', 'oopMedical',
    'coffee', 'ccPayment', 'hobbies', 'travel', 'householdSupplies',
]
x = data.loc[:, knnFeatures].copy()
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Standardise the features: fit on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(xTrain)
xTrain = scaler.transform(xTrain)
xTest = scaler.transform(xTest)

In [None]:
# Mean 5-fold cross-validated macro-F1 on the training split for each candidate k.
kRange = range(5, 10)
cvScores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        xTrain,
        yTrain,
        cv=5,
        scoring='f1_macro',
    ).mean()
    for k in kRange
]

In [None]:
# Plot the cross-validated score for every candidate k.
plt.plot(kRange, cvScores, marker='o')
plt.xlabel('k')
# The scores were computed with scoring='f1_macro', so label the axis as macro F1
# (the previous label called it accuracy).
plt.ylabel('Cross-Validated Macro F1')
plt.title('Optimal k for KNN')
plt.grid(True)
plt.show()

# k with the highest mean score (the first one wins on ties).
best_k = kRange[cvScores.index(max(cvScores))]
print(f"Best k: {best_k}")

In [None]:
# Fit the final KNN with the k selected by cross-validation above, rather than a
# hard-coded value that goes stale whenever the data or the search range changes.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(xTrain, yTrain)

In [None]:
# Predict on the scaled test split (this overwrites the XGBoost predictions held in yPred).
yPred = knn.predict(xTest)

In [None]:
# evaluate: per-class precision / recall / F1 for the KNN predictions
print(classification_report(yTest, yPred))

In [None]:
# Confusion matrix (raw counts) for the KNN predictions.
cm = confusion_matrix(yTest, yPred)

fig, ax = plt.subplots(figsize=(10, 8))
heatmapKwargs = dict(
    annot=True,
    fmt='d',
    cmap=customCmap,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
sns.heatmap(cm, ax=ax, **heatmapKwargs)
ax.set_title('Confusion Matrix', fontsize=16, fontweight='bold')
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
fig.tight_layout()

Train on real, test on GAN
Train on GAN, test on real

In [None]:
# Summary statistics of the original data, for comparison with the synthetic data below.
data.describe()

In [None]:
# Load the synthetic dataset from the working directory.
synData = pd.read_csv('synData.csv')

In [None]:
# Preview the first rows of the synthetic data.
synData.head()

In [None]:
# Summary statistics of the synthetic data.
synData.describe()

In [None]:
# Subset of synthetic columns to inspect for correlations.
synCorrCols = [
    'monthlyCashFlow',
    'debtToIncome',
    'housingCost',
    'personalCare',
    'phone',
    'groceries',
]
syndf = synData.loc[:, synCorrCols].copy()
syndf.head()

In [None]:
# Pairwise correlations of the selected synthetic columns.
# Note: `cm` holds a correlation matrix here, not a confusion matrix.
cm = syndf.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='.1f',
    cmap=customCmap,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# Quick look at the class balance of the target in the synthetic data.
ax = sns.countplot(x='financialHealth', data=synData)
ax.set_title('Distribution of Financial Health')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

COMPARING DATASETS MODELING 

In [None]:
# Names of the columns in `data` selected by the 'object' / 'boolean' dtype filter;
# used below as the list of columns to one-hot encode.
# NOTE(review): this presumably includes the string target `financialHealth` — confirm
# whether the target is meant to be one-hot encoded as well.
catCols = data.select_dtypes(include=['object', 'boolean']).columns.tolist()

In [None]:
# Display the selected column names.
catCols

In [None]:
# Diagnostics: run this first to see which column labels differ or repeat.
origCols = data.columns
synCols = synData.columns

# Labels that occur more than once in each frame (every occurrence after the first).
origDupes = origCols[origCols.duplicated()].tolist()
synDupes = synCols[synCols.duplicated()].tolist()

print("Original data columns:", origCols.tolist())
print("Synthetic data columns:", synCols.tolist())
print("\nDuplicate columns in original data:", origDupes)
print("Duplicate columns in synthetic data:", synDupes)

In [None]:
# Align the original and synthetic frames, then one-hot encode them TOGETHER so
# both end up with exactly the same dummy columns.
targetCol = 'financialHealth'

# Remove duplicate column labels, keeping the first occurrence of each.
dataClean = data.reset_index(drop=True).copy()
synDataclean = synData.reset_index(drop=True).copy()

dataClean = dataClean.loc[:, ~dataClean.columns.duplicated()]
synDataclean = synDataclean.loc[:, ~synDataclean.columns.duplicated()]

# verify both have the same columns
print('Data columns:', dataClean.columns.tolist())
print('Synthetic Data columns:', synDataclean.columns.tolist())

# if columns don't match, align them: keep only the columns present in both frames
commonCols = dataClean.columns.intersection(synDataclean.columns)
dataClean = dataClean[commonCols]
synDataclean = synDataclean[commonCols]

# concatenate for OHE
full = pd.concat([dataClean, synDataclean], ignore_index=True)

# Encode only the categorical columns that survived the alignment: `catCols` was
# computed before it and may name columns that no longer exist, which would raise
# KeyError. The target is excluded so it stays a single label column that can
# still be label encoded for modelling.
encodeCols = [col for col in catCols if col in full.columns and col != targetCol]

# OHE (skipped when there is nothing left to encode, e.g. on a re-run of this cell)
if encodeCols:
    dummies = pd.get_dummies(full, columns=encodeCols, prefix=encodeCols)
else:
    dummies = full.copy()

# split back into data & synData (original rows first, synthetic rows after)
data = dummies.iloc[:len(dataClean)].copy()
synData = dummies.iloc[len(dataClean):].copy()

In [None]:
# Preview the synthetic rows after the split back from the combined, encoded frame.
synData.head()

Need to add the one-hot-encoded features to the synthetic data.

In [None]:
# data split for the synthetic data
targetCol = 'financialHealth'
rawCatCols = ['gender', 'raceEthnicity', 'education', 'region', 'householdType',
              'career', 'workArrangement', 'housingStatus', 'ageCategory']

# The target may be present as the raw label column or, if the encoding step above
# also encoded it, as one-hot columns named "financialHealth_<class>".
targetDummyCols = [col for col in synData.columns if str(col).startswith(f'{targetCol}_')]

if targetCol in synData.columns:
    synLabels = synData[targetCol]
elif targetDummyCols:
    # Rebuild the label from the one-hot columns: take the column holding the 1
    # and strip the "financialHealth_" prefix.
    synLabels = synData[targetDummyCols].astype(int).idxmax(axis=1).str[len(targetCol) + 1:]
else:
    raise KeyError(f"'{targetCol}' (raw or one-hot encoded) not found in synData")

# Reuse the encoder fitted on the original data so class indices match `le.classes_`
# (this cell previously used `synY` without ever defining it).
synY = le.transform(synLabels)

# Features: everything except the target and any raw categorical columns still present.
# errors='ignore' because the raw columns are already gone once they were one-hot encoded.
synX = synData.drop(columns=rawCatCols + [targetCol] + targetDummyCols, errors='ignore')
synXtrain, synXtest, synYtrain, synYtest = train_test_split(synX, synY, test_size=0.2, random_state=42)

In [None]:
# need to ensure shape is proper for xg boost model:
# collapse the synthetic training labels to one dimension and confirm the shape
synYtrain = synYtrain.flatten()
synYtrain.shape

In [None]:
# Convert the synthetic feature frames to float32 NumPy arrays and the labels to integers.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be re-run
# safely (the previous `.values` call failed once the inputs were already arrays).
synXtrain = np.asarray(synXtrain, dtype='float32')
synXtest = np.asarray(synXtest, dtype='float32')
synYtrain = synYtrain.astype('int')

In [None]:
# XGBoost classifier trained on the synthetic data, with the same hyper-parameters
# as the model trained on the original data.
synXgbParams = {
    'random_state': 42,
    'n_estimators': 100,
    'max_depth': 4,
}
synModel = XGBClassifier(**synXgbParams)
synModel.fit(synXtrain, synYtrain)

In [None]:
# Evaluate the synthetic-data model on the synthetic test split.
synYpred = synModel.predict(synXtest)
synReport = classification_report(synYtest, synYpred)
print(synReport)

In [None]:
# Confusion matrix (raw counts) for the synthetic-data model on the synthetic test split.
cm = confusion_matrix(synYtest, synYpred)

fig, ax = plt.subplots(figsize=(10, 8))
classLabels = le.classes_
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap=customCmap,
    xticklabels=classLabels,
    yticklabels=classLabels,
    ax=ax,
)
ax.set_title('Confusion Matrix', fontsize=16, fontweight='bold')
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
fig.tight_layout()