# <center>Model Building</center>

In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from xgboost import XGBClassifier

In [2]:
# Load the dataset from a CSV file; the path is relative to the notebook's working directory.
df = pd.read_csv("final_dataset.csv")

In [3]:
# Preview the first rows of the frame as loaded, before any encoding.
df.head()

Unnamed: 0,Income,Age,Occupation,City_Tier,Loan_Repayment,Fixed Expenses,Investment,Savings,Shopping_Frequency,Hobbies,Class
0,44637.0,49,Self_Employed,Tier_1,0.0,33371.621929,5918.45,5347.18,8,"Cooking, Baking",Neutral
1,26859.0,34,Retired,Tier_2,0.0,17181.777859,2021.16,7655.66,4,"Cooking, Baking",Saver
2,50368.0,35,Student,Tier_3,4612.1,31864.051073,6762.07,7129.38,7,"Video Games, Baking",Neutral
3,101456.0,21,Self_Employed,Tier_3,6809.44,63028.205206,15974.01,15643.94,15,"Baking, Cooking",Spender
4,24875.0,52,Professional,Tier_2,3112.61,15496.973617,1874.66,4391.04,4,"Gardening, Cooking",Neutral


In [4]:
# One independent LabelEncoder per categorical column, so each keeps its own
# fitted class list.
le_occupation, le_city, le_class = (LabelEncoder() for _ in range(3))

In [5]:
# Replace each categorical column with the integer codes produced by its encoder.
# The columns are overwritten in place in `df`.
for column, encoder in (
    ('Occupation', le_occupation),
    ('City_Tier', le_city),
    ('Class', le_class),
):
    df[column] = encoder.fit_transform(df[column])

In [6]:
# Preview the frame after label encoding Occupation, City_Tier and Class.
df.head()

Unnamed: 0,Income,Age,Occupation,City_Tier,Loan_Repayment,Fixed Expenses,Investment,Savings,Shopping_Frequency,Hobbies,Class
0,44637.0,49,2,0,0.0,33371.621929,5918.45,5347.18,8,"Cooking, Baking",0
1,26859.0,34,1,1,0.0,17181.777859,2021.16,7655.66,4,"Cooking, Baking",1
2,50368.0,35,3,2,4612.1,31864.051073,6762.07,7129.38,7,"Video Games, Baking",0
3,101456.0,21,2,2,6809.44,63028.205206,15974.01,15643.94,15,"Baking, Cooking",2
4,24875.0,52,0,1,3112.61,15496.973617,1874.66,4391.04,4,"Gardening, Cooking",0


In [7]:
# Turn the comma-separated hobby string into a list of hobby names.
# Surrounding whitespace is not removed by this step.
df['Hobbies'] = df['Hobbies'].str.split(pat=',')

In [8]:
# Inspect the split result; list items may still carry leading spaces.
df["Hobbies"].head()

0        [Cooking,  Baking]
1        [Cooking,  Baking]
2    [Video Games,  Baking]
3        [Baking,  Cooking]
4     [Gardening,  Cooking]
Name: Hobbies, dtype: object

In [9]:
# Trim whitespace around every hobby name so that e.g. " Baking" and "Baking"
# are treated as the same label.
df['Hobbies'] = df['Hobbies'].apply(lambda hobbies: [name.strip() for name in hobbies])

In [10]:
# Multi-hot encode the hobby lists: one 0/1 column per distinct hobby name.
mlb = MultiLabelBinarizer()
hobby_matrix = mlb.fit_transform(df['Hobbies'])
# Reuse df's index so the encoded columns line up row-for-row with df.
hobby_encoded = pd.DataFrame(hobby_matrix, index=df.index, columns=mlb.classes_)

In [11]:
# Swap the list-valued Hobbies column for its multi-hot encoded columns.
df_without_hobbies = df.drop(columns=['Hobbies'])
df = pd.concat([df_without_hobbies, hobby_encoded], axis=1)

In [12]:
# Preview the frame with the hobby indicator columns appended.
df.head()

Unnamed: 0,Income,Age,Occupation,City_Tier,Loan_Repayment,Fixed Expenses,Investment,Savings,Shopping_Frequency,Class,...,Fitness,Gaming,Gardening,Photography,Reading,Social Media,Sports,Travel,Video Games,Writing
0,44637.0,49,2,0,0.0,33371.621929,5918.45,5347.18,8,0,...,0,0,0,0,0,0,0,0,0,0
1,26859.0,34,1,1,0.0,17181.777859,2021.16,7655.66,4,1,...,0,0,0,0,0,0,0,0,0,0
2,50368.0,35,3,2,4612.1,31864.051073,6762.07,7129.38,7,0,...,0,0,0,0,0,0,0,0,1,0
3,101456.0,21,2,2,6809.44,63028.205206,15974.01,15643.94,15,2,...,0,0,0,0,0,0,0,0,0,0
4,24875.0,52,0,1,3112.61,15496.973617,1874.66,4391.04,4,0,...,0,0,1,0,0,0,0,0,0,0


In [13]:
# List all column names after encoding (features plus the 'Class' target).
df.columns

Index(['Income', 'Age', 'Occupation', 'City_Tier', 'Loan_Repayment',
       'Fixed Expenses', 'Investment', 'Savings', 'Shopping_Frequency',
       'Class', 'Baking', 'Cooking', 'Electronics', 'Fitness', 'Gaming',
       'Gardening', 'Photography', 'Reading', 'Social Media', 'Sports',
       'Travel', 'Video Games', 'Writing'],
      dtype='object')

In [14]:
# Fitted Occupation labels; a label's position in this array is its integer code.
le_occupation.classes_

array(['Professional', 'Retired', 'Self_Employed', 'Student'],
      dtype=object)

In [15]:
# Fitted City_Tier labels; a label's position in this array is its integer code.
le_city.classes_

array(['Tier_1', 'Tier_2', 'Tier_3'], dtype=object)

In [16]:
# Fitted target labels; this order is also passed as target_names to the classification reports.
le_class.classes_

array(['Neutral', 'Saver', 'Spender'], dtype=object)

In [17]:
# Separate the target column from the feature matrix.
target_col = 'Class'
y = df[target_col]
X = df.drop(columns=[target_col])

In [18]:
numerical_cols = ['Income', 'Age', 'Loan_Repayment', 'Fixed Expenses', 'Investment', 'Savings', 'Shopping_Frequency']

# Fit the scaler on the training rows only. Fitting it on the full frame would
# let the held-out rows influence the mean/std used to scale the training data
# (data leakage), and that leaked scaler is also what gets persisted later.
# train_test_split's shuffle is determined by the row count and random_state,
# so repeating the test_size/random_state used by the split on X, y below
# selects the same training rows. Keep the two calls in sync.
X_num_train = train_test_split(X[numerical_cols], test_size=0.2, random_state=42)[0]

scaler = StandardScaler()
scaler.fit(X_num_train)

# Apply the train-fitted scaling to every row so the later split yields
# consistently scaled train and test features.
X[numerical_cols] = scaler.transform(X[numerical_cols])

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit a 100-tree random forest on the training split (seeded for reproducibility).
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [21]:
# Accuracy on the training split, i.e. the same rows the forest was fitted on.
model.score(X_train,y_train)

1.0

In [22]:
# Accuracy on the held-out test split.
model.score(X_test,y_test)

0.955

In [23]:
# Predicted (integer-encoded) classes for the test split, reused by the report cell below.
y_pred = model.predict(X_test)

In [24]:
# Compute the three evaluation artefacts first, then print them.
test_accuracy = accuracy_score(y_test, y_pred)
# target_names maps the integer codes back to the original class labels.
report_text = classification_report(y_test, y_pred, target_names=le_class.classes_)
conf_mat = confusion_matrix(y_test, y_pred)

print("✅ Accuracy:", test_accuracy)
print("\n📊 Classification Report:\n", report_text)
print("\n🧩 Confusion Matrix:\n", conf_mat)

✅ Accuracy: 0.955

📊 Classification Report:
               precision    recall  f1-score   support

     Neutral       0.95      0.96      0.96      1841
       Saver       0.92      0.90      0.91       869
     Spender       0.98      0.98      0.98      1290

    accuracy                           0.95      4000
   macro avg       0.95      0.95      0.95      4000
weighted avg       0.95      0.95      0.95      4000


🧩 Confusion Matrix:
 [[1768   70    3]
 [  69  782   18]
 [  18    2 1270]]


In [25]:
# Compare several classifiers on the same train/test split.
#
# The loop deliberately uses its own names (`candidate`, `candidate_pred`)
# instead of `model` / `y_pred`. Rebinding those would replace the random
# forest trained earlier in the notebook, and the object saved afterwards as
# `model` would silently become whichever entry happens to be last in this dict.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}

for name, candidate in models.items():
    candidate.fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    print(f"🔹 {name} Accuracy: {accuracy_score(y_test, candidate_pred):.4f}")
    print(classification_report(y_test, candidate_pred, target_names=le_class.classes_))

🔹 Logistic Regression Accuracy: 0.8715
              precision    recall  f1-score   support

     Neutral       0.84      0.90      0.87      1841
       Saver       0.93      0.94      0.94       869
     Spender       0.88      0.79      0.83      1290

    accuracy                           0.87      4000
   macro avg       0.88      0.87      0.88      4000
weighted avg       0.87      0.87      0.87      4000

🔹 XGBoost Accuracy: 0.9828
              precision    recall  f1-score   support

     Neutral       0.98      0.98      0.98      1841
       Saver       0.97      0.97      0.97       869
     Spender       0.99      0.99      0.99      1290

    accuracy                           0.98      4000
   macro avg       0.98      0.98      0.98      4000
weighted avg       0.98      0.98      0.98      4000

🔹 Random Forest Accuracy: 0.9550
              precision    recall  f1-score   support

     Neutral       0.95      0.96      0.96      1841
       Saver       0.92      0

In [26]:
# Display the estimator currently bound to `model` (the object the next cell saves).
model

In [27]:
# Save the estimator currently bound to `model`.
# Create the output directory first so the dump also works on a fresh checkout
# where `model/` does not exist yet.
Path("model").mkdir(parents=True, exist_ok=True)
# NOTE(review): the file name is spelled "random_foest" (sic). It is left
# unchanged because any code that loads this artifact presumably uses this
# exact path; rename both sides together.
joblib.dump(model,"model/random_foest_model.pkl")

['model/random_foest_model.pkl']

In [28]:
# Save encoders
# The hobby MultiLabelBinarizer is persisted alongside the label encoders:
# without it, inference code cannot rebuild the hobby indicator columns in the
# same order the model was trained on.
joblib.dump(le_occupation, "model/label_encoder_occupation.pkl")
joblib.dump(le_city, "model/label_encoder_city_tier.pkl")
joblib.dump(mlb, "model/multilabel_binarizer_hobbies.pkl")
joblib.dump(le_class, "model/label_encoder_class.pkl")

['model/label_encoder_class.pkl']

In [29]:
# Save scaler
# Persist the fitted StandardScaler so the same numeric scaling can be applied outside this notebook.
joblib.dump(scaler, "model/standard_scaler.pkl")

['model/standard_scaler.pkl']