### Importing libraries

In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from  category_encoders import OneHotEncoder,HashingEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pickle

### Data importing

In [45]:
# Load the preprocessed dataset, using the first CSV column as the row index.
DATA_PATH = 'preprocess_df.csv'
df = pd.read_csv(DATA_PATH, index_col=0)

In [46]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Gender,Age,Average Monthly Spend on Entertainment,Number of Online Purchases in Last Month,Average Weekly Exercise Hours,Investment Portfolio Value,Health Consciousness Rating,Education Level,Average Daily Screen Time,Environmental Awareness Rating,...,Work-Life Balance Indicator,Investment Risk Appetite,Eco-Consciousness Metric,Stress Management Score,Time Management Skill,Lifestyle Choice,State,Country,Donation Class,Training Category
0,M,49,40.4767,97.2243,0.0541,294.4833,0.6917,7.9403,6.6903,3.4319,...,0.0088,7.2593,0.0152,5.1301,89.3802,Eco-Friendly,Tasmania,Australia,Single Donation,Moderate Trainings 3-4
1,M,43,9.2237,82.4243,0.6813,19.7512,5.5313,11.797,7.6367,0.5887,...,0.4631,6.8483,0.0,4.033,100.6926,Adventure Seeker,Lapland,Finland,Single Donation,High Trainings 6-11
2,F,37,7.9816,163.5799,0.0398,217.2924,0.7065,0.8247,4.3922,3.7221,...,0.0136,0.0,0.0,3.0287,31.9631,Urban Professional,Nunavut,Canada,No Donation,Moderate Trainings 3-4
3,M,22,7.0638,128.3473,0.8181,2.3693,0.7348,12.8065,3.9565,2.0169,...,0.8671,4.1594,0.0035,2.0505,32.1793,Budget-Conscious,Drenthe,Netherlands,No Donation,Moderate Trainings 3-4
4,M,38,7.6343,126.6422,0.0,2.4449,2.1085,3.2717,6.5593,0.6248,...,0.0269,6.7901,0.4293,4.7058,27.9519,Health-Conscious,Queensland,Australia,Occational Donation <= 3,Moderate Trainings 3-4


### Target/features splitting

In [47]:
# Separate the prediction target from the predictor columns.
TARGET_COL = 'Lifestyle Choice'
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Report the target shape first, then the feature-matrix shape.
for part in (y, X):
    print(part.shape)

(475675,)
(475675, 25)


### Categorical variables encoding

In [48]:
# Count the distinct values of every object-typed column; the cardinality
# guides which encoder is used for each column.
object_columns = df.select_dtypes(include='object')
object_columns.nunique()

Gender                 2
Lifestyle Choice      12
State                664
Country               21
Donation Class         4
Training Category      4
dtype: int64

#### Encoding algorithm
- `Gender`: with only 2 unique values, `LabelEncoder` is efficient, as it assigns 0 and 1 to the two categories.
- `Donation Class` and `Training Category`: as they have ordinal classes, `LabelEncoder` is efficient, as it assigns 0, 1, 2, 3 to the classes. Note that `LabelEncoder` numbers the classes in sorted label order, which may differ from their natural ordinal order.
- `Country`: it has 21 unique values, so one-hot encoding (OHE) should do the job.
- `State`: with 664 unique values, OHE would create 664 new features, which is a lot, so we will use feature hashing because it is computationally efficient and handles high-cardinality features well without creating an excessive number of new columns.

In [49]:
# Integer-encode Gender, Donation Class, and Training Category.
# A dedicated LabelEncoder is fitted per column and kept in `label_encoders`,
# so each column's label <-> code mapping stays available afterwards (a single
# shared encoder only remembers the classes of the last column it was fitted on).
# NOTE: LabelEncoder numbers classes in sorted label order, which is not
# necessarily the domain's ordinal ranking.
label_encoders = {}
for column in ['Gender', 'Donation Class', 'Training Category']:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le


In [50]:
# Inspect the integer codes now stored in Gender.
pd.unique(X['Gender'])

array([1, 0])

In [51]:
# Inspect the integer codes now stored in Donation Class.
pd.unique(X['Donation Class'])

array([3, 0, 1, 2])

In [52]:
# Inspect the integer codes now stored in Training Category.
pd.unique(X['Training Category'])

array([2, 1, 0, 3])

In [53]:
# One-hot encode Country; use_cat_names=True names each new column after its
# category value (e.g. Country_Australia).
ohe = OneHotEncoder(cols=['Country'], use_cat_names=True)
X_encoded = ohe.fit_transform(X)
X_encoded.head(n=5)

Unnamed: 0,Gender,Age,Average Monthly Spend on Entertainment,Number of Online Purchases in Last Month,Average Weekly Exercise Hours,Investment Portfolio Value,Health Consciousness Rating,Education Level,Average Daily Screen Time,Environmental Awareness Rating,...,Country_Spain,Country_France,Country_Denmark,Country_India,Country_Serbia,Country_Ukraine,Country_Turkey,Country_Iran,Donation Class,Training Category
0,1,49,40.4767,97.2243,0.0541,294.4833,0.6917,7.9403,6.6903,3.4319,...,0,0,0,0,0,0,0,0,3,2
1,1,43,9.2237,82.4243,0.6813,19.7512,5.5313,11.797,7.6367,0.5887,...,0,0,0,0,0,0,0,0,3,1
2,0,37,7.9816,163.5799,0.0398,217.2924,0.7065,0.8247,4.3922,3.7221,...,0,0,0,0,0,0,0,0,0,2
3,1,22,7.0638,128.3473,0.8181,2.3693,0.7348,12.8065,3.9565,2.0169,...,0,0,0,0,0,0,0,0,0,2
4,1,38,7.6343,126.6422,0.0,2.4449,2.1085,3.2717,6.5593,0.6248,...,0,0,0,0,0,0,0,0,1,2


In [54]:
# List the column names after one-hot encoding Country.
X_encoded.columns

Index(['Gender', 'Age', 'Average Monthly Spend on Entertainment',
       'Number of Online Purchases in Last Month',
       'Average Weekly Exercise Hours', 'Investment Portfolio Value',
       'Health Consciousness Rating', 'Education Level',
       'Average Daily Screen Time', 'Environmental Awareness Rating',
       'Social Media Influence Score', 'Risk Tolerance in Investments',
       'Tech-Savviness Score', 'Financial Wellness Index',
       'Lifestyle Balance Score', 'Social Responsibility Index',
       'Work-Life Balance Indicator', 'Investment Risk Appetite',
       'Eco-Consciousness Metric', 'Stress Management Score',
       'Time Management Skill', 'State', 'Country_Australia',
       'Country_Finland', 'Country_Canada', 'Country_Netherlands',
       'Country_Switzerland', 'Country_Brazil', 'Country_Norway',
       'Country_United States', 'Country_Germany', 'Country_Mexico',
       'Country_United Kingdom', 'Country_Ireland', 'Country_New Zealand',
       'Country_Spain',

In [55]:
# Hash-encode State into a fixed number of columns.
# The column count is the square root of the number of distinct states,
# rounded up to the next integer.
n_unique_states = df['State'].nunique()
n_components_state = int(np.ceil(np.sqrt(n_unique_states)))
he_state = HashingEncoder(n_components=n_components_state, cols=['State'])
state_encoded = he_state.fit_transform(X_encoded['State'])

In [56]:
# Show the names of the columns produced by hashing State.
state_encoded.columns

Index(['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7',
       'col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_13', 'col_14',
       'col_15', 'col_16', 'col_17', 'col_18', 'col_19', 'col_20', 'col_21',
       'col_22', 'col_23', 'col_24', 'col_25'],
      dtype='object')

In [57]:
# Swap the raw State column for its hashed representation.
features_without_state = X_encoded.drop(columns=['State'])
X_encoded = pd.concat([features_without_state, state_encoded], axis='columns')
X_encoded.head(n=5)

Unnamed: 0,Gender,Age,Average Monthly Spend on Entertainment,Number of Online Purchases in Last Month,Average Weekly Exercise Hours,Investment Portfolio Value,Health Consciousness Rating,Education Level,Average Daily Screen Time,Environmental Awareness Rating,...,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25
0,1,49,40.4767,97.2243,0.0541,294.4833,0.6917,7.9403,6.6903,3.4319,...,0,0,0,0,0,0,0,0,0,0
1,1,43,9.2237,82.4243,0.6813,19.7512,5.5313,11.797,7.6367,0.5887,...,0,0,0,0,0,1,0,0,0,0
2,0,37,7.9816,163.5799,0.0398,217.2924,0.7065,0.8247,4.3922,3.7221,...,0,0,0,0,0,0,0,0,0,0
3,1,22,7.0638,128.3473,0.8181,2.3693,0.7348,12.8065,3.9565,2.0169,...,0,0,0,0,0,0,0,0,0,0
4,1,38,7.6343,126.6422,0.0,2.4449,2.1085,3.2717,6.5593,0.6248,...,0,0,0,0,0,1,0,0,0,0


#### Target Encoding

- As the model algorithm we chose - Random Forest - doesn't handle categorical targets, we need to encode our target.
- We will use `LabelEncoder` as a simple algorithm, because Random Forest doesn't assume any ordinal relationship between classes, so the specific integer values don't matter as long as they're consistent.


In [58]:
# Number of distinct target classes (missing values are not counted).
y.nunique(dropna=True)

12

In [59]:
# Learn the label -> integer mapping for the target, then apply it.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Show the distinct class ids that were produced.
set(y_encoded)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}

### Train-Test Split

In [60]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X_encoded, y_encoded, test_size=0.3, random_state=42
)
X_train, X_test, y_train, y_test = split_parts

# Shapes are printed in the order: X_train, y_train, X_test, y_test.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(332972, 70)
(332972,)
(142703, 70)
(142703,)


### Train the model

In [61]:
# Build a seeded random forest and fit it on the training split
# (fit returns the fitted estimator itself).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict class ids for the held-out rows.
pred_y = model.predict(X_test)

In [62]:
# Score the predictions against the held-out labels.
accuracy = accuracy_score(y_test, pred_y)
report = classification_report(y_test, pred_y)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.6896771616574283

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.65      0.62     11852
           1       0.47      0.33      0.39     12130
           2       0.78      0.79      0.79     11832
           3       0.79      0.72      0.76     12049
           4       0.66      0.72      0.69     11941
           5       0.56      0.59      0.58     11928
           6       0.66      0.71      0.69     11958
           7       0.78      0.75      0.77     11807
           8       0.85      0.91      0.88     11858
           9       0.83      0.87      0.85     11719
          10       0.73      0.73      0.73     11774
          11       0.51      0.50      0.50     11855

    accuracy                           0.69    142703
   macro avg       0.69      0.69      0.69    142703
weighted avg       0.69      0.69      0.69    142703



Ok! Our model is working, but with an accuracy of 0.69 (69%) there is room for improvement.

### Feature importance

In [63]:
# List the feature columns the model was trained on.
X_train.columns

Index(['Gender', 'Age', 'Average Monthly Spend on Entertainment',
       'Number of Online Purchases in Last Month',
       'Average Weekly Exercise Hours', 'Investment Portfolio Value',
       'Health Consciousness Rating', 'Education Level',
       'Average Daily Screen Time', 'Environmental Awareness Rating',
       'Social Media Influence Score', 'Risk Tolerance in Investments',
       'Tech-Savviness Score', 'Financial Wellness Index',
       'Lifestyle Balance Score', 'Social Responsibility Index',
       'Work-Life Balance Indicator', 'Investment Risk Appetite',
       'Eco-Consciousness Metric', 'Stress Management Score',
       'Time Management Skill', 'Country_Australia', 'Country_Finland',
       'Country_Canada', 'Country_Netherlands', 'Country_Switzerland',
       'Country_Brazil', 'Country_Norway', 'Country_United States',
       'Country_Germany', 'Country_Mexico', 'Country_United Kingdom',
       'Country_Ireland', 'Country_New Zealand', 'Country_Spain',
       'Country

In [64]:
# Pair every training column with the importance the fitted forest assigned to it.
importance_by_feature = {
    'feature': X_train.columns,
    'importance': model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_by_feature)

# Raise the notebook display limit to 100 rows.
pd.set_option('display.max_rows', 100)

# Most important features first.
features_sorted = feature_importance.sort_values(by='importance', ascending=False)
features_sorted

Unnamed: 0,feature,importance
0,Gender,0.004836
1,Age,0.020186
2,Average Monthly Spend on Entertainment,0.040803
3,Number of Online Purchases in Last Month,0.032099
4,Average Weekly Exercise Hours,0.069665
5,Investment Portfolio Value,0.077869
6,Health Consciousness Rating,0.047403
7,Education Level,0.025305
8,Average Daily Screen Time,0.031755
9,Environmental Awareness Rating,0.069459


In [65]:
# Keep only the 18 highest-ranked features.
top_features = features_sorted['feature'].head(18).tolist()
X_subset = X_encoded[top_features]

In [66]:
# Show which features were kept in the reduced feature set.
X_subset.columns

Index(['Tech-Savviness Score', 'Investment Portfolio Value',
       'Risk Tolerance in Investments', 'Average Weekly Exercise Hours',
       'Environmental Awareness Rating', 'Financial Wellness Index',
       'Social Media Influence Score', 'Health Consciousness Rating',
       'Average Monthly Spend on Entertainment', 'Stress Management Score',
       'Investment Risk Appetite', 'Number of Online Purchases in Last Month',
       'Average Daily Screen Time', 'Education Level',
       'Lifestyle Balance Score', 'Time Management Skill',
       'Work-Life Balance Indicator', 'Eco-Consciousness Metric'],
      dtype='object')

In [67]:
# Re-split using only the selected features: 20% held out, same seed as before.
# NOTE(review): the earlier split on X_encoded used test_size=0.3, so the two
# experiments are scored on hold-out sets of different sizes.
split_parts = train_test_split(
    X_subset, y_encoded, test_size=0.2, random_state=42
)
X_train, X_test, y_train, y_test = split_parts

# Shapes are printed in the order: X_train, y_train, X_test, y_test.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(380540, 18)
(380540,)
(95135, 18)
(95135,)


In [68]:
# Refit a fresh seeded random forest on the current training split
# (fit returns the fitted estimator itself).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict class ids for the held-out rows.
pred_y = model.predict(X_test)

In [69]:
# Score the predictions against the held-out labels.
accuracy = accuracy_score(y_test, pred_y)
report = classification_report(y_test, pred_y)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.7109265780207075

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.66      0.64      7955
           1       0.50      0.37      0.43      8124
           2       0.81      0.82      0.81      7888
           3       0.81      0.74      0.77      8014
           4       0.68      0.74      0.71      7968
           5       0.58      0.61      0.59      8034
           6       0.69      0.72      0.71      7895
           7       0.83      0.78      0.80      7845
           8       0.88      0.92      0.90      7919
           9       0.85      0.89      0.87      7785
          10       0.74      0.77      0.76      7768
          11       0.53      0.52      0.52      7940

    accuracy                           0.71     95135
   macro avg       0.71      0.71      0.71     95135
weighted avg       0.71      0.71      0.71     95135



In [70]:
# Serialize the fitted model to model.pkl.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

### Hyperparameter tuning

In [None]:
# Candidate values for each random-forest hyperparameter explored by the search.
param_grid = dict(
    n_estimators=[200, 300, 400],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)

In [None]:
# Base estimator for the search; the seed keeps each forest's training repeatable.
model = RandomForestClassifier(random_state=42)

# Randomized search over `param_grid`.
# random_state is set on the search itself so the same 30 candidate settings
# are sampled on every run; without it the search results are not reproducible.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_grid,
    n_iter=30,  # Number of parameter settings sampled
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42,
)


In [None]:
# Run the randomized search on the training split.
random_search.fit(X_train, y_train)

# Pull out the winning configuration and the estimator built with it.
best_params = random_search.best_params_
best_rf_model = random_search.best_estimator_

# Report the winning parameters, then the best score found by the search.
print(f'Best Parameters: {best_params}')
print(f'Best Score: {random_search.best_score_}')
