## USING RANDOM FOREST

In [93]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [94]:
df = pd.read_csv('new_data3.csv')

In [95]:
X_demograph = df.loc[:, ['Motive','Income Range','Gender','Education level','Number of Children','Age range','Occupation']]
y_demograph = df['Insured']

In [96]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

X_dem_resampled, y_dem_resampled = smote.fit_resample(X_demograph, y_demograph)

In [97]:
X_train_dem, X_test_dem, y_train_dem, y_test_dem = train_test_split(X_dem_resampled,y_dem_resampled, test_size = 0.3)

**FITTING THE DEMOGRAPHIC MODEL**

In [98]:
# initialize random forest classifier
rfcd = RandomForestClassifier(n_estimators=100, random_state=25)

# fit the model
rfcd.fit(X_train_dem, y_train_dem)

# predict on the train set
y_train_pred_dem = rfcd.predict(X_train_dem)
# predict on the test set
y_test_pred_dem = rfcd.predict(X_test_dem)

In [99]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Accuracy
accuracy = accuracy_score(y_test_dem, y_test_pred_dem)
print("Accuracy:", accuracy)

# Precision
precision = precision_score(y_test_dem, y_test_pred_dem)
print("Precision:", precision)

# Recall
recall = recall_score(y_test_dem, y_test_pred_dem)
print("Recall:", recall)

# F1-score
f1 = f1_score(y_test_dem, y_test_pred_dem)
print("F1-score:", f1)


Accuracy: 0.8
Precision: 0.8235294117647058
Recall: 0.7368421052631579
F1-score: 0.7777777777777778


In [100]:
from sklearn.model_selection import cross_val_score

scored1 = cross_val_score(rfcd, X_test_dem,y_test_dem,cv =5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scored1.mean(), scored1.std()))


0.82 accuracy with a standard deviation of 0.10


In [101]:
# Get feature importances
importances = rfcd.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Print feature importance rankings
print("Feature importance rankings:")
for i, index in enumerate(indices):
    print(f"{i+1}. Feature: {X_train_dem.columns[index]} - Importance: {importances[index]}")

Feature importance rankings:
1. Feature: Income Range - Importance: 0.2630369760532254
2. Feature: Education level - Importance: 0.20740882173965194
3. Feature: Motive - Importance: 0.16165606531322205
4. Feature: Number of Children - Importance: 0.15167537724675015
5. Feature: Occupation - Importance: 0.08778110492622122
6. Feature: Age range - Importance: 0.07293642988128919
7. Feature: Gender - Importance: 0.055505224839640085
