## USING  DECISION TREE

In [46]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [47]:
df = pd.read_csv('new_data3.csv')

In [48]:
X_demograph = df.loc[:, ['Motive','Income Range','Gender','Education level','Number of Children','Age range','Occupation']]
y_demograph = df['Insured']

In [49]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()

X_dem_resampled, y_dem_resampled = smote.fit_resample(X_demograph, y_demograph)


In [50]:
X_train_dem, X_test_dem, y_train_dem, y_test_dem = train_test_split(X_dem_resampled,y_dem_resampled, test_size = 0.3)

**FITTING THE DEMOGRAPHIC MODEL**

In [51]:
# create decision tree classifier
dtd = DecisionTreeClassifier(random_state=50)

# fit the model on the training data
dtd.fit(X_train_dem, y_train_dem)

# predict on the train set
y_train_pred_dem = dtd.predict(X_train_dem)
# predict on the test set
y_test_pred_dem = dtd.predict(X_test_dem)

In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Calculate accuracy
accuracy = accuracy_score(y_test_dem, y_test_pred_dem)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test_dem, y_test_pred_dem)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test_dem, y_test_pred_dem)
print("Recall:", recall)

# Calculate F1-score
f1 = f1_score(y_test_dem, y_test_pred_dem)
print("F1-score:", f1)


Accuracy: 0.775
Precision: 0.85
Recall: 0.7391304347826086
F1-score: 0.7906976744186046


In [53]:
from sklearn.model_selection import cross_val_score

scored1 = cross_val_score(dtd, X_test_dem,y_test_dem,cv =3)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scored1.mean(), scored1.std()))


0.75 accuracy with a standard deviation of 0.10


In [54]:
# Get feature importances
importances = dtd.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Print feature importance rankings
print("Feature importance rankings:")
for i, index in enumerate(indices):
    print(f"{i+1}. Feature: {X_train_dem.columns[index]} - Importance: {importances[index]}")

Feature importance rankings:
1. Feature: Income Range - Importance: 0.2976048965125364
2. Feature: Motive - Importance: 0.2460615023622008
3. Feature: Number of Children - Importance: 0.13703290988782388
4. Feature: Education level - Importance: 0.11776213345452598
5. Feature: Gender - Importance: 0.06798487812528378
6. Feature: Age range - Importance: 0.06684078891977736
7. Feature: Occupation - Importance: 0.06671289073785173
