In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [76]:
project=pd.read_csv("project l2.csv")

In [77]:
project.head()

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,Unnamed: 18,label
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2,,1
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,,1
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2,,1
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,,1
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,,1


In [78]:
project.isnull().sum()

Ind_ID                0
GENDER                7
Car_Owner             0
Propert_Owner         0
CHILDREN              0
Annual_income        23
Type_Income           0
EDUCATION             0
Marital_status        0
Housing_type          0
Birthday_count       22
Employed_days         0
Mobile_phone          0
Work_Phone            0
Phone                 0
EMAIL_ID              0
Type_Occupation     488
Family_Members        0
Unnamed: 18        1548
label                 0
dtype: int64

In [79]:
# Drop the empty unnamed column
project.drop('Unnamed: 18', axis=1, inplace=True, errors='ignore')


# Fill missing categorical values with 'Unknown'
project['GENDER'] = project['GENDER'].fillna('Unknown')

# Fill missing Type_Occupation with 'Retired' if income type is 'Pensioner'
project.loc[(project['Type_Income'] == 'Pensioner') & (project['Type_Occupation'].isnull()), 'Type_Occupation'] = 'Retired'

project['Type_Occupation'] = project['Type_Occupation'].fillna('Unknown')

# Using unknwon for birthday count rather than just taking median
project['Birthday_count'] = project['Birthday_count'].fillna('0')


# Fill missing numeric values with median
project['Annual_income'] = project['Annual_income'].fillna(project['Annual_income'].median())





In [80]:
project.isnull().sum()

Ind_ID             0
GENDER             0
Car_Owner          0
Propert_Owner      0
CHILDREN           0
Annual_income      0
Type_Income        0
EDUCATION          0
Marital_status     0
Housing_type       0
Birthday_count     0
Employed_days      0
Mobile_phone       0
Work_Phone         0
Phone              0
EMAIL_ID           0
Type_Occupation    0
Family_Members     0
label              0
dtype: int64

In [81]:
# checking datatypes
project.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548 entries, 0 to 1547
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Ind_ID           1548 non-null   int64  
 1   GENDER           1548 non-null   object 
 2   Car_Owner        1548 non-null   object 
 3   Propert_Owner    1548 non-null   object 
 4   CHILDREN         1548 non-null   int64  
 5   Annual_income    1548 non-null   float64
 6   Type_Income      1548 non-null   object 
 7   EDUCATION        1548 non-null   object 
 8   Marital_status   1548 non-null   object 
 9   Housing_type     1548 non-null   object 
 10  Birthday_count   1548 non-null   object 
 11  Employed_days    1548 non-null   int64  
 12  Mobile_phone     1548 non-null   int64  
 13  Work_Phone       1548 non-null   int64  
 14  Phone            1548 non-null   int64  
 15  EMAIL_ID         1548 non-null   int64  
 16  Type_Occupation  1548 non-null   object 
 17  Family_Members

In [82]:
 project['Birthday_count'] = pd.to_numeric(project['Birthday_count'], errors='coerce')
project['Age'] = (-project['Birthday_count']) // 365  # approximate age in years

 project['Employed_days'] = pd.to_numeric(project['Employed_days'], errors='coerce')

project['Employment_years'] = project['Employed_days'].apply(lambda x: (-x / 365) if x < 0 else np.nan)
 # approximate working years and replacing unemployed with nan



In [83]:
project.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548 entries, 0 to 1547
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Ind_ID            1548 non-null   int64  
 1   GENDER            1548 non-null   object 
 2   Car_Owner         1548 non-null   object 
 3   Propert_Owner     1548 non-null   object 
 4   CHILDREN          1548 non-null   int64  
 5   Annual_income     1548 non-null   float64
 6   Type_Income       1548 non-null   object 
 7   EDUCATION         1548 non-null   object 
 8   Marital_status    1548 non-null   object 
 9   Housing_type      1548 non-null   object 
 10  Birthday_count    1548 non-null   float64
 11  Employed_days     1548 non-null   int64  
 12  Mobile_phone      1548 non-null   int64  
 13  Work_Phone        1548 non-null   int64  
 14  Phone             1548 non-null   int64  
 15  EMAIL_ID          1548 non-null   int64  
 16  Type_Occupation   1548 non-null   object 


In [84]:
#dropping ind_id
project.drop(["Ind_ID"],axis=1,inplace=True)

In [85]:
# List of columns to one-hot encode
one_hot_cols = ['GENDER', 'Car_Owner', 'Propert_Owner', 'Type_Income', 
                'EDUCATION', 'Marital_status', 'Housing_type', 'Type_Occupation']

# Apply encoding and removing dummy variables for multicollinearity for logistic regression
project = pd.get_dummies(project, columns=one_hot_cols, drop_first=True)


In [86]:
project.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548 entries, 0 to 1547
Data columns (total 51 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   CHILDREN                                 1548 non-null   int64  
 1   Annual_income                            1548 non-null   float64
 2   Birthday_count                           1548 non-null   float64
 3   Employed_days                            1548 non-null   int64  
 4   Mobile_phone                             1548 non-null   int64  
 5   Work_Phone                               1548 non-null   int64  
 6   Phone                                    1548 non-null   int64  
 7   EMAIL_ID                                 1548 non-null   int64  
 8   Family_Members                           1548 non-null   int64  
 9   label                                    1548 non-null   int64  
 10  Age                                      1548 no

In [93]:
#splitting into target and independent variable

X = project.drop(columns=['label'])
y = project['label']

In [94]:
# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [95]:
# Decision Tree using entropy criterion
model_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
model_entropy.fit(X_train, y_train)

In [96]:
# Predict the test data and calculate the model accuracy
y_pred_entropy = model_entropy.predict(X_test)
print("Accuracy of Decision Tree using ENTROPY criterion:", accuracy_score(y_test, y_pred_entropy))

Accuracy of Decision Tree using ENTROPY criterion: 0.9


In [101]:
train_acc = model_entropy.score(X_train, y_train)
test_acc = model_entropy.score(X_test, y_pred_entropy)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy: {test_acc:.4f}")


Training Accuracy: 0.9935
Testing Accuracy: 1.0000


In [None]:
# model is overfitted with high tsesting accuracy. now, will try with tuning the hyperparameters.


In [104]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create base model
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=dt,
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=-1)

# Fit model on training data
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Predict on test set
y_pred = best_model.predict(X_test)

# Print results
print("Best Parameters:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

# Detailed performance metrics
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Optional: training vs testing accuracy check for overfitting
train_acc = best_model.score(X_train, y_train)
test_acc = best_model.score(X_test, y_test)
print(f"\nTraining Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy: {test_acc:.4f}")


Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Test Accuracy: 0.9064516129032258

Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95       280
           1       0.67      0.07      0.12        30

    accuracy                           0.91       310
   macro avg       0.79      0.53      0.54       310
weighted avg       0.89      0.91      0.87       310

Confusion Matrix:
 [[279   1]
 [ 28   2]]

Training Accuracy: 0.8974
Testing Accuracy: 0.9065


In [105]:
print(project['label'].value_counts())


label
0    1373
1     175
Name: count, dtype: int64


In [None]:
# our dataset has problem of imbalance so we cannot use smote or class weights because of missing values. so doing high gradient boosting classifier as it is good for 
# missing values.

In [107]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load your dataset 

df = project.copy()

# Separate features and target
X = df.drop(columns=['label'])  # assuming 'label' is your target
y = df['label']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize the model
model = HistGradientBoostingClassifier(
    random_state=42,
    class_weight='balanced'  # This handles class imbalance
)

# Fit the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8903225806451613

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94       275
           1       0.51      0.51      0.51        35

    accuracy                           0.89       310
   macro avg       0.73      0.73      0.73       310
weighted avg       0.89      0.89      0.89       310


Confusion Matrix:
 [[258  17]
 [ 17  18]]


In [108]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define parameter distributions to search
param_dist = {
    'max_iter': randint(50, 200),          # Number of boosting iterations
    'max_depth': randint(3, 15),           # Depth of each tree
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage
    'max_leaf_nodes': randint(10, 50)      # Max leaves per tree
}

# Initialize base model
model = HistGradientBoostingClassifier(random_state=42, class_weight='balanced')

# Randomized search with 20 different combinations, 5-fold cross-validation
search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

# Fit search on training data
search.fit(X_train, y_train)

# Best model from search
best_model = search.best_estimator_

# Predict and evaluate on test set
y_pred = best_model.predict(X_test)

print("Best Parameters:", search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Best Parameters: {'learning_rate': 0.2, 'max_depth': 10, 'max_iter': 89, 'max_leaf_nodes': 30}
Test Accuracy: 0.9032258064516129

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.95       275
           1       0.58      0.51      0.55        35

    accuracy                           0.90       310
   macro avg       0.76      0.73      0.75       310
weighted avg       0.90      0.90      0.90       310


Confusion Matrix:
 [[262  13]
 [ 17  18]]
