### Algorythms comparision

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler

In [27]:
data = np.load(r'..\data\03_processed\02_normalized_data.npy')
df = pd.DataFrame(data)
df = df.rename(columns={1:'Income', 2:'Age', 3:'Experience', 4:'CURRENT_JOB_YRS', 5:'CURRENT_HOUSE_YRS'})
df_categorical= pd.read_csv(r'..\data\02_interim\01_categorical_data.csv')
df_categorical.head()

Unnamed: 0,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,Risk_Flag
0,0,1.0,0,Mechanical_engineer,Rewa,Madhya_Pradesh,0
1,0,1.0,0,Software_Developer,Parbhani,Maharashtra,0
2,1,1.0,0,Technical_writer,Alappuzha,Kerala,0
3,0,1.0,1,Software_Developer,Bhubaneswar,Odisha,1
4,0,1.0,0,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,1


In [28]:
final_df = pd.concat([df, df_categorical], axis=1)
final_df.drop(0, axis=1, inplace=True)
final_df.head()

category_columns = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']

for col in category_columns:
    final_df[col] = final_df[col].astype('category')

print(final_df.info())  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252000 entries, 0 to 251999
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype   
---  ------           --------------   -----   
 0   Income           252000 non-null  float64 
 1   Age              252000 non-null  float64 
 2   Experience       252000 non-null  float64 
 3   CURRENT_JOB_YRS  252000 non-null  float64 
 4   Married/Single   252000 non-null  category
 5   House_Ownership  231898 non-null  category
 6   Car_Ownership    252000 non-null  category
 7   Profession       252000 non-null  category
 8   CITY             252000 non-null  category
 9   STATE            252000 non-null  category
 10  Risk_Flag        252000 non-null  int64   
dtypes: category(6), float64(4), int64(1)
memory usage: 11.3 MB
None


In [30]:
y = final_df['Risk_Flag']
X = final_df.drop('Risk_Flag', axis=1).values

encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

rus = RandomUnderSampler(random_state=42)

X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
X_train = X_resampled
y_train = y_resampled

### Logistic regresion

In [None]:
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train, y_train)

In [None]:
y1_pred = model1.predict(X_test)

accuracy = accuracy_score(y_test, y1_pred)

print("LogisticRegression Model1")
print("Accuracy:", accuracy)

cm = confusion_matrix(y_test, y1_pred)
print("Confusion Matrix:") 
print(cm)

LogisticRegression Model1
Accuracy: 0.5780952380952381
Confusion Matrix:
[[25315 18832]
 [ 2432  3821]]


### Random Forest

In [None]:
model2 = RandomForestClassifier(n_estimators=50, max_depth=10, max_samples=0.8, random_state=42)
model2.fit(X_train, y_train)

In [None]:
y2_pred = model2.predict(X_test)

accuracy = accuracy_score(y_test, y2_pred)

print("RandomForest Model2")
print("Accuracy:", accuracy)
print(classification_report(y_test, y2_pred))
print(confusion_matrix(y_test, y2_pred))

RandomForest Model2
Accuracy: 0.6441865079365079
              precision    recall  f1-score   support

           0       0.93      0.64      0.76     44147
           1       0.21      0.65      0.31      6253

    accuracy                           0.64     50400
   macro avg       0.57      0.65      0.54     50400
weighted avg       0.84      0.64      0.70     50400

[[28392 15755]
 [ 2178  4075]]


### Decision tree

In [None]:
model3 = DecisionTreeClassifier(random_state=42)
model3.fit(X_train, y_train)

y3_pred = model3.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y3_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y3_pred)
print("Classification Report:")
print(report)

accuracy = accuracy_score(y_test, y3_pred)
print("Accuracy:", accuracy)

Confusion Matrix:
[[37567  6580]
 [  833  5420]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.85      0.91     44147
           1       0.45      0.87      0.59      6253

    accuracy                           0.85     50400
   macro avg       0.71      0.86      0.75     50400
weighted avg       0.91      0.85      0.87     50400

Accuracy: 0.8529166666666667


### Gradient boosting

In [None]:
model4 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)
model4.fit(X_train, y_train)

In [None]:
y4_pred = model4.predict(X_test)

cm = confusion_matrix(y_test, y4_pred)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, y4_pred)
print("Classification Report:")
print(report)

accuracy = accuracy_score(y_test, y4_pred)
print("Accuracy:", accuracy)

Confusion Matrix:
[[21498 22649]
 [ 2032  4221]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.49      0.64     44147
           1       0.16      0.68      0.25      6253

    accuracy                           0.51     50400
   macro avg       0.54      0.58      0.45     50400
weighted avg       0.82      0.51      0.59     50400

Accuracy: 0.510297619047619


# Insights on Classification Trees

### Accuracy
- Overall accuracy: **85.29%**
## Key Insights

1. **High Recall for Class 1**: The model effectively identifies 87% of actual positives.
2. **Low Precision for Class 1**: The model has a significant number of false positives, with a precision of 0.45.
3. **Performance Imbalance**: The model performs exceptionally well on Class 0 but less so on Class 1.

## Recommendations

1. **Balance the Dataset**: Use oversampling or undersampling techniques.
2. **Adjust the Decision Threshold**: Tune the threshold to improve precision for Class 1.
3. **Ensemble Methods**: Implement Random Forest or Gradient Boosting for better performance.
4. **Feature Engineering**: Enhance features to improve class distinction.
5. **Regularization**: Apply pruning to reduce overfitting.

By addressing these areas, the classification tree model's performance, particularly for the minority class, can be significantly improved.
```