### Problem Statement
Build Decision Tree and Random Forest models on the fraud data. Treat individuals with taxable income <= 30000 as "Risky" and all others as "Good" (i.e., discretize the taxable income column into these two classes).

### Business objective
The objective is to predict whether a given individual falls into a "Risky" or "Good" category based on the features provided, using Decision Tree and Random Forest models.

In [3]:
import pandas as pd
import numpy as np
# Read the fraud dataset (path is relative to the working directory) and
# preview the first rows.
csv_path = "Fraud_check.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(600, 6)

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [9]:
# Column dtypes and non-null counts; the object-dtype columns are the
# categorical features that need encoding before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.3+ KB


In [11]:
# Count missing values per column.
df.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

### Data Transformation

In [14]:
# Discretize taxable income into the target label: incomes at or below the
# threshold are "Risky", everything above it is "Good".
risk_income_threshold = 30000
is_risky = df['Taxable.Income'].le(risk_income_threshold)
df['Risk'] = np.where(is_risky, 'Risky', 'Good')

In [16]:
# Preview the frame to confirm the derived 'Risk' column is present.
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Risk
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good


Remove `Taxable.Income`: the `Risk` target was derived directly from it, so keeping it as a feature would leak the label into the models.

In [19]:
# Drop the column the target was derived from so it cannot be used as a feature.
df = df.drop(columns=['Taxable.Income'])

In [21]:
# Preview the frame to confirm 'Taxable.Income' is no longer among the columns.
df.head()

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Risk
0,NO,Single,50047,10,YES,Good
1,YES,Divorced,134075,18,YES,Good
2,NO,Married,160205,30,YES,Good
3,YES,Single,193264,15,YES,Good
4,NO,Married,27533,28,NO,Good


### Label Encoding

In [24]:
from sklearn.preprocessing import LabelEncoder

# Encode each categorical feature with its own LabelEncoder and keep the fitted
# encoders. Re-fitting a single shared encoder would discard the learned classes
# of every earlier column, making the integer codes impossible to inspect
# (`.classes_`) or decode (`.inverse_transform`) later.
feature_encoders = {}
for column in ['Undergrad', 'Marital.Status', 'Urban']:
    feature_encoders[column] = LabelEncoder()
    df[column] = feature_encoders[column].fit_transform(df[column])

# `le` is reserved for the target. LabelEncoder assigns codes in sorted label
# order, so 'Good' -> 0 and 'Risky' -> 1 in every metric reported downstream.
le = LabelEncoder()
df['Risk'] = le.fit_transform(df['Risk'])  # Target variable


### Split features and target variable

In [27]:
from sklearn.model_selection import train_test_split

# Separate the encoded 'Risk' target from the predictor columns.
y = df['Risk']
X = df.drop(columns=['Risk'])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): the split is not stratified on y - consider `stratify=y` if the
# class balance of the two partitions matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Build decision tree model

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a decision tree with default hyperparameters on the training partition;
# the fixed seed keeps the fitted tree reproducible. `fit` returns the estimator
# itself, so the call can be chained onto the constructor.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_confusion = confusion_matrix(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)

# Confusion-matrix rows are true classes and columns are predicted classes.
print("Decision Tree Model")
print("Accuracy:", dt_accuracy)
print(dt_confusion)
print(dt_report)


Decision Tree Model
Accuracy: 0.6277777777777778
[[105  38]
 [ 29   8]]
              precision    recall  f1-score   support

           0       0.78      0.73      0.76       143
           1       0.17      0.22      0.19        37

    accuracy                           0.63       180
   macro avg       0.48      0.48      0.48       180
weighted avg       0.66      0.63      0.64       180



### Build random forest model

In [33]:
# Fit a 100-tree random forest on the same training partition; the fixed seed
# keeps the ensemble reproducible. `fit` returns the estimator itself.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows with the same metrics used for the decision tree so
# the two models can be compared directly.
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest Model")
print("Accuracy:", rf_accuracy)
print(rf_confusion)
print(rf_report)


Random Forest Model
Accuracy: 0.7555555555555555
[[136   7]
 [ 37   0]]
              precision    recall  f1-score   support

           0       0.79      0.95      0.86       143
           1       0.00      0.00      0.00        37

    accuracy                           0.76       180
   macro avg       0.39      0.48      0.43       180
weighted avg       0.62      0.76      0.68       180

