In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE  
import catboost as cb
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('german.data', delimiter=' ', header=None, encoding='latin1')
print(df.head())

    0   1    2    3     4    5    6   7    8     9   ...    11  12    13  \
0  A11   6  A34  A43  1169  A65  A75   4  A93  A101  ...  A121  67  A143   
1  A12  48  A32  A43  5951  A61  A73   2  A92  A101  ...  A121  22  A143   
2  A14  12  A34  A46  2096  A61  A74   2  A93  A101  ...  A121  49  A143   
3  A11  42  A32  A42  7882  A61  A74   2  A93  A103  ...  A122  45  A143   
4  A11  24  A33  A40  4870  A61  A73   3  A93  A101  ...  A124  53  A143   

     14 15    16 17    18    19 20  
0  A152  2  A173  1  A192  A201  1  
1  A152  1  A173  1  A191  A201  2  
2  A152  1  A172  2  A191  A201  1  
3  A153  1  A173  2  A191  A201  1  
4  A153  2  A173  2  A191  A201  2  

[5 rows x 21 columns]


In [3]:
columns = ['checking_account','duration', 'credit_history', 'purpose', 'credit_amount', 'savings_account', 'employment', 
           'installment_rate', 'personal_status', 'other_debtors', 'residence_since', 'property', 'age', 'other_installment_plans', 
           'housing', 'existing_credits', 'job', 'liable_people', 'telephone', 'foreign_worker', 'target']

In [4]:
df.columns = columns

In [5]:
# Convert categorical features using LabelEncoder
categorical_columns = ['checking_account', 'credit_history', 'purpose', 'savings_account', 
                       'employment', 'personal_status', 'other_debtors', 'property', 
                       'other_installment_plans', 'housing', 'job', 'telephone', 'foreign_worker']

In [6]:
# Use LabelEncoder to encode categorical variables
le = LabelEncoder()
for col in categorical_columns:
    df[col] = le.fit_transform(df[col])

In [7]:
df_encoded = pd.get_dummies(df, columns=categorical_columns)

In [8]:
print(df_encoded.head())

   duration  credit_amount  installment_rate  residence_since  age  \
0         6           1169                 4                4   67   
1        48           5951                 2                2   22   
2        12           2096                 2                3   49   
3        42           7882                 2                4   45   
4        24           4870                 3                4   53   

   existing_credits  liable_people  target  checking_account_0  \
0                 2              1       1                True   
1                 1              1       2               False   
2                 1              2       1               False   
3                 1              2       1                True   
4                 2              2       2                True   

   checking_account_1  ...  housing_1  housing_2  job_0  job_1  job_2  job_3  \
0               False  ...       True      False  False  False   True  False   
1                True 

In [9]:
X = df.drop('target', axis=1)
y = df['target']

In [10]:
# Handle class imbalance 
smote = SMOTE()
X, y = smote.fit_resample(X, y)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

DECISION TREE

In [13]:
decision_tree = DecisionTreeClassifier(random_state=42)

In [14]:
decision_tree.fit(X_train, y_train)

In [15]:
y_pred_tree = decision_tree.predict(X_test)

In [16]:
accuracy = accuracy_score(y_test, y_pred_tree)
precision = precision_score(y_test, y_pred_tree)
recall = recall_score(y_test, y_pred_tree)
f1 = f1_score(y_test, y_pred_tree)

print(f"Decision Tree:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Decision Tree:
Accuracy: 0.7548
Precision: 0.7778
Recall: 0.7385
F1-Score: 0.7576


RANDOM FOREST MODEL

In [17]:
random_forest = RandomForestClassifier(random_state=42)

In [18]:
random_forest.fit(X_train, y_train)


In [19]:
y_pred_forest = random_forest.predict(X_test)

In [20]:
accuracy = accuracy_score(y_test, y_pred_forest)
precision = precision_score(y_test, y_pred_forest)
recall = recall_score(y_test, y_pred_forest)
f1 = f1_score(y_test, y_pred_forest)

y_pred_rf = random_forest.predict(X_test)
print("Random Forest Report:\n", classification_report(y_test, y_pred_forest))

# Tính Feature Importances cho Random Forest
importances = random_forest.feature_importances_
feature_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)

Random Forest Report:
               precision    recall  f1-score   support

           1       0.84      0.79      0.82       218
           2       0.79      0.84      0.82       202

    accuracy                           0.82       420
   macro avg       0.82      0.82      0.82       420
weighted avg       0.82      0.82      0.82       420



Trying Catboost


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import catboost as cb
from sklearn.metrics import accuracy_score

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

In [24]:

cat_model = cb.CatBoostClassifier(iterations=1000, learning_rate=0.6, depth=4, verbose=0)
cat_model.fit(X_train, y_train)
y_pred_cat = cat_model.predict(X_test)
cat_accuracy = accuracy_score(y_test, y_pred_cat)

print(f"Accuracy of CatBoost: {cat_accuracy * 100:.2f}%")



Accuracy of CatBoost: 86.43%
