In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [5]:
# Read the cleaned German credit CSV (expected in the working directory)
# into a DataFrame that every later cell builds on.
data = pd.read_csv(filepath_or_buffer='german_credit_cleaned.csv')

# Show the leading five rows as plain text to sanity-check the load.
print(data.head(n=5))


  checking_acc_status  duration                     cred_hist  \
0             below_0         6  risky_acc_or_curr_loan_other   
1           below_200        48          curr_loans_paid_duly   
2      no_cheking_acc        12  risky_acc_or_curr_loan_other   
3             below_0        42          curr_loans_paid_duly   
4             below_0        24                 delay_in_past   

               purpose  loan_amt       saving_acc_bonds  \
0             radio_tv      1169  unknown_no_saving_acc   
1             radio_tv      5951              below_100   
2            education      2096              below_100   
3  furniture_equipment      7882              below_100   
4              car_new      4870              below_100   

  present_employment_since  installment_rate        personal_stat_gender  \
0                 above_7y                 4                 male:single   
1                 below_4y                 2  female:divorced_or_married   
2                 below_7y

In [6]:
# Report how many missing values each column contains.
print(data.isnull().sum())

# Forward-fill any gaps: each missing cell takes the last valid value above it
# in the same column (a missing value in the very first row would stay NaN).
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling,
# and rebinding `data` avoids `inplace=True` so the cell is safe to re-run.
data = data.ffill()


checking_acc_status         0
duration                    0
cred_hist                   0
purpose                     0
loan_amt                    0
saving_acc_bonds            0
present_employment_since    0
installment_rate            0
personal_stat_gender        0
other_debtors_guarantors    0
present_residence_since     0
property                    0
age                         0
other_installment_plans     0
housing                     0
num_curr_loans              0
job                         0
num_people_provide_maint    0
telephone                   0
is_foreign_worker           0
target                      0
dtype: int64


In [7]:
# One-hot encode every categorical column. `drop_first=True` omits the first
# level of each category, so a k-level column produces k-1 indicator columns
# (e.g. the `target` column yields `target_good`, used as the label later on).
data_encoded = pd.get_dummies(data=data, drop_first=True)


In [9]:
# List the column labels produced by the one-hot encoding step.
encoded_columns = data_encoded.columns
print(encoded_columns)


Index(['duration', 'loan_amt', 'installment_rate', 'present_residence_since',
       'age', 'num_curr_loans', 'num_people_provide_maint',
       'checking_acc_status_below_0', 'checking_acc_status_below_200',
       'checking_acc_status_no_cheking_acc', 'cred_hist_delay_in_past',
       'cred_hist_no_loan_or_paid_duly_other', 'cred_hist_paid_duly_this_bank',
       'cred_hist_risky_acc_or_curr_loan_other', 'purpose_car_new',
       'purpose_car_used', 'purpose_domestic_applience', 'purpose_education',
       'purpose_furniture_equipment', 'purpose_others', 'purpose_radio_tv',
       'purpose_repairs', 'purpose_retraining', 'saving_acc_bonds_below_100',
       'saving_acc_bonds_below_1000', 'saving_acc_bonds_below_500',
       'saving_acc_bonds_unknown_no_saving_acc',
       'present_employment_since_below_1y',
       'present_employment_since_below_4y',
       'present_employment_since_below_7y',
       'present_employment_since_unemployed',
       'personal_stat_gender_male:divorced',

In [10]:
# Label: the `target_good` indicator column created by the one-hot encoding.
y = data_encoded['target_good']

# Features: every encoded column except the label.
X = data_encoded.drop(columns=['target_good'])


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [12]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature mean and standard deviation from the training rows only...
scaler = StandardScaler().fit(X_train)

# ...then apply that same transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: a random forest with default hyperparameters and a fixed
# seed, fitted on the standardized training features.
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_train_scaled, y=y_train)


In [14]:
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out (standardized) test rows.
y_pred = clf.predict(X_test_scaled)

# Per-class precision / recall / F1 plus overall accuracy on the test split.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# One print call, newline-separated: same text as three consecutive prints.
print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')


Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

       False       0.70      0.36      0.48        91
        True       0.77      0.93      0.84       209

    accuracy                           0.76       300
   macro avg       0.74      0.65      0.66       300
weighted avg       0.75      0.76      0.73       300



In [15]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion: original features, their squares and all pairwise
# products; `include_bias=False` leaves out the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False).fit(X)

# Materialize the expanded matrix and wrap it in a DataFrame whose columns
# carry the generated feature names.
X_poly = poly.transform(X)
poly_feature_names = poly.get_feature_names_out()
X_poly_df = pd.DataFrame(X_poly, columns=poly_feature_names)


In [16]:
from sklearn.ensemble import RandomForestClassifier

# Fit a forest on the full (unscaled) feature matrix purely to inspect
# feature importances. Note that this rebinds `clf`.
clf = RandomForestClassifier(random_state=42).fit(X, y)

# Pair each feature name with its importance score.
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': importances,
    }
)

# Show the ranking, most important feature first.
ranked_importances = feature_importance_df.sort_values('Importance', ascending=False)
print(ranked_importances)


                                         Feature  Importance
1                                       loan_amt    0.111726
0                                       duration    0.096467
4                                            age    0.090544
9             checking_acc_status_no_cheking_acc    0.055647
7                    checking_acc_status_below_0    0.039590
3                        present_residence_since    0.038043
2                               installment_rate    0.036408
5                                 num_curr_loans    0.023687
13        cred_hist_risky_acc_or_curr_loan_other    0.023094
23                    saving_acc_bonds_below_100    0.022078
33              personal_stat_gender_male:single    0.020269
43                          job_skilled_official    0.020095
39                  other_installment_plans_none    0.019913
46                                 telephone_yes    0.019664
14                               purpose_car_new    0.019375
37                      

In [17]:
from sklearn.preprocessing import StandardScaler

# Standardize the complete feature matrix. The scaler's statistics here are
# computed from every row of X, not just a training split.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [18]:
# Pairwise correlations between all encoded columns.
correlation_matrix = data_encoded.corr()

# Absolute correlation of every column with the label.
target_correlation = correlation_matrix['target_good'].abs()

# Keep columns whose |correlation| with the label exceeds 0.5. The label
# correlates perfectly with itself, so `target_good` is always in the result.
highly_correlated_features = correlation_matrix.index[target_correlation > 0.5]
print(highly_correlated_features)


Index(['target_good'], dtype='object')


In [19]:
# Create new polynomial features (degree 2, no constant column).
# The expansion only depends on the number of input columns, not on the data
# values, so fitting it on all rows does not leak test information.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out())

# Split BEFORE scaling. The scaler learns means and standard deviations from
# the data it is fitted on; fitting it on all rows would let the held-out test
# rows influence the preprocessing (data leakage). Same test_size and seed as
# before, so the same rows end up in each split.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_poly_df, y, test_size=0.3, random_state=42
)

# Standardize using statistics from the training rows only, then apply the
# identical transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Keep `X_scaled` defined (all rows, scaled with the training statistics) for
# any later cell that expects this name.
X_scaled = scaler.transform(X_poly_df)

# Train the Random Forest classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.78
Classification Report:
              precision    recall  f1-score   support

       False       0.73      0.44      0.55        91
        True       0.79      0.93      0.85       209

    accuracy                           0.78       300
   macro avg       0.76      0.68      0.70       300
weighted avg       0.77      0.78      0.76       300



In [22]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are searched.
gbc = GradientBoostingClassifier(random_state=42)

# Search space: 3 x 3 x 3 = 27 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1],
    max_depth=[3, 5, 7],
)

# Exhaustive search with 5-fold cross-validation on the training split,
# using all available cores and printing progress.
grid_search = GridSearchCV(gbc, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Winning combination and its mean cross-validated score.
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Score the refitted best estimator on the held-out test split.
best_gbc = grid_search.best_estimator_
y_pred = best_gbc.predict(X_test)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# One print call, newline-separated: same text as three consecutive prints.
print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')




Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best score: 0.7385714285714287
Accuracy: 0.7533333333333333
Classification Report:
              precision    recall  f1-score   support

       False       0.68      0.35      0.46        91
        True       0.77      0.93      0.84       209

    accuracy                           0.75       300
   macro avg       0.72      0.64      0.65       300
weighted avg       0.74      0.75      0.73       300

