Project: German Credit Risk Classification

In [34]:
# Import required libraries
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [35]:
# Load dataset (German Credit Data from UCI, repository id 144)
statlog_data = fetch_ucirepo(id=144)

# Work on explicit copies: later cells rename columns and re-label the target,
# and doing that on the frames held inside `statlog_data` would silently
# mutate the fetched object (and can raise SettingWithCopyWarning if those
# frames are slices of a larger frame).
X = statlog_data.data.features.copy()
y = statlog_data.data.targets.copy()

In [36]:
# Give the 20 feature columns readable names, in the order used by the UCI documentation
feature_names = [
    "Status_checking_account",
    "Duration_month",
    "Credit_history",
    "Purpose",
    "Credit_amount",
    "Savings_account",
    "Employment_since",
    "Installment_rate",
    "Personal_status_sex",
    "Other_debtors_guarantors",
    "Residence_since",
    "Property",
    "Age_years",
    "Other_installment_plans",
    "Housing",
    "Existing_credits",
    "Job",
    "Num_liable",
    "Telephone",
    "Foreign_worker",
]
X.columns = feature_names

# Recode the target into readable labels: 1 -> "Good" credit, 2 -> "Bad" credit
risk_labels = {1: "Good", 2: "Bad"}
y = y.replace(risk_labels)

# The target may be a one-column DataFrame or a Series; name it "Credit_risk" either way
y = (
    y.rename(columns={y.columns[0]: "Credit_risk"})
    if isinstance(y, pd.DataFrame)
    else y.rename("Credit_risk")
)

# Single frame holding the features followed by the target column
df = pd.concat([X, y], axis="columns")

In [37]:
# Descriptive analysis: first rows, shape, dtypes, class balance and numeric summary
print("Dataset overview")
print(df.head(), "\n")

# Each summary is printed as "<heading> <value>" followed by a blank line
summaries = [
    ("Shape:", df.shape),
    ("Data types:\n", df.dtypes),
    ("Target distribution:\n", df["Credit_risk"].value_counts()),
    ("Numeric summary:\n", df.describe()),
]
for heading, summary in summaries:
    print(heading, summary, "\n")

Dataset overview
  Status_checking_account  Duration_month Credit_history Purpose  \
0                     A11               6            A34     A43   
1                     A12              48            A32     A43   
2                     A14              12            A34     A46   
3                     A11              42            A32     A42   
4                     A11              24            A33     A40   

   Credit_amount Savings_account Employment_since  Installment_rate  \
0           1169             A65              A75                 4   
1           5951             A61              A73                 2   
2           2096             A61              A74                 2   
3           7882             A61              A74                 2   
4           4870             A61              A73                 3   

  Personal_status_sex Other_debtors_guarantors  ...  Property Age_years  \
0                 A93                     A101  ...      A121        67 

In [38]:
# Preprocessing
# Separate features and target. `drop` returns a new frame, so the encoding
# below changes `X` only and leaves `df` untouched.
X = df.drop("Credit_risk", axis=1)
y = df["Credit_risk"]

# Encode categorical columns with one LabelEncoder per column. Each fitted
# encoder is kept in `label_encoders` so the integer codes can be mapped back
# to the original category codes with
# `label_encoders[col].inverse_transform(...)`.
# NOTE: integer codes impose an arbitrary order on nominal categories; this is
# kept unchanged for the baseline model.
label_encoders = {}
label_encoder = LabelEncoder()  # rebound per column below; name kept for later cells
for col in X.select_dtypes(include=["object"]).columns:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

# Encode target labels (Good = 1, Bad = 0)
y = y.map({"Good": 1, "Bad": 0})
# `map` yields NaN for any label missing from the dict; fail here rather than
# later inside model fitting.
if y.isna().any():
    raise ValueError("Credit_risk contains labels other than 'Good' and 'Bad'")

# Split BEFORE scaling so the held-out rows do not influence the scaler.
# Same test_size / random_state as before, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features: learn mean/std from the training rows only, then apply
# that same transformation to the test rows (avoids train/test leakage).
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Full feature matrix scaled with the train-fitted scaler, kept under its
# original name for any later cell that uses it.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

In [39]:
# Problem formalization
# Binary classification problem:
# given an applicant's 20 attributes, predict whether their credit risk is
# good (encoded as 1) or bad (encoded as 0).

In [40]:
# Baseline model: logistic regression fitted on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Predicted class labels for the held-out test split
y_pred = model.predict(X_test)

In [41]:
# Evaluation: compare test-set predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Baseline Model Evaluation")
print("Accuracy:", accuracy)
print("\nClassification report:\n", report)

Baseline Model Evaluation
Accuracy: 0.78

Classification report:
               precision    recall  f1-score   support

           0       0.67      0.49      0.57        59
           1       0.81      0.90      0.85       141

    accuracy                           0.78       200
   macro avg       0.74      0.70      0.71       200
weighted avg       0.77      0.78      0.77       200



In [42]:
# Save the combined dataset as a semicolon-separated CSV (UTF-8 with BOM, no index column).
# NOTE(review): `df` is the frame with renamed columns and "Good"/"Bad" labels;
# the label-encoded and scaled features live in `X` / `X_scaled` and are not
# written here. Confirm this is the intended content of the "preprocessed" file.
csv_path = "german_credit_preprocessed.csv"
df.to_csv(csv_path, sep=";", index=False, encoding="utf-8-sig")
print(f"Preprocessed dataset saved as '{csv_path}'")

Preprocessed dataset saved as 'german_credit_preprocessed.csv'
