In [1]:
import pandas as pd

# Load the raw credit dataset from the working directory and preview the first rows.
csv_path = 'german_credit_data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,0,67,male,2,own,,little,1169,6,radio/TV
1,1,22,female,2,own,little,moderate,5951,48,radio/TV
2,2,49,male,1,own,little,,2096,12,education
3,3,45,male,2,free,little,little,7882,42,furniture/equipment
4,4,53,male,2,free,little,little,4870,24,car


In [3]:
# Report the frame's dimensions and per-column dtypes in a single print call,
# then show summary statistics for every column (numeric and non-numeric).
print(
    f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns",
    df.dtypes,
    sep="\n",
)
df.describe(include='all')

Dataset contains 1000 rows and 10 columns
Unnamed: 0           int64
Age                  int64
Sex                 object
Job                  int64
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
dtype: object


Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
count,1000.0,1000.0,1000,1000.0,1000,817,606,1000.0,1000.0,1000
unique,,,2,,3,4,3,,,8
top,,,male,,own,little,little,,,car
freq,,,690,,713,603,274,,,337
mean,499.5,35.546,,1.904,,,,3271.258,20.903,
std,288.819436,11.375469,,0.653614,,,,2822.736876,12.058814,
min,0.0,19.0,,0.0,,,,250.0,4.0,
25%,249.75,27.0,,2.0,,,,1365.5,12.0,
50%,499.5,33.0,,2.0,,,,2319.5,18.0,
75%,749.25,42.0,,2.0,,,,3972.25,24.0,


In [5]:
# Handling missing values: median for numeric columns, most frequent value otherwise.
# NOTE(review): mode imputation treats a missing 'Saving accounts' / 'Checking account'
# as the most common category; a missing value may instead mean "no account" — confirm.
missing_before = df.isnull().sum()
print(missing_before)

# Numeric columns: fill gaps with each column's median.
numeric_df = df.select_dtypes(include=['number'])
numeric_medians = numeric_df.median()
df[numeric_df.columns] = df[numeric_df.columns].fillna(numeric_medians)

# Non-numeric columns: fill gaps with the first row of mode(), i.e. one mode per column.
categorical_cols = df.select_dtypes(exclude=['number']).columns
most_frequent = df[categorical_cols].mode().iloc[0]
df[categorical_cols] = df[categorical_cols].fillna(most_frequent)

# Print the counts again to confirm that no missing values remain.
print(df.isnull().sum())


Unnamed: 0            0
Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
dtype: int64
Unnamed: 0          0
Age                 0
Sex                 0
Job                 0
Housing             0
Saving accounts     0
Checking account    0
Credit amount       0
Duration            0
Purpose             0
dtype: int64


In [None]:
# One-hot encoding
# 'Unnamed: 0' is the row index that was written into the CSV (values 0..999), not a
# property of the applicant. Drop it before encoding so it is not scaled and exported
# as a model feature. errors='ignore' keeps this cell valid when the column is absent.
# drop_first=True drops one dummy level per categorical column.
df_encoded = pd.get_dummies(
    df.drop(columns=['Unnamed: 0'], errors='ignore'),
    drop_first=True,
)
df_encoded.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the int64/float64 columns of the encoded frame with StandardScaler.
# NOTE(review): the scaler is fit on the full dataset before the train/test split, so
# test-set statistics influence the training features; consider fitting on the
# training split only. Any int64/float64 target column present here is scaled as well.
scaler = StandardScaler()
numeric_cols = df_encoded.select_dtypes(include=['float64', 'int64']).columns
scaled_values = scaler.fit_transform(df_encoded[numeric_cols])
df_encoded[numeric_cols] = scaled_values
df_encoded.head()


In [None]:
# Specify the target column (replace 'CreditRisk' with the actual column name for the target)
target_column = 'CreditRisk'

# Fail fast with an actionable message when the target is not in the encoded frame.
# This happens while the placeholder name is unchanged, or when a categorical target
# was renamed by one-hot encoding (pandas names dummies "<column>_<level>").
if target_column not in df_encoded.columns:
    raise KeyError(
        f"Target column {target_column!r} not found in df_encoded. "
        f"Available columns: {list(df_encoded.columns)}"
    )

# Features are every column except the target; the target is kept as a Series.
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]

# Display the shapes of X and y
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each resulting piece, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)


In [None]:
# Write each split to its own CSV in the working directory, without the index column.
outputs = {
    'X_train_preprocessed.csv': X_train,
    'X_test_preprocessed.csv': X_test,
    'y_train_preprocessed.csv': y_train,
    'y_test_preprocessed.csv': y_test,
}
for filename, frame in outputs.items():
    frame.to_csv(filename, index=False)

print("Preprocessed data saved successfully.")
