In [45]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [46]:
# Load the raw German credit dataset (path is relative to the notebook's working directory).
df = pd.read_csv("german_credit_data.csv")
# Drop rows that are exact duplicates across all columns.
# NOTE(review): the first column is dropped as irrelevant further down; if it is a unique row id,
# no two rows can ever match here - confirm whether duplicates should be detected without it.
df = df.drop_duplicates()
# Count missing values per column and report only the affected columns.
missing_values = df.isnull().sum()
print("Columns with missing values:")
print(missing_values[missing_values > 0])
# Categorical columns whose gaps are imputed below.
categorical_columns = ['Saving accounts', 'Checking account']
# Fill the gaps of each column with its most frequent value (mode).
for col in categorical_columns:
    if df[col].isnull().any():
        mode_value = df[col].mode()[0]  # first entry when several values tie for the mode
        # Assign the filled column back to df. Calling fillna(inplace=True) on df[col] mutates an
        # intermediate object: pandas deprecates this chained assignment and, with Copy-on-Write,
        # df itself is left unchanged.
        df[col] = df[col].fillna(mode_value)
# Re-check so that any column still containing missing values is visible in the output.
missing_vals = df.isnull().sum()
print("Columns with missing values after filling:")
print(missing_vals[missing_vals > 0])
# Flag outliers via absolute z-scores computed over the float64/int64 columns.
numeric_part = df.select_dtypes(include=['float64', 'int64'])
z_scores = np.abs(stats.zscore(numeric_part))
# Keep only the rows in which every numeric value has |z| < 3.
within_three_sigma = (z_scores < 3).all(axis=1)
df = df[within_three_sigma]
# Rows where any cell, rendered as text, contains the marker 'inconsistent_value'.
marker_hits = df.apply(lambda column: column.astype(str).str.contains('inconsistent_value'))
inco = df[marker_hits.any(axis=1)]
# Discard those rows by index label.
df = df[~df.index.isin(inco.index)]

# Print the distinct values of every object-typed column.
cat = df.select_dtypes(include=['object']).columns
for column, column_values in df[cat].items():
    unique = column_values.unique()
    print(f"unique values in column '{column}':{unique}")
# The first column is treated as irrelevant and removed.
first_column = df.columns[0]
df = df.drop(columns=first_column)
# Persist the cleaned frame without the index, then echo it.
df.to_csv("cleaned_german.csv", index=False)
print(df)

FileNotFoundError: [Errno 2] No such file or directory: 'german_credit_data.csv'

In [None]:
# Reload the cleaned dataset from "cleaned_german.csv" and preview its first rows.
df= pd.read_csv("cleaned_german.csv")
print(df.head())

In [None]:
# Print the column names, non-null counts and dtypes of the reloaded frame.
df.info()

Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics for every column; include='all' brings the non-numeric columns into the table.
df.describe(include='all')
# Sex, Housing, Saving accounts, Checking account, Purpose and Risk are categorical variables,
# so mean, std, min, max and the quartiles do not apply to them.

In [None]:
# Plot histograms of the numeric columns to visualize their distributions (10x8 inch figure).
df.hist(figsize=(10,8))
plt.show()

In [None]:
# What is the correlation between Credit amount and Duration?
dft = df[['Credit amount', 'Duration']]
corr_matrix = dft.corr()
print(corr_matrix)
# Heatmap of the correlation matrix, annotated with the coefficients.
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')

In [None]:
# Display the current DataFrame (a cell's last expression is rendered as its output).
df


In [None]:
# What is the relationship between savings and risk assessment in the credit scoring dataset,
# and why do applicants with substantial savings tend to have lower perceived risk?
# Counter-examples: applicants whose savings are substantial ('quite rich' or 'rich') yet whose
# Risk is 'high'.
# `&` binds tighter than `|`, so the two savings levels are combined first (via isin) and only
# then intersected with the risk test; written as `A | B & C`, every 'quite rich' row would be
# selected regardless of its Risk value.
# NOTE(review): confirm that 'high' is an actual label of the Risk column (the cleaning cell
# prints the unique values); if it is not, this selection is empty.
has_substantial_savings = df['Saving accounts'].isin(['quite rich', 'rich'])
is_high_risk = df['Risk'] == 'high'
high_risk_high_savings = df[has_substantial_savings & is_high_risk].head(10)
print("Applicants with substantial savings but high risk:")
print(high_risk_high_savings)

In [None]:
#Interesting case 2
# Which applicants have a high credit amount but a short duration for repayment?
# "High" means above the 75th percentile of Credit amount; "short" means below the
# 25th percentile of Duration.
credit_upper_quartile = df['Credit amount'].quantile(0.75)
duration_lower_quartile = df['Duration'].quantile(0.25)
is_high_credit = df['Credit amount'].gt(credit_upper_quartile)
is_short_duration = df['Duration'].lt(duration_lower_quartile)
high_credit_short_duration = df[is_high_credit & is_short_duration].head(10)

print("Applicants with high credit amounts but short duration for repayment:")
print(high_credit_short_duration)

In [None]:
# Which applicants have a credit amount significantly higher than the average for their job type?
# "Significantly" is taken here as more than CREDIT_MARGIN above the job-type mean.
CREDIT_MARGIN = 500
average_credit_by_job = df.groupby('Job')['Credit amount'].mean()
# Look up each applicant's job-type mean with map(): the result stays aligned with df's index
# and a Job value without a group mean yields NaN (the comparison is then False) instead of
# raising a KeyError.
job_average = df['Job'].map(average_credit_by_job)
interesting_case_3 = df[df['Credit amount'] > job_average + CREDIT_MARGIN].head(10)
print("selected interesting cases for presentation:")
print(interesting_case_3[['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Credit amount', 'Duration', 'Purpose', 'Risk']])

In [None]:
# Inspect the dtype of each column.
df.dtypes

In [None]:
#Q E
# Convert the Risk column to pandas' categorical dtype.
df["Risk"] = df["Risk"].astype('category')
# Show the dtypes again so the changed Risk dtype is visible.
df.dtypes

In [None]:
# Replace the Risk labels with their integer category codes.
# The guard keeps the cell re-runnable: once Risk holds integer codes there is no `.cat`
# accessor, so an unconditional `.cat.codes` raises AttributeError on a second run.
if not pd.api.types.is_integer_dtype(df["Risk"]):
    df["Risk"] = df["Risk"].astype('category').cat.codes
df.head(10)

In [None]:
# Reload the cleaned CSV into a separate frame used for modelling. Because this reads from disk,
# the category/integer encoding applied to df["Risk"] in the earlier cells is not present in `data`.
data= pd.read_csv("cleaned_german.csv")

In [None]:
# Separate the target column (Risk) from the predictor columns.
y = data['Risk']
X = data.drop(columns='Risk')
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Columns that are one-hot encoded by the preprocessing step.
categorical_features = [
    'Sex',
    'Housing',
    'Saving accounts',
    'Checking account',
    'Purpose',
]
# Columns that are standardized by the preprocessing step.
numeric_features = [
    'Age',
    'Job',
    'Credit amount',
    'Duration',
]

In [None]:
# Preprocessing: standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros; with the
# default ('error') predict() raises if a rare category lands only in the test split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [None]:
# Chain the preprocessing step with a logistic-regression classifier so both are fitted together.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
model = Pipeline(steps=pipeline_steps)

# Fit on the training split; the fitted pipeline is the cell's displayed value.
model.fit(X_train, y_train)

In [None]:
# Predict the Risk label for every row of the held-out test split.
y_pred=model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Store the result under its own name: assigning it to `confusion_matrix` would replace the
# imported function with an array, so any later call to confusion_matrix(...) would fail.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)


In [None]:
# Score the fitted pipeline on the held-out test split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


# Data Insights

# END