In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
# NOTE(review): header=1 makes pandas take the column names from row index 1
# (the second row) of the file, while the code below refers to 'Y' and
# 'X1'-style names -- confirm against the CSV that this is the intended row.
df = pd.read_csv('../data/credit_data.csv', header=1)

# Remove any unnamed columns (labels matching the '^Unnamed' pattern)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Rename target column
df = df.rename(columns={'Y': 'default'})

# Convert every column, target included, to numeric; values that cannot be
# parsed become NaN instead of raising.
df = df.apply(pd.to_numeric, errors='coerce')

# Drop rows with missing values *before* the integer cast: casting a column
# that still contains NaN to int raises, so the order matters here.
# astype with a mapping returns a new frame with only 'default' converted.
df = df.dropna().astype({'default': int})

# Display first few rows.
# A bare `df.head()` in the middle of a cell/script is evaluated and thrown
# away (only a cell's final expression is echoed), so print it explicitly.
print(df.head())

# Basic information about the dataset (info() writes its report itself)
df.info()

# Statistical summary -- printed for the same reason as head() above
print(df.describe())

# Class balance of the target: one bar per distinct 'default' value.
# countplot returns the Axes it drew on, so the title is set on it directly.
sns.countplot(data=df, x='default').set_title('Distribution of Default Variable')
plt.show()

# Pairwise correlations between all columns, drawn as a heatmap
# on a dedicated 12x10 figure.
corr_matrix = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=False,
).set_title('Correlation Matrix')
plt.show()

# One histogram per column, laid out on a single 20x20 figure
df.hist(figsize=(20, 20))
plt.show()

# Box plots to detect outliers: each listed column gets its own 8x4 figure.
numerical_features = [
    'X1',
    'X5',
    'X12',
    'X18',
]  # Replace with actual numerical columns
for feature in numerical_features:
    plt.figure(figsize=(8, 4))
    # boxplot draws on the figure just created and returns its Axes
    sns.boxplot(x=df[feature]).set_title(f'Boxplot of {feature}')
    plt.show()
