In [None]:
# # Install pandas, seaborn, and scikit-learn if you don't already have them
# %pip install pandas seaborn scikit-learn

In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold

# Resolve <project root>/loan_approval_prediction/data/ from the current working
# directory. os.path.join supplies the platform's path separator (the previous
# hard-coded "\\" only produced a valid path on Windows); the trailing ""
# keeps the final separator so file names can still be appended with `+`.
data_folder = os.path.join(
    os.getcwd().split("loan_approval_prediction")[0],
    "loan_approval_prediction",
    "data",
    "",
)

In [None]:
# Load the raw credit-risk CSV from the data folder and preview its first rows.
credit_risk_path = data_folder + 'credit_risk_dataset.csv'
credit_risk_df = pd.read_csv(credit_risk_path)
credit_risk_df.head()

In [4]:
# Work on an independent (deep) copy so the raw frame stays untouched.
df = credit_risk_df.copy(deep=True)

### Question: I need to do some data exploration in Python. What are some things I need to check?

To start data exploration in Python, there are a few essential steps that will give you a strong understanding of your dataset. Here's a general checklist:

1. Basic Information

- Shape of the Data: Use .shape to check the number of rows and columns.
- Data Types: Use .dtypes to inspect the types of each column.
- Initial View: Display the first few rows using .head() and last few rows with .tail() to get an overview.

2. Null and Missing Values
- Check for Nulls: Use .isnull().sum() to see the count of missing values in each column.
- Fill or Drop: Decide whether to fill or drop null values, depending on the analysis context.

3. Statistical Summaries
- Summary Statistics: Use .describe() for a quick look at mean, median, standard deviation, etc., for numerical columns.
- Categorical Summaries: Use .value_counts() on categorical columns to see distributions.

4. Data Distribution
- Numerical Distribution: Use histograms (sns.histplot() or .plot.hist()) to visualize the spread of numerical data.
- Box Plots: Use sns.boxplot() to identify outliers and the spread of numerical values.
- Categorical Distributions: Use bar charts for categorical variables to see distribution (sns.countplot()).

5. Correlation Analysis
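- Correlation Matrix: Use .corr(numeric_only=True) to see correlation coefficients between numerical variables (in pandas 2.0+ a plain .corr() raises an error when non-numeric columns are present), and visualize with a heatmap (sns.heatmap()).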
- Pair Plots: Use sns.pairplot() for visualizing pairwise relationships in the data.

6. Identifying Duplicates
- Duplicate Rows: Check for duplicate rows with .duplicated().sum().
- Drop Duplicates: Use .drop_duplicates() if necessary.

7. Data Types and Conversion
- Data Type Inspection: Ensure data types are correct for each column (e.g., datetime columns should be in datetime format).
- Convert Types: Use pd.to_datetime() for dates or .astype() for numerical or categorical conversions.

8. Outliers Detection
- Box Plots for Outliers: Revisit box plots to locate outliers in numerical data.
- Z-Scores or IQR: Use Z-scores or the Interquartile Range (IQR) method to identify outliers programmatically.

9. Feature Engineering Ideas
- Create New Features: Look for opportunities to engineer new features from existing data (e.g., creating age from a birth date).
- Label Encoding: Use LabelEncoder or pd.get_dummies() for categorical variables if planning for machine learning.

10. Visualizations
- Distribution Visualizations: For better understanding, use scatter plots, pair plots, or bar charts (sns.scatterplot(), sns.barplot()).
- Time Series Analysis: If you have time-series data, use line plots (sns.lineplot()) for trends over time.


These steps will give you a broad understanding of your data and reveal areas that may need cleaning, transforming, or further analysis. Let me know if you’d like specific code snippets for any of these steps.

In [None]:
# Importing necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your dataset
# df = pd.read_csv("your_data.csv")

# 1. Basic Information: (rows, columns) of the frame, then each column's dtype
n_rows_cols = df.shape
column_dtypes = df.dtypes
print("Shape of the data:", n_rows_cols)
print("\nData Types:\n", column_dtypes)

In [None]:
# Print a heading, then show the first five rows as the cell's rich output.
print("\nFirst few rows:\n")
df.head(n=5)

In [None]:
# Print a heading, then show the last five rows as the cell's rich output.
print("\nLast few rows:\n")
df.tail(n=5)

In [None]:
# 2. Null and Missing Values: number of missing entries in each column
missing_per_column = df.isnull().sum()
print("\nMissing Values:\n", missing_per_column)

# Optional follow-ups (left disabled):
# df = df.dropna()  # drop every row that contains at least one null
# df['column_name'] = df['column_name'].fillna(df['column_name'].mean())  # fill one column's nulls with its mean


In [None]:
def _null_counts(frame, label):
    """Return the per-column null counts of `frame` as a two-column table.

    The column names of `frame` end up in an 'index' column (produced by
    reset_index) and the counts in a column named `label`.
    """
    return frame.isnull().sum().reset_index().rename(columns={0: label})


# Null counts overall and split by loan_status; the merges join on the shared
# 'index' column. The final expression is the cell's displayed output.
# NOTE(review): the 'Approved'/'Rejected' labels assume loan_status == 1 means
# approved — confirm against the dataset's documentation.
(
    _null_counts(df, 'Total')
    .merge(_null_counts(df[df['loan_status'] == 1], 'Approved'))
    .merge(_null_counts(df[df['loan_status'] == 0], 'Rejected'))
)

In [None]:
# 3. Statistical Summaries: heading, then describe() shown as the cell's output
print("\nSummary Statistics:\n")
summary_stats = df.describe()
summary_stats

In [None]:
# Per-category counts for every text column: overall, loan_status == 1
# ("approved") and loan_status == 0 ("rejected"), plus the approved share.
for col in df.select_dtypes(include='object').columns:
    # rename_axis + reset_index(name=...) always yields the columns
    # [col, <label>], independent of how the installed pandas version names
    # the output of value_counts().
    total = df[col].value_counts().rename_axis(col).reset_index(name='total')
    approved = (
        df.loc[df['loan_status'] == 1, col]
        .value_counts()
        .rename_axis(col)
        .reset_index(name='approved')
    )
    rejected = (
        df.loc[df['loan_status'] == 0, col]
        .value_counts()
        .rename_axis(col)
        .reset_index(name='rejected')
    )

    # Left joins keep categories that have no approved or no rejected rows
    # (the default inner join silently dropped them); their counts become 0.
    summary = (
        total
        .merge(approved, on=col, how='left')
        .merge(rejected, on=col, how='left')
    )
    summary[['approved', 'rejected']] = (
        summary[['approved', 'rejected']].fillna(0).astype('int64')
    )

    # Despite the column name, this is a 0-1 fraction rounded to two decimals.
    summary['approved_percentage'] = round(summary['approved'] / summary['total'], 2)
    print(f"\nValue Counts for {col}:\n", summary)


In [None]:
# 4. Data Distribution
# One histogram panel per float64/int64 column
numeric_view = df.select_dtypes(include=['float64', 'int64'])
numeric_view.hist(bins=15, figsize=(15, 10))
plt.suptitle("Histograms of Numerical Features")
plt.show()

# All numeric columns as box plots on a single shared axis
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=numeric_view, ax=ax)
ax.set_title("Boxplot for Numerical Features")
plt.show()

# One horizontal count plot per text column, most frequent category first
for col in df.select_dtypes(include='object').columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    category_order = df[col].value_counts().index
    sns.countplot(y=col, data=df, order=category_order, ax=ax)
    ax.set_title(f"Count plot for {col}")
    plt.show()

In [None]:
# 6. Identifying Duplicates: count rows that exactly repeat an earlier row
is_repeat = df.duplicated(keep='first')
duplicate_rows = is_repeat.sum()
print(f"\nNumber of duplicate rows: {duplicate_rows}")

# Optional (left disabled): remove the repeated rows
# df = df.drop_duplicates()

In [None]:
# Every member of each group of identical rows (keep=False also flags the
# first occurrence), sorted on the listed columns so that identical rows end
# up next to each other.
sort_keys = [
    'person_age', 'person_income', 'person_home_ownership',
    'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
    'loan_int_rate', 'loan_status', 'loan_percent_income',
    'cb_person_default_on_file', 'cb_person_cred_hist_length',
]
all_duplicated_rows = df[df.duplicated(keep=False)]
all_duplicated_rows.sort_values(by=sort_keys)

In [21]:
# Flag every row that belongs to a group of identical rows (keep=False marks
# the first occurrence as well as the repeats) in a boolean helper column.
in_duplicate_group = df.duplicated(keep=False)
df["duplicated"] = in_duplicate_group

# Disabled helper for inspecting the whole flagged frame, sorted:
# pd.set_option('display.max_rows', None)
# df.sort_values(['person_age', 'person_income', 'person_home_ownership',
#        'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
#        'loan_int_rate', 'loan_status', 'loan_percent_income',
#        'cb_person_default_on_file', 'cb_person_cred_hist_length'])

In [62]:
# # 8. Outliers Detection
# # Boxplot by feature to detect outliers
# for col in df.select_dtypes(include=['float64', 'int64']).columns:
#     plt.figure(figsize=(10, 5))
#     sns.boxplot(x=col, data=df)
#     plt.title(f"Boxplot of {col}")
#     plt.show()


### Question: This data will be used to build a classification model. Knowing this, is there anything additional that should be looked at during data exploration?

Since you’ll be using the data for classification, there are a few additional steps to improve your data exploration and ensure your dataset is optimized for building a robust model. Here’s what you should check:

1. Class Balance
- Check the balance of the target class to identify any class imbalance, as it can affect model performance.
- Use value_counts() on your target column or plot it with a bar chart to visualize the class distribution.

2. Feature Scaling
- For algorithms sensitive to feature scaling (e.g., logistic regression, SVM, KNN), check the range of numerical features to ensure consistent scaling.
- You can use .describe() to check the range and consider scaling (e.g., MinMaxScaler or StandardScaler) if ranges vary widely.

3. Feature Correlation with Target
- Measure the correlation of each feature with the target label to understand the potential predictive power of each feature.
- For numerical features, you can use .corrwith(target_column), and for categorical, evaluate distributions within each class.

4. Redundant and Low-Variance Features
- Identify highly correlated features and consider dropping one to avoid multicollinearity, which can hinder some models.
- Check for low-variance features, as they may not contribute to classification (use .var() or VarianceThreshold).

5. Categorical Feature Encoding
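- Ensure all categorical variables are properly encoded for model training. Use pd.get_dummies() for one-hot encoding of nominal features, or scikit-learn's OrdinalEncoder with an explicit category order for ordinal features (LabelEncoder is intended for encoding the target column).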
- Avoid high cardinality categorical features, as they can introduce sparsity in the dataset.

6. Handling Outliers Carefully
- Outliers may affect models differently. For tree-based models, they may not be an issue, but for linear models, they can skew results.
- Carefully decide on either treating or retaining outliers based on their impact on the model.

7. Dimensionality Reduction (Optional)
- Consider performing PCA or another dimensionality reduction method if you have many features, especially after one-hot encoding.


In [None]:
# Class balance of the target 'loan_status': counts per class, then a bar chart
class_counts = df['loan_status'].value_counts()
print("\nClass Distribution for 'loan_status':\n", class_counts)
ax = sns.countplot(x='loan_status', data=df)
ax.set_title("Class Balance for loan_status")
plt.show()

In [None]:
# Correlation analysis on the float64/int64 columns, target column excluded
# (errors='ignore' tolerates the target being absent from the selection)
numeric_frame = df.select_dtypes(include=['float64', 'int64'])
numerical_features = numeric_frame.drop(columns=['loan_status'], errors='ignore')

# Correlation of every feature with the target
correlations = numerical_features.corrwith(df['loan_status'])
print("\nCorrelation with loan_status:\n", correlations)

# Bar chart of the feature/target correlations
correlations.plot.bar(figsize=(10, 6), title="Feature Correlation with loan_status")
plt.show()

# Feature-to-feature correlation heatmap, used to spot redundant features
fig, ax = plt.subplots(figsize=(12, 10))
feature_corr = numerical_features.corr()
sns.heatmap(feature_corr, annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()


In [None]:
# 7. Low Variance Feature Detection
# Only predictors are screened: the target 'loan_status' is excluded, in line
# with the correlation and scaling steps (errors='ignore' tolerates its
# absence). Previously the target was passed through the filter as well.
VARIANCE_THRESHOLD = 0.1  # features with a variance below this are filtered out
numeric_predictors = (
    df.select_dtypes(include=['float64', 'int64'])
    .drop(columns=['loan_status'], errors='ignore')
)

# NOTE(review): variance depends on each column's scale, so the result differs
# depending on whether this cell runs before or after feature scaling.
selector = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
selector.fit(numeric_predictors)

# get_support() returns a boolean mask aligned with the fitted columns
retained_columns = numeric_predictors.columns[selector.get_support()]

print("Retained Columns after Low Variance Filtering:\n", retained_columns)


In [None]:
# Names of the columns currently stored as float64 / int64
numeric_frame = df.select_dtypes(include=['float64', 'int64'])
numeric_frame.columns

In [52]:
# 8. Encoding Categorical Features
# Replace each text column with integer codes. The fitted encoders are kept in
# `label_encoders` (keyed by column name) so the codes can be mapped back with
# inverse_transform or applied to new data; previously they were discarded.
# NOTE(review): LabelEncoder numbers the labels in sorted order, so the
# integers imply no meaningful ranking for nominal columns.
label_encoders = {}
for col in df.select_dtypes(include='object').columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder
df.head()


In [60]:
# 9. Feature Scaling
# Standardize every float64/int64 column except the target 'loan_status'.
# NOTE(review): columns that were label-encoded to integers earlier are
# presumably int64 and would be scaled here too — confirm that is intended.
numeric_frame = df.select_dtypes(include=['float64', 'int64'])
numerical_cols = numeric_frame.columns.drop('loan_status')

# Fit the scaler on the selected columns, then transform those same columns
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
scaled_features = scaler.transform(df[numerical_cols])

# Write the standardized values back over the original columns
df[numerical_cols] = scaled_features

In [22]:
# # Optional: Dimensionality Reduction with PCA
# from sklearn.decomposition import PCA
# pca = PCA(n_components=0.95)  # Retain 95% variance
# df_pca = pca.fit_transform(df.drop(columns=['loan_status']))
# print(f"PCA reduced data shape: {df_pca.shape}")
