In [None]:
import pandas as pd

# Load the raw survey sample
df = pd.read_csv("../data/raw/Finscope_sample.csv")

# Split columns by dtype. "number" selects every numeric dtype rather than
# only int64/float64, so narrower or unsigned numeric columns are not
# silently left out of the numeric list.
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

# ID-like columns: numeric columns whose count of distinct non-null values
# equals the row count. The n_rows guard keeps an empty frame (0 == 0) from
# flagging every numeric column as an identifier.
n_rows = len(df)
id_like_cols = [col for col in num_cols
                if n_rows > 0 and df[col].nunique() == n_rows]
num_cols = [col for col in num_cols if col not in id_like_cols]

# Only the first ten names of each group are printed to keep the output short
print(f"📊 Numerical columns ({len(num_cols)}): {num_cols[:10]} ...")
print(f"🔠 Categorical columns ({len(cat_cols)}): {cat_cols[:10]} ...")
print(f"🆔 Identifier-like columns: {id_like_cols}")

# Missing values summary: per-column null counts, largest first,
# restricted to columns that actually contain nulls
missing_summary = df.isnull().sum().sort_values(ascending=False)
print("\n❓ Missing Values Summary:")
print(missing_summary[missing_summary > 0])


📊 Numerical columns (42): ['HH_WEIGHT16', 'BENCHWGT_PP', 'EACode', 'SP_code', 'VisitPnt', 'MainHouseHold_1_Age', 'Number_in_HH', 'Adults_in_HH', 'C8_1', 'C8_2'] ...
🔠 Categorical columns (3263): ['SP_name', 'Province', 'EA_GTI_GType3', 'EA_GTI_GType6', 'Area_Type_MDB', 'AC3_AreaType_Kantar', 'EA_Sub', 'MainHouseHold_2_Age', 'MainHouseHold_3_Age', 'MainHouseHold_4_Age'] ...
🆔 Identifier-like columns: ['ID', 'ID_working']

❓ Missing Values Summary:
L2_1    86
L2_3    84
L2_7    81
L2_2    71
E7      39
dtype: int64


In [10]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import os

# Paths
raw_path = "../data/raw/Finscope_sample.csv"
processed_path = "../data/processed/processed_Finscope_sample.csv"

# Load dataset
df = pd.read_csv(raw_path)

# Strip whitespace from column names so pattern and target matching is exact
df.columns = df.columns.str.strip()

# Detect target columns (example targets). This is resolved BEFORE feature
# detection: "Credit Score" matches the "Score" pattern and "Loan Approval"
# matches both "Loan" and "Approval", so without an explicit exclusion a
# present target would be scaled/encoded as a feature and then appended a
# second time as the target (label leakage, and duplicate column names for
# a numeric target).
target_cols = ["Credit Score", "Loan Approval"]
existing_target_cols = [col for col in target_cols if col in df.columns]

# Substring patterns used to pick feature columns by name
numeric_patterns = ["Age", "Income", "Loan", "Previous Loan History", "Score", "Amount", "Time"]
categorical_patterns = ["Gender", "Race", "Province", "Marital", "Employment", "Education", "Approval"]

# Numeric features: numeric dtype, name contains a numeric pattern, not a target
numeric_cols = [col for col in df.select_dtypes(include=["number"]).columns
                if col not in existing_target_cols
                and any(pat in col for pat in numeric_patterns)]

# Categorical features: object/category dtype, name contains a categorical
# pattern, not a target
categorical_cols = [col for col in df.select_dtypes(include=["object", "category"]).columns
                    if col not in existing_target_cols
                    and any(pat in col for pat in categorical_patterns)]

# Fill missing values in the feature columns only (median for numeric,
# the literal "Unknown" for categorical). Target columns are left as loaded.
if numeric_cols:
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
if categorical_cols:
    df[categorical_cols] = df[categorical_cols].fillna("Unknown")

# One-hot encode categorical columns into a dense frame
if categorical_cols:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoded_cat = encoder.fit_transform(df[categorical_cols])
    encoded_cat_df = pd.DataFrame(encoded_cat, columns=encoder.get_feature_names_out(categorical_cols))
else:
    encoded_cat_df = pd.DataFrame()

# Standardize numeric columns
if numeric_cols:
    scaler = StandardScaler()
    scaled_num = scaler.fit_transform(df[numeric_cols])
    scaled_num_df = pd.DataFrame(scaled_num, columns=numeric_cols)
else:
    scaled_num_df = pd.DataFrame()

# Combine processed features and targets. The feature frames carry a fresh
# default index, so the target slice is re-indexed the same way before the
# column-wise concat.
processed_df = pd.concat([scaled_num_df, encoded_cat_df, df[existing_target_cols].reset_index(drop=True)], axis=1)

# Ensure processed folder exists
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# Save preprocessed CSV
processed_df.to_csv(processed_path, index=False)
print(f"✅ Preprocessing complete. Saved to {processed_path}")
print(f"Numeric columns included: {numeric_cols}")
print(f"Categorical columns included: {categorical_cols}")
print(f"Target columns included: {existing_target_cols}")


✅ Preprocessing complete. Saved to ../data/processed/processed_Finscope_sample.csv
Numeric columns included: ['MainHouseHold_1_Age', 'Resp_Age', 'LSM_2014_TotalScore']
Categorical columns included: ['Province', 'MainHouseHold_1_Gender', 'MainHouseHold_2_Gender', 'MainHouseHold_3_Gender', 'MainHouseHold_4_Gender', 'MainHouseHold_5_Gender', 'MainHouseHold_6_Gender', 'MainHouseHold_7_Gender', 'MainHouseHold_8_Gender', 'MainHouseHold_9_Gender', 'MainHouseHold_10_Gender', 'MainHouseHold_1_Race', 'MainHouseHold_2_Race', 'MainHouseHold_3_Race', 'MainHouseHold_4_Race', 'MainHouseHold_5_Race', 'MainHouseHold_6_Race', 'MainHouseHold_7_Race', 'MainHouseHold_8_Race', 'MainHouseHold_9_Race', 'MainHouseHold_10_Race', 'Resp_Gender', 'Resp_Race']
Target columns included: []
