## Feature Engineering Process for the Credit Scoring Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Read the raw transaction records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [None]:
# Parse the TransactionStartTime column into pandas datetimes so the `.dt`
# accessor can be used on it further down.
raw_start_times = df['TransactionStartTime']
df['TransactionStartTime'] = pd.to_datetime(raw_start_times)

In [None]:
# 1. Create Aggregate Features
# Summarise each customer's transaction history (sum, mean, count and standard
# deviation of `Amount`) and attach that summary to every transaction row, so
# the aggregates actually end up in the engineered dataset.
print("1. Creating Aggregate Features")
customer_agg = (
    df.groupby('CustomerId')['Amount']
    .agg(
        TotalTransactionAmount='sum',
        AverageTransactionAmount='mean',
        TransactionCount='count',
        # Sample standard deviation: NaN for a customer with one transaction.
        TransactionAmountStd='std',
    )
    .reset_index()
)
print(customer_agg.head())

# Broadcast the per-customer aggregates back onto the transaction rows.
# Aggregate columns left over from an earlier run of this cell are dropped
# first so that re-running it does not create `_x` / `_y` duplicates.
aggregate_feature_columns = [col for col in customer_agg.columns if col != 'CustomerId']
df = df.drop(columns=aggregate_feature_columns, errors='ignore').merge(
    customer_agg,
    on='CustomerId',
    how='left',               # keep every transaction row, in its original order
    validate='many_to_one',   # customer_agg holds one row per CustomerId
)

In [None]:
# 2. Extract Features
# Derive calendar components (hour, day of month, month, year) from the
# transaction timestamp and show them next to the source column.
print("\n2. Extracting Time-based Features")
start_time_parts = df['TransactionStartTime'].dt
time_feature_sources = (
    ('TransactionHour', 'hour'),
    ('TransactionDay', 'day'),
    ('TransactionMonth', 'month'),
    ('TransactionYear', 'year'),
)
for feature_name, component in time_feature_sources:
    df[feature_name] = getattr(start_time_parts, component)
preview_columns = ['TransactionStartTime'] + [name for name, _ in time_feature_sources]
print(df[preview_columns].head())

In [None]:
# 3. Encode Categorical Variables
print("\n3. Encoding Categorical Variables")
# Label Encoding
# Each column gets its own fitted LabelEncoder, kept in `label_encoders`.
# Refitting a single shared encoder would discard the category -> integer
# mapping of every column except the last one, making it impossible to apply
# the same encoding to new data or to invert it later.
label_encode_columns = ['ProductCategory', 'ChannelId', 'ProviderId', 'ProductId']
label_encoders = {}
for col in label_encode_columns:
    le = LabelEncoder()
    df[f'{col}_encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# One-Hot Encoding
# Expand the listed categorical columns into one indicator column per category
# and append those indicator columns to `df`.
onehot_columns = ['CurrencyCode', 'CountryCode']
# Keep the encoder's default sparse output and densify it explicitly. The
# `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# later removed, so passing either spelling ties this cell to a version range.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_encoded = onehot_encoder.fit_transform(df[onehot_columns]).toarray()
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` and fall back only on releases that predate it.
if hasattr(onehot_encoder, 'get_feature_names_out'):
    onehot_columns_names = onehot_encoder.get_feature_names_out(onehot_columns)
else:
    onehot_columns_names = onehot_encoder.get_feature_names(onehot_columns)
# Reuse df's index so the indicator rows line up with the rows they describe.
df_onehot = pd.DataFrame(onehot_encoded, columns=onehot_columns_names, index=df.index)
df = pd.concat([df, df_onehot], axis=1)

In [None]:
# List every column now present, including the newly encoded ones.
print("Columns after encoding:", df.columns, sep="\n")

In [None]:
# 4. Handle Missing Values
# Report the number of missing entries per column before any imputation.
print("\n4. Handling Missing Values")
null_counts_before = df.isna().sum()
print("Missing values before imputation:", null_counts_before, sep="\n")

In [None]:
# Imputation
# Replace missing entries in every numeric column with that column's mean.
imputer = SimpleImputer(strategy='mean')
numeric_columns = df.select_dtypes(include='number').columns
numeric_block = df[numeric_columns]
df[numeric_columns] = imputer.fit(numeric_block).transform(numeric_block)

In [None]:
# Re-count missing entries per column now that imputation has run.
null_counts_after = df.isna().sum()
print("\nMissing values after imputation:", null_counts_after, sep="\n")

In [None]:
# 5. Normalize/Standardize Numerical Features
# Rescale every numeric column with StandardScaler.
# NOTE(review): the selection is dtype-based, so it also covers the
# `*_encoded` and one-hot indicator columns created earlier, plus any numeric
# identifier or label columns in the data. Confirm that this is intended.
print("\n5. Normalizing/Standardizing Numerical Features")
numeric_columns = df.select_dtypes(include='number').columns
scaler = StandardScaler()
numeric_block = df[numeric_columns]
df[numeric_columns] = scaler.fit(numeric_block).transform(numeric_block)

In [None]:
# Show the first few rows of the rescaled numeric columns.
standardized_preview = df.loc[:, numeric_columns].head()
print("Sample of standardized numerical features:")
print(standardized_preview)

In [None]:
# Write the engineered dataset to disk, leaving out the DataFrame index.
output_path = "processed_data.csv"
df.to_csv(output_path, index=False)
print("\nProcessed data saved to 'processed_data.csv'")

**Key Points:**

- Aggregate features provide a summary of customer behavior, which can be crucial for credit scoring.
- Time-based features can help identify patterns in transaction timing.
- Encoding categorical variables allows them to be used in machine learning models.
- Handling missing values ensures that all data points can be used in the model.
- Standardizing numerical features puts all variables on the same scale, which is important for many machine learning algorithms.