## feature engineering process for your credit scoring model

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [3]:
# Load the data
df = pd.read_csv("E:/KAIM 2/KAIM 2 Week 6/Week-6/data/cleaned_credit_data.csv")

In [4]:
# Convert TransactionStartTime to datetime
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

In [5]:
# 1. Create Aggregate Features
print("1. Creating Aggregate Features")
customer_agg = df.groupby('CustomerId').agg({
    'Amount': ['sum', 'mean', 'count', 'std']
}).reset_index()
customer_agg.columns = ['CustomerId', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionCount', 'TransactionAmountStd']
print(customer_agg.head())

1. Creating Aggregate Features
        CustomerId  TotalTransactionAmount  AverageTransactionAmount  \
0     CustomerId_1                -10000.0             -10000.000000   
1    CustomerId_10                -10000.0             -10000.000000   
2  CustomerId_1001                 20000.0               4000.000000   
3  CustomerId_1002                  4225.0                384.090909   
4  CustomerId_1003                 20000.0               3333.333333   

   TransactionCount  TransactionAmountStd  
0                 1                   NaN  
1                 1                   NaN  
2                 5           6558.963333  
3                11            560.498966  
4                 6           6030.478146  


In [6]:
# 2. Extract Features
print("\n2. Extracting Time-based Features")
df['TransactionHour'] = df['TransactionStartTime'].dt.hour
df['TransactionDay'] = df['TransactionStartTime'].dt.day
df['TransactionMonth'] = df['TransactionStartTime'].dt.month
df['TransactionYear'] = df['TransactionStartTime'].dt.year
print(df[['TransactionStartTime', 'TransactionHour', 'TransactionDay', 'TransactionMonth', 'TransactionYear']].head())


2. Extracting Time-based Features
       TransactionStartTime  TransactionHour  TransactionDay  \
0 2018-11-15 02:18:49+00:00                2              15   
1 2018-11-15 02:19:08+00:00                2              15   
2 2018-11-15 02:44:21+00:00                2              15   
3 2018-11-15 03:32:55+00:00                3              15   
4 2018-11-15 03:34:21+00:00                3              15   

   TransactionMonth  TransactionYear  
0                11             2018  
1                11             2018  
2                11             2018  
3                11             2018  
4                11             2018  


In [12]:
import sklearn
print(sklearn.__version__)

1.5.1


In [16]:
# 3. Encode Categorical Variables
#print("\n3. Encoding Categorical Variables")
# Label Encoding
le = LabelEncoder()
label_encode_columns = ['ProductCategory', 'ChannelId', 'ProviderId', 'ProductId']
for col in label_encode_columns:
    df[f'{col}_encoded'] = le.fit_transform(df[col])

In [18]:
pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -

  You can safely remove it manually.


In [20]:
# One-Hot Encoding
onehot_columns = ['CurrencyCode', 'CountryCode']
#onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_encoded = onehot_encoder.fit_transform(df[onehot_columns])
#onehot_columns_names = onehot_encoder.get_feature_names(onehot_columns)
onehot_columns_names = onehot_encoder.get_feature_names_out(onehot_columns)
df_onehot = pd.DataFrame(onehot_encoded, columns=onehot_columns_names, index=df.index)
df = pd.concat([df, df_onehot], axis=1)

In [28]:
print("\n3. Encoding Categorical Variables")
print("Columns after encoding:")
print(df.columns)


3. Encoding Categorical Variables
Columns after encoding:
Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult',
       'TransactionHour', 'TransactionDay', 'TransactionMonth',
       'TransactionYear', 'ProductCategory_encoded', 'ChannelId_encoded',
       'ProviderId_encoded', 'ProductId_encoded', 'CurrencyCode_UGX',
       'CountryCode_256'],
      dtype='object')


In [22]:
# 4. Handle Missing Values
print("\n4. Handling Missing Values")
print("Missing values before imputation:")
print(df.isnull().sum())


4. Handling Missing Values
Missing values before imputation:
TransactionId              0
BatchId                    0
AccountId                  0
SubscriptionId             0
CustomerId                 0
CurrencyCode               0
CountryCode                0
ProviderId                 0
ProductId                  0
ProductCategory            0
ChannelId                  0
Amount                     0
Value                      0
TransactionStartTime       0
PricingStrategy            0
FraudResult                0
TransactionHour            0
TransactionDay             0
TransactionMonth           0
TransactionYear            0
ProductCategory_encoded    0
ChannelId_encoded          0
ProviderId_encoded         0
ProductId_encoded          0
CurrencyCode_UGX           0
CountryCode_256            0
dtype: int64


In [23]:
# Imputation
numeric_columns = df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

In [24]:
print("\nMissing values after imputation:")
print(df.isnull().sum())


Missing values after imputation:
TransactionId              0
BatchId                    0
AccountId                  0
SubscriptionId             0
CustomerId                 0
CurrencyCode               0
CountryCode                0
ProviderId                 0
ProductId                  0
ProductCategory            0
ChannelId                  0
Amount                     0
Value                      0
TransactionStartTime       0
PricingStrategy            0
FraudResult                0
TransactionHour            0
TransactionDay             0
TransactionMonth           0
TransactionYear            0
ProductCategory_encoded    0
ChannelId_encoded          0
ProviderId_encoded         0
ProductId_encoded          0
CurrencyCode_UGX           0
CountryCode_256            0
dtype: int64


In [26]:
# 5. Normalize/Standardize Numerical Features
print("\n5. Normalizing/Standardizing Numerical Features")
scaler = StandardScaler()
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
print("Sample of standardized numerical features:")
print(df[numeric_columns].head())


5. Normalizing/Standardizing Numerical Features
Sample of standardized numerical features:
   CountryCode    Amount     Value  PricingStrategy  FraudResult  \
0          0.0 -0.046371 -0.072291        -0.349252    -0.044962   
1          0.0 -0.054643 -0.080251        -0.349252    -0.044962   
2          0.0 -0.050426 -0.076352        -0.349252    -0.044962   
3          0.0  0.107717  0.096648        -0.349252    -0.044962   
4          0.0 -0.059704 -0.075183        -0.349252    -0.044962   

   TransactionHour  TransactionDay  TransactionMonth  TransactionYear  \
0        -2.155530       -0.100739          0.848684        -0.994246   
1        -2.155530       -0.100739          0.848684        -0.994246   
2        -2.155530       -0.100739          0.848684        -0.994246   
3        -1.949214       -0.100739          0.848684        -0.994246   
4        -1.949214       -0.100739          0.848684        -0.994246   

   ProductCategory_encoded  ChannelId_encoded  ProviderId_en

In [30]:
# Save the processed dataset
df.to_csv("E:/KAIM 2/KAIM 2 Week 6/Week-6/data/processed_data.csv", index=False)
print("\nProcessed data saved to 'E:/KAIM 2/KAIM 2 Week 6/Week-6/data/processed_data.csv'")


Processed data saved to 'E:/KAIM 2/KAIM 2 Week 6/Week-6/data/processed_data.csv'


Key Points:

Aggregate features provide a summary of customer behavior, which can be crucial for credit scoring.
Time-based features can help identify patterns in transaction timing.
Encoding categorical variables allows them to be used in machine learning models.
Handling missing values ensures that all data points can be used in the model.
Standardizing numerical features puts all variables on the same scale, which is important for many machine learning algorithms.