In [None]:
import pandas as pd

In [None]:
# Load the raw transaction data. The location defaults to the original local
# path but can be overridden with the CREDIT_SCORING_DATA_PATH environment
# variable, so the notebook can be re-run on another machine without editing
# this cell.
import os

DATA_PATH = os.environ.get(
    'CREDIT_SCORING_DATA_PATH',
    'C:\\Users\\Hiwi\\Documents\\week5\\data.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape


In [None]:
# Print a per-column summary (dtype and non-null count) of the frame.
df.info()


In [None]:
# Preview the first rows (pandas shows five by default).
df.head()

In [None]:
# Descriptive summary statistics of the frame's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Restrict the frame to its 64-bit float / integer columns. The resulting
# column index is kept in `numerical_cols` because later cells reuse it.
numeric_frame = df.select_dtypes(include=['float64', 'int64'])
numerical_cols = numeric_frame.columns

# One 30-bin histogram per numerical column, laid out on a 15x10 figure.
numeric_frame.hist(bins=30, figsize=(15, 10))

In [None]:
# Columns stored as plain Python objects (typically strings) or as pandas
# categoricals are treated as categorical features.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# One count plot per categorical column; categories go on the y axis.
for col in categorical_cols:
    ax = sns.countplot(y=col, data=df)
    ax.set_title(f'Distribution of {col}')
    plt.show()

In [None]:
# Pairwise correlation of the numeric columns only. Calling df.corr() on a
# frame that still contains string columns raises in pandas >= 2.0 (the
# numeric_only default became False), so non-numeric columns are filtered out
# explicitly; this form also works on older pandas versions.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')

In [None]:
# Missing-value report: absolute null count per column and the same count
# expressed as a percentage of all rows.
n_rows = len(df)
missing = df.isnull().sum()
missing_percent = missing.div(n_rows).mul(100)
missing_df = pd.DataFrame({'Missing Count': missing, 'Missing %': missing_percent})

# Display only the columns that actually contain missing values.
missing_df.loc[missing_df['Missing Count'] > 0]

In [None]:
# One boxplot per numerical column (as selected in the histogram cell).
for col in numerical_cols:
    ax = sns.boxplot(x=df[col])
    ax.set_title(f'Boxplot of {col}')
    plt.show()

In [None]:
#import os
#os.chdir('C:/Users/Hiwi/Documents/week5/') 

In [None]:
#!git add .
#!git commit -m "Initial commit: EDA and feature engineering"

In [None]:
#!git remote add origin https://github.com/HiwotWonago/credit-scoring-system.git
#!git branch -M main
#!git push -u origin main

In [None]:
pip install woe

In [None]:
pip install xverse

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from xverse.transformer import MonotonicBinning  # pip install xverse

# 1. Custom Transformer for Aggregation
class CustomerAggregator(BaseEstimator, TransformerMixin):
    """Collapse transaction-level rows into one feature row per customer.

    The input frame must provide the columns ``CustomerId``,
    ``TransactionStartTime``, ``Amount``, ``ProductCategory`` and
    ``ChannelId``. The transformer is stateless: ``fit`` learns nothing and
    ``transform`` never modifies the frame it is given.
    """

    def __init__(self):
        # NOTE(review): this mapping is not read anywhere in the visible code;
        # it is kept because external code may inspect the attribute.
        self.aggregation_dict = {
            'Amount': ['sum', 'mean', 'count', 'std', 'min', 'max'],
            'Value': ['sum', 'mean', 'std'],
            'FraudResult': 'mean'  # Proxy risk score
        }

    def fit(self, X, y=None):
        """No-op fit, present for scikit-learn pipeline compatibility.

        Returns:
            self
        """
        return self

    @staticmethod
    def _most_frequent(values):
        """Return the most frequent non-null value of ``values``.

        Ties resolve to the first entry of ``Series.mode()``. When the series
        holds no non-null value, ``mode()`` is empty and NaN is returned
        instead of raising on the missing first element.
        """
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    def transform(self, X):
        """Aggregate transactions to customer level.

        Args:
            X: transaction-level DataFrame (see class docstring for columns).

        Returns:
            DataFrame with one row per ``CustomerId`` and the columns
            ``total_amount``, ``avg_amount``, ``transaction_count``,
            ``amount_std``, ``refund_rate``, ``night_transaction_ratio``,
            ``preferred_category`` and ``preferred_channel``.
            ``amount_std`` is NaN for customers with a single transaction
            (sample standard deviation).
        """
        timestamps = pd.to_datetime(X['TransactionStartTime'])

        # assign() returns a new frame, so the caller's DataFrame is left
        # untouched. Only the helper columns that feed the aggregation are
        # derived; day/month/year were never aggregated and are not needed.
        enriched = X.assign(
            # Negative amounts are flagged as refunds.
            is_refund=(X['Amount'] < 0).astype(int),
            # Hours 0 through 6 (inclusive) count as night-time activity.
            is_night=timestamps.dt.hour.between(0, 6).astype(int),
        )

        customer_df = enriched.groupby('CustomerId').agg(
            total_amount=('Amount', 'sum'),
            avg_amount=('Amount', 'mean'),
            transaction_count=('Amount', 'count'),
            amount_std=('Amount', 'std'),
            refund_rate=('is_refund', 'mean'),
            night_transaction_ratio=('is_night', 'mean'),
            preferred_category=('ProductCategory', self._most_frequent),
            preferred_channel=('ChannelId', self._most_frequent)
        ).reset_index()

        return customer_df

# 2. Custom Transformer for Feature Extraction
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Derive categorical segments from customer-level aggregates.

    Requires the columns ``avg_amount`` and ``transaction_count`` and adds
    ``amount_risk`` and ``activity_level`` as pandas categoricals.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the two segment columns added.

        Both segmentations use ``pd.cut`` with its default right-closed
        intervals, so a ``transaction_count`` of exactly 0 falls outside the
        first bin and becomes NaN.
        """
        # Bin edges and labels for the average transaction amount.
        amount_edges = [-np.inf, 50, 200, 500, np.inf]
        amount_names = ['low', 'medium', 'high', 'very_high']

        # Bin edges and labels for the number of transactions.
        count_edges = [0, 5, 20, 100, np.inf]
        count_names = ['inactive', 'casual', 'active', 'hyperactive']

        segmented = X.copy()
        segmented['amount_risk'] = pd.cut(
            segmented['avg_amount'], bins=amount_edges, labels=amount_names
        )
        segmented['activity_level'] = pd.cut(
            segmented['transaction_count'], bins=count_edges, labels=count_names
        )
        return segmented

# 3. Build Complete Pipeline
def build_feature_pipeline():
    """Assemble the customer-level feature-engineering pipeline.

    Steps, in order: ``aggregator`` (CustomerAggregator), ``extractor``
    (FeatureExtractor), ``woe_binner``, ``monotonic_binner`` (xverse
    MonotonicBinning) and ``preprocessor`` (a ColumnTransformer that
    median-imputes and standardises the numeric aggregates and
    mode-imputes and one-hot encodes the categorical ones).

    NOTE(review): ``WOETransformer`` is neither imported nor defined in the
    cells visible here; unless it is defined elsewhere, calling this function
    raises NameError - confirm.
    NOTE(review): the aggregator step changes the number of rows, so any
    target passed to ``fit`` for the supervised binning steps would have to
    be at customer level - verify against the training script.

    Returns:
        sklearn.pipeline.Pipeline: the unfitted pipeline.
    """
    # Numeric aggregates: fill gaps with the median, then standardise.
    numeric_branch = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Categorical features: fill gaps with the mode, then one-hot encode as a
    # dense array; categories unseen during fit are ignored at transform time.
    categorical_branch = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    # Route each column group to its branch.
    column_router = ColumnTransformer([
        ('num', numeric_branch, [
            'total_amount', 'avg_amount', 'transaction_count',
            'amount_std', 'refund_rate', 'night_transaction_ratio',
        ]),
        ('cat', categorical_branch, [
            'preferred_category', 'preferred_channel',
            'amount_risk', 'activity_level',
        ]),
    ])

    return Pipeline([
        ('aggregator', CustomerAggregator()),
        ('extractor', FeatureExtractor()),
        ('woe_binner', WOETransformer()),  # Weight of Evidence transformation
        ('monotonic_binner', MonotonicBinning()),  # Monotonic binning from xverse
        ('preprocessor', column_router),
    ])

# 4. Usage Example (In your training script)
# from src.feature_engineering import build_feature_pipeline
# feature_pipeline = build_feature_pipeline()
# X_processed = feature_pipeline.fit_transform(raw_data)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AggregateTransactionFeatures(BaseEstimator, TransformerMixin):
    """Summarise one amount column per identifier.

    Args:
        id_col: name of the column to group by.
        amount_col: name of the column whose values are summarised.
    """

    def __init__(self, id_col='CustomerId', amount_col='Amount'):
        self.id_col = id_col
        self.amount_col = amount_col

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return one row per ``id_col`` value with sum/mean/count/std columns."""
        # Output column name -> aggregation applied to the amount column.
        stat_by_column = {
            'total_amount': 'sum',
            'avg_amount': 'mean',
            'transaction_count': 'count',
            'std_amount': 'std',
        }
        amounts_by_id = X.groupby(self.id_col)[self.amount_col]
        summary = amounts_by_id.agg(**stat_by_column)
        # Turn the group key back into a regular column.
        return summary.reset_index()

class TransactionTimeFeatures(BaseEstimator, TransformerMixin):
    """Expand a timestamp column into hour/day/month/year columns.

    Args:
        datetime_col: name of the column holding the transaction timestamp.
    """

    def __init__(self, datetime_col='TransactionStartTime'):
        self.datetime_col = datetime_col

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the timestamp parsed and split.

        The timestamp column is replaced by its ``pd.to_datetime`` conversion
        and four integer-like columns are added: ``transaction_hour``,
        ``transaction_day``, ``transaction_month``, ``transaction_year``.
        """
        out = X.copy()  # never touch the caller's frame
        out[self.datetime_col] = pd.to_datetime(out[self.datetime_col])

        parts = out[self.datetime_col].dt
        out['transaction_hour'] = parts.hour
        out['transaction_day'] = parts.day
        out['transaction_month'] = parts.month
        out['transaction_year'] = parts.year
        return out

In [None]:
# src/pipelines.py

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from feature_engineering import TransactionTimeFeatures

# ================================
# Define column groups
# ================================

# Columns routed through the categorical (impute + one-hot) branch.
CATEGORICAL_COLS = [
    'CurrencyCode',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'PricingStrategy',
]

# Columns routed through the numerical (impute + scale) branch.
NUMERICAL_COLS = ['Amount', 'Value']

# Timestamp column handed to the datetime feature step.
DATETIME_COL = 'TransactionStartTime'

# ================================
# Pipelines for sub-transforms
# ================================

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill missing values with the column median, then
# standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# ================================
# Preprocessing ColumnTransformer
# ================================

def build_transaction_pipeline(include_time_features=True):
    """Build the transaction-level preprocessing pipeline.

    The pipeline first derives hour/day/month/year columns from
    ``DATETIME_COL`` and then applies a ColumnTransformer that scales
    ``NUMERICAL_COLS`` and one-hot encodes ``CATEGORICAL_COLS``.

    ColumnTransformer drops every column that is not listed in one of its
    transformers (``remainder='drop'`` is the default), so the derived time
    columns must be listed explicitly or the datetime step has no effect on
    the output. They are therefore routed through their own impute + scale
    branch.

    Args:
        include_time_features: when True (default) the derived time columns
            are part of the output. Pass False to get only the numerical and
            categorical columns, i.e. the output produced before the time
            columns were wired in.

    Returns:
        sklearn.pipeline.Pipeline: the unfitted pipeline.
    """
    transformers = [
        ('num', numerical_pipeline, NUMERICAL_COLS),
        ('cat', categorical_pipeline, CATEGORICAL_COLS),
    ]

    if include_time_features:
        # Column names produced by TransactionTimeFeatures as defined in this
        # notebook. NOTE(review): the class is imported from the
        # feature_engineering module - confirm it emits the same names.
        time_feature_cols = [
            'transaction_hour', 'transaction_day',
            'transaction_month', 'transaction_year',
        ]
        # Separate pipeline instance so this branch shares no estimator
        # object with the 'num' branch. The imputer covers rows whose
        # timestamp could not be parsed.
        time_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        transformers.append(('time', time_pipeline, time_feature_cols))

    preprocessing = ColumnTransformer(transformers=transformers)

    # Datetime features are created first so the ColumnTransformer can select
    # them by name.
    full_pipeline = Pipeline(steps=[
        ('datetime_features', TransactionTimeFeatures(datetime_col=DATETIME_COL)),
        ('preprocessing', preprocessing)
    ])

    return full_pipeline
