In [1]:
# Import all dependencies
import numpy as np
import pandas as pd

In [2]:
# Load the identity table from disk and preview its first rows.
identity_csv_path = 'datasets/train_identity.csv'
train_identity_df = pd.read_csv(identity_csv_path)
train_identity_df.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [3]:
# Load the transaction table from disk and preview its first rows.
transaction_csv_path = 'datasets/train_transaction.csv'
train_transaction_df = pd.read_csv(transaction_csv_path)
train_transaction_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature Engineering
## Questions
- What does feature engineering contribute to the training of the model?
- What are the different types of feature engineering?
  - Why should you do a specific type of feature engineering?
  - How does it benefit the performance of the model?
  - When should you do specific types of feature engineering?

## Answers
Feature engineering is important because it guards against "garbage in, garbage out": a model can only be as good as the features it is trained on. Refining the features can significantly improve model performance. Feature engineering is made up of several processes.

### Processes
1. Feature Creation - Create new features
   1. Domain-Specific: From industry knowledge like business rules
   2. Data-Driven: Derived from recognized patterns
   3. Synthetic: From combining existing features
2. Feature Transformation - Adjusts features
   1. Normalization & Scaling: Adjust the range of features for consistency
   2. Encoding: Convert categorical data to numerical data (i.e. one-hot encoding)
   3. Mathematical Transformations: Like logarithmic transformations for skewed data
3. Feature Extraction - To reduce dimensionality and simplify model
   1. Dimensionality Reduction: Reduce features while preserving important information (PCA technique)
   2. Aggregation & Combination: Summing/averaging features to simplify the model
4. Feature Selection - Choosing a subset of relevant features to use
   1. Filter methods: Based on statistical measures like correlation
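   2. Wrapper methods: Select features based on model performance, i.e. train the model on different candidate subsets of features and keep the subset that scores best (e.g. recursive feature elimination)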
   3. Embedded methods: Feature selection is integrated into model training, i.e. the model learns which features matter instead of the features being picked by hand (e.g. L1/Lasso regularization, tree-based feature importances)
5. Feature Scaling - Ensuring that all features contribute equally to the model
   1. Min-max scaling: Rescales values to a fixed range like 0 to 1
   2. Standard scaling: Normalizes to have a mean of 0 and variance of 1
      - Note: This is done across all features so that there is no bias towards features with inherently larger numerical values like comparing age to salary.

### Steps
1. Data Cleaning
   - Handle missing values (imputation: replace empty cells with the mean, median, or mode of the other values in the same column)
   - Find outliers and handle them
     - Replace with statistical number like max or min
     - Apply transformations to the feature like log or square root
     - Drop the outliers from the dataset
     - Note: What is the best approach for handling outliers in our use-case? I feel like we should keep them
2. Data Transformation
   - Encoding categorical variables
     - One-hot encoding: Split the feature into one binary column per category (e.g. `gender` becomes separate `male` and `female` columns)
     - Label encoding: Assign a numerical value for each category label
     - Ordinal encoding: Assign numerical value based on the order of the category (if applicable)
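     - Target encoding: Replace each category with the mean of the target variable over all rows that belong to that category. A category appears in many rows, each with its own target value, so the mean of those values is what gets assigned to the category (e.g. a card type is replaced by the fraud rate observed for that card type)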
3. Feature Extraction
4. Feature Selection
5. Feature Iteration

Sources:
- [GeeksforGeeks - What is Feature Engineering](https://www.geeksforgeeks.org/machine-learning/what-is-feature-engineering/)
- [DataCamp - Feature Engineering in Machine Learning](https://www.datacamp.com/tutorial/feature-engineering)

In [4]:
# Preview the first 20 transaction rows.
# The cell ends with the expression itself rather than print(...), so Jupyter
# renders the frame with its rich table display instead of wrapped plain text.
train_transaction_df.head(20)

    TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
0         2987000        0          86400          68.500         W  13926   
1         2987001        0          86401          29.000         W   2755   
2         2987002        0          86469          59.000         W   4663   
3         2987003        0          86499          50.000         W  18132   
4         2987004        0          86506          50.000         H   4497   
5         2987005        0          86510          49.000         W   5937   
6         2987006        0          86522         159.000         W  12308   
7         2987007        0          86529         422.500         W  12695   
8         2987008        0          86535          15.000         H   2803   
9         2987009        0          86536         117.000         W  17399   
10        2987010        0          86549          75.887         C  16496   
11        2987011        0          86555          16.495       

In [5]:
# Preview the first 20 identity rows.
# The cell ends with the expression itself rather than print(...), so Jupyter
# renders the frame with its rich table display instead of wrapped plain text.
train_identity_df.head(20)

    TransactionID  id_01     id_02  id_03  id_04  id_05  id_06  id_07  id_08  \
0         2987004    0.0   70787.0    NaN    NaN    NaN    NaN    NaN    NaN   
1         2987008   -5.0   98945.0    NaN    NaN    0.0   -5.0    NaN    NaN   
2         2987010   -5.0  191631.0    0.0    0.0    0.0    0.0    NaN    NaN   
3         2987011   -5.0  221832.0    NaN    NaN    0.0   -6.0    NaN    NaN   
4         2987016    0.0    7460.0    0.0    0.0    1.0    0.0    NaN    NaN   
5         2987017   -5.0   61141.0    3.0    0.0    3.0    0.0    NaN    NaN   
6         2987022  -15.0       NaN    NaN    NaN    NaN    NaN    NaN    NaN   
7         2987038    0.0   31964.0    0.0    0.0    0.0  -10.0    NaN    NaN   
8         2987040  -10.0  116098.0    0.0    0.0    0.0    0.0    NaN    NaN   
9         2987048   -5.0  257037.0    NaN    NaN    0.0    0.0    NaN    NaN   
10        2987049   -5.0  287959.0    NaN    NaN    1.0  -11.0    NaN    NaN   
11        2987057    0.0   88525.0    Na

In [6]:
# Dataset exploration: is TransactionID repeated within either table?
# For each table, report the distinct-ID count and the row count; the two
# numbers are equal exactly when no TransactionID occurs more than once.
print(
    f"Number of unique TransactionIds in train_transaction: {train_transaction_df['TransactionID'].nunique()}",
    f"Number of unique TransactionIds in train_identity: {train_identity_df['TransactionID'].nunique()}",
    f"Total rows in train_transaction: {len(train_transaction_df)}",
    f"Total rows in train_identity: {len(train_identity_df)}",
    sep="\n",
)

Number of unique TransactionIds in train_transaction: 590540
Number of unique TransactionIds in train_identity: 144233
Total rows in train_transaction: 590540
Total rows in train_identity: 144233


In [7]:
# Share of each isFraud value among all transactions, expressed in percent.
fraud_share = train_transaction_df['isFraud'].value_counts(normalize=True)
print(fraud_share * 100)

isFraud
0    96.500999
1     3.499001
Name: proportion, dtype: float64


# Dataset Exploration

[Kaggle discussion regarding dataset columns](https://www.kaggle.com/competitions/ieee-fraud-detection/discussion/101203)

In [8]:
# Attach the identity columns to every transaction.
# how='left' keeps all transactions, including those that have no identity
# record (their identity columns are filled with NaN).
# validate='one_to_one' makes pandas raise a MergeError if TransactionID is
# duplicated on either side, so the join can never silently multiply rows.
train_df = pd.merge(
    train_transaction_df,
    train_identity_df,
    on='TransactionID',
    how='left',
    validate='one_to_one',
)

# A one-to-one left join must preserve the number of transactions.
assert len(train_df) == len(train_transaction_df), "merge changed the number of transactions"

print(train_df.shape)
# End with the expression so Jupyter renders the preview as a rich table.
train_df.head()

(590540, 434)
   TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
0        2987000        0          86400            68.5         W  13926   
1        2987001        0          86401            29.0         W   2755   
2        2987002        0          86469            59.0         W   4663   
3        2987003        0          86499            50.0         W  18132   
4        2987004        0          86506            50.0         H   4497   

   card2  card3       card4  card5  ...                id_31  id_32  \
0    NaN  150.0    discover  142.0  ...                  NaN    NaN   
1  404.0  150.0  mastercard  102.0  ...                  NaN    NaN   
2  490.0  150.0        visa  166.0  ...                  NaN    NaN   
3  567.0  150.0  mastercard  117.0  ...                  NaN    NaN   
4  514.0  150.0  mastercard  102.0  ...  samsung browser 6.2   32.0   

       id_33           id_34  id_35 id_36 id_37  id_38  DeviceType  \
0        NaN             N

In [9]:
# Report every column of the merged frame that has at least one missing value,
# together with its number of distinct (non-null) values.
# Null counts are computed once for the whole frame; only columns with gaps
# are then visited, in their original column order.
null_counts = train_df.isnull().sum()
for col, missing_values in null_counts[null_counts > 0].items():
    unique_values = train_df[col].nunique()
    print(f"Column: {col}, Unique Values: {unique_values}, Missing Values: {missing_values}")

Column: card2, Unique Values: 500, Missing Values: 8933
Column: card3, Unique Values: 114, Missing Values: 1565
Column: card4, Unique Values: 4, Missing Values: 1577
Column: card5, Unique Values: 119, Missing Values: 4259
Column: card6, Unique Values: 4, Missing Values: 1571
Column: addr1, Unique Values: 332, Missing Values: 65706
Column: addr2, Unique Values: 74, Missing Values: 65706
Column: dist1, Unique Values: 2651, Missing Values: 352271
Column: dist2, Unique Values: 1751, Missing Values: 552913
Column: P_emaildomain, Unique Values: 59, Missing Values: 94456
Column: R_emaildomain, Unique Values: 60, Missing Values: 453249
Column: D1, Unique Values: 641, Missing Values: 1269
Column: D2, Unique Values: 641, Missing Values: 280797
Column: D3, Unique Values: 649, Missing Values: 262878
Column: D4, Unique Values: 808, Missing Values: 168922
Column: D5, Unique Values: 688, Missing Values: 309841
Column: D6, Unique Values: 829, Missing Values: 517353
Column: D7, Unique Values: 597, Miss