# **FICO Analytic Challenge © Fair Isaac 2024**

# Blind Holdout Set: Generating Features and Scores without Tags

## Mount the Google Drive

In [1]:
import os
import sys
from google.colab import drive
# Attach Google Drive (forcing a re-mount) so the project folder is reachable.
drive.mount('/content/drive/', force_remount=True)

# Project root on the mounted drive. The trailing slash matters because
# sub-paths are built from it by plain string concatenation.
path = '/content/drive/MyDrive/FICO Analytic Challenge/'

# Add the project sub-folders to the module search path, in this order.
sys.path.extend(
    [path + subfolder
     for subfolder in ('Data', 'Model', 'Week_04', 'Week_06', 'Week_07', 'Week_10')]
)

# Work from the project root and print it as confirmation.
os.chdir(path)
print(os.getcwd())

Mounted at /content/drive/
/content/drive/MyDrive/FICO Analytic Challenge


### Import the required libraries

In [2]:
# import the necessary libraries
import numpy as np
import pandas as pd
from pickle import dump, load
from fico_functions import *

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.preprocessing import MinMaxScaler
import math

# Pytorch libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline
# Request the 'retina' (high-resolution) format from the inline backend.
%config InlineBackend.figure_format = 'retina'

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Lift pandas' display limits so no columns or rows are truncated when a frame is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# Pick the compute device first: the first CUDA GPU when one is present, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report which kind of device was selected.
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

No GPU available. Training will run on CPU.


In [4]:
# Directory that holds the saved model files
mdlPath = path + "Model"

# Name of the folder that holds the data of interest
data = 'Data'

# Model name; used to tell this model's output files apart
model = 'NNet'

# Location of the scaler file: <path><data>/scaler.<model>.<data>.pkl
scaleFile = os.path.join(f"{path}{data}", f"scaler.{model}.{data}.pkl")

### Blind Holdout Dataset
- **test_C_**<font color='CornflowerBlue'>notags</font>**.csv** is the blind holdout dataset
    - you should have already created its features and saved them under one of the following names:
        - **test_C_**<font color='DeepSkyBlue'>notags_features</font>**.csv**
            - if only using features from week 4
        - **test_C_**<font color='lightgreen'>notags_advanced_features</font>**.csv**
            - if also using week 8
- **score.NNet.test_C_**<font color='DeepSkyBlue'>notags_features</font>**.csv** or **score.NNet.test_C_**<font color='lightgreen'>notags_advanced_features</font>**.csv**
    - this should have scores from your trained NNet model
    - this dataset doesn't have the following columns since it has "<font color='CornflowerBlue'>**notags**</font>"
        - mdlIsFraudTrx
        - mdlIsFraudAcct
- <font color='Cyan'>**score.NNet.test_C_features.csv**</font> or <font color='MediumPurple'>**score.NNet.test_C_advanced_features.csv**</font>
    - this is the file's name that we'll return to you which includes the tags

In [5]:
# def get_blindholdout_file(path, data, model, blindholdoutFile, featureTestFileSuffix):
#   # Blind Holdout file location
#   blindholdoutCSV = os.path.join(path + data, blindholdoutFile[0] + featureTestFileSuffix)

#   if not os.path.isfile(blindholdoutCSV):
#       featureTestFileSuffix="_features.csv"
#       blindholdoutCSV = os.path.join(path + data, blindholdoutFile[0] + featureTestFileSuffix)

#       if not os.path.isfile(blindholdoutCSV):
#           raise FileNotFoundError(f"{blindholdoutCSV} does not exist in {path}{data} directory")

#       return blindholdoutCSV, featureTestFileSuffix

In [6]:
# def get_feature_cols(df1, blindholdoutFile):
#     base_columns = ['pan', 'merchant', 'category', 'transactionAmount', 'first', 'last', 'mdlIsFraudTrx', 'mdlIsFraudAcct',
#                 'is_train', 'cardholderCountry', 'cardholderState', 'transactionDateTime', 'gender',
#                 'street', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
#                 'merch_lat', 'merch_long', 'merchCountry', 'merchState', 'deltaTime', 'y_preds', 'score']

#     blind = False
#     del_cols = []
#     for col in base_columns:
#         if col not in df1.columns:
#             del_cols.append(col)
#             # base_cols.remove(col)
#             if col == 'mdlIsFraudTrx':
#                 blind = True

#     base_cols = list(set(base_columns) - set(del_cols))
#     feature_columns = list(set(df1.columns) - set(base_cols))
#     feature_columns.sort()

#     print(f'\033[1mColumns in\033[0m: {blindholdoutFile}')
#     print(f"Number of Columns in Base: {len(base_cols)}")
#     print(f"Base Columns: {base_cols}")
#     print(f"Number of Features: {len(feature_columns)}")
#     print(f"Input Features: {feature_columns}")

#     if not blind:
#         label_column = ["mdlIsFraudTrx"]
#         print(f"Label Column: {label_column}\n")
#         return feature_columns, label_column, base_cols
#     else:
#         label_column = [""]
#         return feature_columns, label_column, base_cols

In [7]:
# Base name of the blind holdout dataset (change it to match your file name)
blindholdoutFile = ["test_C_notags"]

# Suffix appended to the base name to form the feature CSV's file name
featureTestFileSuffix = "_advanced_features.csv"

In [12]:
def get_blindholdout_file(path, data, model, blindholdoutFile, featureTestFileSuffix):
    """Locate the blind holdout feature CSV, falling back to the basic feature file.

    The file with the requested suffix is preferred. If it does not exist, the
    ``"_features.csv"`` variant is tried instead, so the notebook works with
    either of the two file names it documents.

    Parameters
    ----------
    path : str
        Project root. It is joined to ``data`` by plain concatenation, so it
        must end with a path separator.
    data : str
        Name of the folder under ``path`` that holds the CSV files.
    model : str
        Unused; kept so existing callers keep working.
    blindholdoutFile : list of str
        Only the first element is used, as the dataset's base name.
    featureTestFileSuffix : str
        Preferred file name suffix, e.g. ``"_advanced_features.csv"``.

    Returns
    -------
    tuple of (str, str)
        Path of the CSV that exists, and the suffix that produced it.

    Raises
    ------
    FileNotFoundError
        If neither the preferred file nor the ``"_features.csv"`` file exists.
    """
    data_dir = path + data
    tried = []

    # dict.fromkeys de-duplicates while keeping order, so the fallback is not
    # checked twice when the caller already asked for "_features.csv".
    for suffix in dict.fromkeys((featureTestFileSuffix, "_features.csv")):
        candidate = os.path.join(data_dir, blindholdoutFile[0] + suffix)
        if os.path.isfile(candidate):
            return candidate, suffix
        tried.append(candidate)

    # Fail here with a clear message instead of later, inside the CSV loader.
    raise FileNotFoundError(
        f"None of {tried} exist in {path}{data} directory"
    )

In [13]:
# Resolve the holdout CSV path; the suffix is reassigned to whatever the helper returns.
blindholdoutCSV, featureTestFileSuffix = get_blindholdout_file(
    path=path,
    data=data,
    model=model,
    blindholdoutFile=blindholdoutFile,
    featureTestFileSuffix=featureTestFileSuffix,
)

In [14]:
# Path under which the scored holdout file is to be saved:
# <path><data>/score.<model>.<base name><suffix>
blindholdoutsaveCSV = os.path.join(
    f"{path}{data}",
    f"score.{model}.{blindholdoutFile[0]}{featureTestFileSuffix}",
)

In [15]:
# Load the blind holdout feature file (the test dataset) from the path resolved above.
# NOTE(review): import_df comes from the fico_functions star import; presumably it
# returns a pandas DataFrame — confirm.
df_test = import_df(blindholdoutCSV)

In [16]:
# Preview the first rows of the holdout data as a sanity check.
df_test.head()

Unnamed: 0,pan,merchant,category,transactionAmount,first,last,is_train,cardholderCountry,cardholderState,transactionDateTime,gender,street,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,merchCountry,merchState,deltaTime,1m,AvgTransactionAmount_Last7Days,CNP_High,HighValue_International,IS_0_TO_5AM,IsHighValue,IsSpent,IsTransaction,Is_High_Low,RelativeAmount,amount_diff,amt_trend_24h,amt_trend_5e,average_spending,category_ratio,count_trend_1h,ewm_1D,high_interaction,is_cnp,is_grocery_pos,is_international,is_late_night,is_travel,monday_buy,num_hi_amt_last_hour,num_last_24_hours,outside_country,outside_state_purchase,ratio_14D_to_60D,ratio_30D_to_60D,repeat_amt,repeat_hi_amt_1H,rolling_mean_14D,rolling_mean_30D,rolling_mean_60D,spending_above_threshold,spending_below_avg_20,transactionHour,transactionHour_Risk,user_avg_amount
0,0002898353840886A,Brekke and Sons,gas_transport,68.22,Charles,Copeland,0,US,Florida,2020-05-04 09:58:28,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,c772e20a62ce9e1aaef80a9d6f8e14ed,1325671108,25.784525,-79.915949,US,Florida,0.0,68.22,68.22,0,0,0,0,1,1,-1,0.0,0.0,68.22,0.0,81.746804,0.0,1.0,68.22,0,0,0,0,0,False,0,0.0,1.0,0,0,1.0,1.0,0,68.22,68.22,68.22,68.22,0,0,9,0,
1,0002898353840886A,Douglas-White,entertainment,4.03,Charles,Copeland,0,US,Florida,2020-05-05 08:58:09,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,14256aa985f72b9ef6a198efba2cfc1f,1325753889,26.367189,-79.473664,BS,West Grand Bahama,82781.0,4.03,36.125,0,0,0,0,1,0,-1,0.059074,-64.19,36.125,0.0,81.746804,0.0,1.0,36.125,1,0,0,1,0,False,1,0.0,2.0,1,1,1.0,1.0,0,4.03,36.125,36.125,36.125,0,1,8,0,68.22
2,0002898353840886A,"Ritchie, Oberbrunner and Cremin",travel,3.43,Charles,Copeland,0,US,Florida,2020-05-05 17:31:26,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,0718d61a07e2cfe420818b653fc31d6b,1325784686,26.459335,-80.743962,US,Florida,30797.0,3.43,25.226667,0,0,0,0,1,0,-1,0.094948,-0.6,3.73,0.0,81.746804,0.0,1.0,19.7775,1,0,0,0,0,False,1,0.0,2.0,0,0,1.0,1.0,0,3.43,25.226667,25.226667,25.226667,0,1,17,0,36.125
3,0002898353840886A,"Schumm, Bauch and Ondricka",grocery_pos,59.77,Charles,Copeland,0,US,Florida,2020-05-07 02:39:08,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,d44aeb589ba8e2d58ba320fe1ecf139b,1325903948,26.817313,-80.267861,US,Florida,119262.0,59.77,33.8625,0,0,1,0,1,1,-1,2.369318,56.34,59.77,0.0,81.746804,0.0,1.0,39.77375,1,0,1,0,1,False,0,0.0,1.0,0,0,1.0,1.0,0,59.77,33.8625,33.8625,33.8625,0,1,2,1,25.226667
4,0002898353840886A,"Welch, Rath and Koepp",entertainment,73.06,Charles,Copeland,0,US,Florida,2020-05-07 11:53:26,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,139db024ddf70d4b8b76a896c504aa75,1325937206,27.220689,-80.339584,US,Florida,33258.0,73.06,41.702,0,0,0,0,1,1,-1,2.157549,13.29,66.415,41.702,81.746804,0.0,1.0,56.416875,1,0,0,0,0,False,0,0.0,2.0,0,0,1.0,1.0,0,73.06,41.702,41.702,41.702,0,0,11,0,33.8625


### Lists Containing Names of Input Features and Label Columns <font color='red'>(**Modify base_cols to match your dataset's columns that aren't features**)</font>

In [17]:
# Total number of columns in the holdout data (base columns plus features).
len(df_test.columns)

65

In [18]:
# Split the columns into model inputs, the label column, and non-feature base columns.
# NOTE(review): get_feature_cols is not defined in an active cell; it presumably comes
# from the fico_functions star import and matches the commented-out draft earlier in
# this notebook — confirm.
feature_columns, label_column, base_cols = get_feature_cols(df_test, blindholdoutFile[0])

[1mColumns in[0m: test_C_notags
Number of Columns in Base: 25
Base Columns: ['lat', 'category', 'transactionDateTime', 'cardholderCountry', 'pan', 'city_pop', 'trans_num', 'street', 'merchCountry', 'is_train', 'gender', 'zip', 'merchant', 'long', 'last', 'first', 'job', 'merch_long', 'cardholderState', 'dob', 'unix_time', 'merchState', 'transactionAmount', 'deltaTime', 'merch_lat']
Number of Features: 40
Input Features: ['1m', 'AvgTransactionAmount_Last7Days', 'CNP_High', 'HighValue_International', 'IS_0_TO_5AM', 'IsHighValue', 'IsSpent', 'IsTransaction', 'Is_High_Low', 'RelativeAmount', 'amount_diff', 'amt_trend_24h', 'amt_trend_5e', 'average_spending', 'category_ratio', 'count_trend_1h', 'ewm_1D', 'high_interaction', 'is_cnp', 'is_grocery_pos', 'is_international', 'is_late_night', 'is_travel', 'monday_buy', 'num_hi_amt_last_hour', 'num_last_24_hours', 'outside_country', 'outside_state_purchase', 'ratio_14D_to_60D', 'ratio_30D_to_60D', 'repeat_amt', 'repeat_hi_amt_1H', 'rolling_mean

In [19]:
# Columns to save: the base columns, then the input features, then 'y_preds' and 'score'.
saveFeatures_blind = list(base_cols)
saveFeatures_blind.extend(feature_columns)
saveFeatures_blind.extend(['y_preds', 'score'])
print(f"Features to save: {saveFeatures_blind}")

Features to save: ['lat', 'category', 'transactionDateTime', 'cardholderCountry', 'pan', 'city_pop', 'trans_num', 'street', 'merchCountry', 'is_train', 'gender', 'zip', 'merchant', 'long', 'last', 'first', 'job', 'merch_long', 'cardholderState', 'dob', 'unix_time', 'merchState', 'transactionAmount', 'deltaTime', 'merch_lat', '1m', 'AvgTransactionAmount_Last7Days', 'CNP_High', 'HighValue_International', 'IS_0_TO_5AM', 'IsHighValue', 'IsSpent', 'IsTransaction', 'Is_High_Low', 'RelativeAmount', 'amount_diff', 'amt_trend_24h', 'amt_trend_5e', 'average_spending', 'category_ratio', 'count_trend_1h', 'ewm_1D', 'high_interaction', 'is_cnp', 'is_grocery_pos', 'is_international', 'is_late_night', 'is_travel', 'monday_buy', 'num_hi_amt_last_hour', 'num_last_24_hours', 'outside_country', 'outside_state_purchase', 'ratio_14D_to_60D', 'ratio_30D_to_60D', 'repeat_amt', 'repeat_hi_amt_1H', 'rolling_mean_14D', 'rolling_mean_30D', 'rolling_mean_60D', 'spending_above_threshold', 'spending_below_avg_20', 

In [20]:
# Hyper-parameters: dropout rate and number of hidden units
dropout_rate, num_hidden_units = 0.2, 4

In [21]:
# Import best LAUC Model
# Checkpoint file name: model_best_valid_lauc.<hidden units>nodes.<data>.pt
laucModel = os.path.join(mdlPath, f"model_best_valid_lauc.{num_hidden_units}nodes.{data}.pt")

# Rebuild the network with the hyper-parameters set above; the input width is the number of feature columns.
model_l_NNet = NNet(input_size=len(feature_columns), hidden_units=num_hidden_units, output_size=1, dropout=dropout_rate)

# map_location remaps the saved tensors onto the active device, so a checkpoint
# written on a GPU still loads when this notebook runs on a CPU-only runtime.
model_l_NNet.load_state_dict(torch.load(laucModel, map_location=device))
model_l_NNet.to(device)

# Switch to evaluation mode before scoring. As the cell's last expression, this
# also displays the model's layer summary.
model_l_NNet.eval()

NNet(
  (inputLayer): Linear(in_features=40, out_features=4, bias=True)
  (hiddenLayer): Linear(in_features=4, out_features=4, bias=True)
  (outputLayer): Linear(in_features=4, out_features=1, bias=True)
  (batchNorm1): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchNorm2): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (dropout2): Dropout(p=0.2, inplace=False)
  (tanh): Tanh()
  (sigmoid): Sigmoid()
)

In [22]:
# Score the blind holdout file with the loaded model.
# NOTE(review): blind_holdout_score_NNet comes from the fico_functions star import.
# Presumably it scales feature_columns with the scaler in scaleFile, adds the
# 'y_preds' and 'score' columns, and writes saveFeatures_blind to
# blindholdoutsaveCSV — confirm against the helper's source.
df_blind_holdout = blind_holdout_score_NNet(blindholdoutCSV, scaleFile, feature_columns, device, model_l_NNet, saveFeatures_blind, blindholdoutsaveCSV)
# Preview the first rows of the scored result.
df_blind_holdout.head()

Unnamed: 0,pan,merchant,category,transactionAmount,first,last,is_train,cardholderCountry,cardholderState,transactionDateTime,gender,street,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,merchCountry,merchState,deltaTime,1m,AvgTransactionAmount_Last7Days,CNP_High,HighValue_International,IS_0_TO_5AM,IsHighValue,IsSpent,IsTransaction,Is_High_Low,RelativeAmount,amount_diff,amt_trend_24h,amt_trend_5e,average_spending,category_ratio,count_trend_1h,ewm_1D,high_interaction,is_cnp,is_grocery_pos,is_international,is_late_night,is_travel,monday_buy,num_hi_amt_last_hour,num_last_24_hours,outside_country,outside_state_purchase,ratio_14D_to_60D,ratio_30D_to_60D,repeat_amt,repeat_hi_amt_1H,rolling_mean_14D,rolling_mean_30D,rolling_mean_60D,spending_above_threshold,spending_below_avg_20,transactionHour,transactionHour_Risk,user_avg_amount,y_preds,score
0,0002898353840886A,Brekke and Sons,gas_transport,68.22,Charles,Copeland,0,US,Florida,2020-05-04 09:58:28,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,c772e20a62ce9e1aaef80a9d6f8e14ed,1325671108,25.784525,-79.915949,US,Florida,0.0,68.22,68.22,0.0,0.0,0.0,0.0,1.0,1.0,-1.0,0.0,0.0,68.22,0.0,81.746804,0.0,1.0,68.22,1.110223e-16,0.0,0.0,0.0,0.0,0.0,0.0,-5.5511150000000004e-17,1.0,0.0,0.0,1.0,1.0,-5.421011e-20,68.22,68.22,68.22,68.22,0.0,0.0,9.0,0.0,0.0,0.000112,15
1,0002898353840886A,Douglas-White,entertainment,4.03,Charles,Copeland,0,US,Florida,2020-05-05 08:58:09,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,14256aa985f72b9ef6a198efba2cfc1f,1325753889,26.367189,-79.473664,BS,West Grand Bahama,82781.0,4.03,36.125,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.059074,-64.19,36.125,0.0,81.746804,0.0,1.0,36.125,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-5.5511150000000004e-17,2.0,1.0,1.0,1.0,1.0,-5.421011e-20,4.03,36.125,36.125,36.125,0.0,1.0,8.0,0.0,68.22,0.001104,178
2,0002898353840886A,"Ritchie, Oberbrunner and Cremin",travel,3.43,Charles,Copeland,0,US,Florida,2020-05-05 17:31:26,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,0718d61a07e2cfe420818b653fc31d6b,1325784686,26.459335,-80.743962,US,Florida,30797.0,3.43,25.226667,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.094948,-0.6,3.73,0.0,81.746804,0.0,1.0,19.7775,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-5.5511150000000004e-17,2.0,0.0,0.0,1.0,1.0,-5.421011e-20,3.43,25.226667,25.226667,25.226667,0.0,1.0,17.0,0.0,36.125,0.000641,139
3,0002898353840886A,"Schumm, Bauch and Ondricka",grocery_pos,59.77,Charles,Copeland,0,US,Florida,2020-05-07 02:39:08,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,d44aeb589ba8e2d58ba320fe1ecf139b,1325903948,26.817313,-80.267861,US,Florida,119262.0,59.77,33.8625,0.0,0.0,1.0,0.0,1.0,1.0,-1.0,2.369318,56.34,59.77,0.0,81.746804,0.0,1.0,39.77375,1.0,0.0,1.0,0.0,1.0,0.0,0.0,-5.5511150000000004e-17,1.0,0.0,0.0,1.0,1.0,-5.421011e-20,59.77,33.8625,33.8625,33.8625,0.0,1.0,2.0,1.0,25.226667,0.364592,621
4,0002898353840886A,"Welch, Rath and Koepp",entertainment,73.06,Charles,Copeland,0,US,Florida,2020-05-07 11:53:26,M,92213 Lee Well,33404,26.7832,-80.0638,459921,"Administrator, arts",1969-09-08,139db024ddf70d4b8b76a896c504aa75,1325937206,27.220689,-80.339584,US,Florida,33258.0,73.06,41.702,0.0,0.0,0.0,0.0,1.0,1.0,-1.0,2.157549,13.29,66.415,41.702,81.746804,0.0,1.0,56.416875,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.5511150000000004e-17,2.0,0.0,0.0,1.0,1.0,-5.421011e-20,73.06,41.702,41.702,41.702,0.0,0.0,11.0,0.0,33.8625,0.000237,68
