# **FICO Analytic Challenge © Fair Isaac 2024**

# Blind Holdout Set: Generating Features and Scores without Tags

## Mount the Google Drive

In [None]:
import os
import sys
from google.colab import drive
# Mount Google Drive at /content/drive/
drive.mount('/content/drive/', force_remount=True)

# Root folder of the challenge material on the mounted drive
path = '/content/drive/MyDrive/FICO Analytic Challenge/'

# Add the project folders to the module search path so code stored in them
# can be imported by the cells below
sys.path.extend(
    path + subfolder
    for subfolder in ('Data', 'Model', 'Week 04', 'Week 06', 'Week 07')
)
# sys.path.append(path +'Week 10')

# Work from the challenge root and print it as a sanity check
os.chdir(path)
print(os.getcwd())

### Import the required libraries

In [None]:
# import the necessary libraries
import numpy as np
import pandas as pd
from pickle import dump, load
from fico_functions import *

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.preprocessing import MinMaxScaler
import math

# Pytorch libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
# Silence all warnings for the rest of the notebook
warnings.filterwarnings(action='ignore')

# Remove pandas' display limits so every column and every row is shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Select the compute device: the first CUDA GPU when one is present,
# otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Report which device was selected
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

In [None]:
# Directory that holds the saved model files
mdlPath = path + "Model"

# Name of the folder that holds the data of interest
data = 'Data'

# Model name; used to tell this model's output files apart
model = 'NNet'

# Scaler file location: <path><data>/scaler.<model>.<data>.pkl
scaleFile = os.path.join(path + data, f"scaler.{model}.{data}.pkl")

### Blind Holdout Dataset
- **test_C_**<font color='CornflowerBlue'>notags</font>**.csv** is the blind holdout dataset
    - you should have already created the features for it and named the file one of the following:
        - **test_C_**<font color='DeepSkyBlue'>notags_features</font>**.csv**
            - if only using features from week 4
        - **test_C_**<font color='lightgreen'>notags_advanced_features</font>**.csv**
            - if also using week 8
- **score.NNet.test_C_**<font color='DeepSkyBlue'>notags_features</font>**.csv** or **score.NNet.test_C_**<font color='lightgreen'>notags_advanced_features</font>**.csv**
    - this should have scores from your trained NNet model
    - this dataset doesn't have the following columns since it has "<font color='CornflowerBlue'>**notags**</font>"
        - mdlIsFraudTrx
        - mdlIsFraudAcct
- <font color='Cyan'>**score.NNet.test_C_features.csv**</font> or <font color='MediumPurple'>**score.NNet.test_C_advanced_features.csv**</font>
    - this is the name of the file that we'll return to you; it includes the tags

In [None]:
# def get_blindholdout_file(path, data, model, blindholdoutFile, featureTestFileSuffix):
#   # Blind Holdout file location
#   blindholdoutCSV = os.path.join(path + data, blindholdoutFile[0] + featureTestFileSuffix)

#   if not os.path.isfile(blindholdoutCSV):
#       featureTestFileSuffix="_features.csv"
#       blindholdoutCSV = os.path.join(path + data, blindholdoutFile[0] + featureTestFileSuffix)

#       if not os.path.isfile(blindholdoutCSV):
#           raise FileNotFoundError(f"{blindholdoutCSV} does not exist in {path}{data} directory")

#       return blindholdoutCSV, featureTestFileSuffix

In [None]:
# def get_feature_cols(df1, blindholdoutFile):
#     base_columns = ['pan', 'merchant', 'category', 'transactionAmount', 'first', 'last', 'mdlIsFraudTrx', 'mdlIsFraudAcct',
#                 'is_train', 'cardholderCountry', 'cardholderState', 'transactionDateTime', 'gender',
#                 'street', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
#                 'merch_lat', 'merch_long', 'merchCountry', 'merchState', 'deltaTime', 'y_preds', 'score']

#     blind = False
#     del_cols = []
#     for col in base_columns:
#         if col not in df1.columns:
#             del_cols.append(col)
#             # base_cols.remove(col)
#             if col == 'mdlIsFraudTrx':
#                 blind = True

#     base_cols = list(set(base_columns) - set(del_cols))
#     feature_columns = list(set(df1.columns) - set(base_cols))
#     feature_columns.sort()

#     print(f'\033[1mColumns in\033[0m: {blindholdoutFile}')
#     print(f"Number of Columns in Base: {len(base_cols)}")
#     print(f"Base Columns: {base_cols}")
#     print(f"Number of Features: {len(feature_columns)}")
#     print(f"Input Features: {feature_columns}")

#     if not blind:
#         label_column = ["mdlIsFraudTrx"]
#         print(f"Label Column: {label_column}\n")
#         return feature_columns, label_column, base_cols
#     else:
#         label_column = [""]
#         return feature_columns, label_column, base_cols

In [None]:
# Base name of the blind holdout dataset -- change it to match your file.
# Kept as a list; the cells below use its first element.
blindholdoutFile = ["test_C_notags"]

# CSV filename suffix of the feature file to load
featureTestFileSuffix = "_advanced_features.csv"

In [None]:
# Ask the helper for the blind holdout CSV path. It also hands back a suffix,
# so featureTestFileSuffix is rebound to whatever the helper returns.
# NOTE(review): get_blindholdout_file is not defined in this notebook;
# presumably it comes from fico_functions -- confirm its behavior there.
(blindholdoutCSV, featureTestFileSuffix) = get_blindholdout_file(
    path,
    data,
    model,
    blindholdoutFile,
    featureTestFileSuffix,
)

In [None]:
# Output path for the scored holdout file:
#   <path><data>/score.<model>.<blindholdoutFile[0]><featureTestFileSuffix>
blindholdoutsaveCSV = os.path.join(
    path + data,
    f"score.{model}.{blindholdoutFile[0]}{featureTestFileSuffix}",
)

In [None]:
# test dataset: load the blind holdout feature CSV found by get_blindholdout_file
# NOTE(review): import_df is not defined in this notebook; presumably it comes
# from fico_functions and returns a pandas DataFrame -- confirm.
df_test = import_df(blindholdoutCSV)

In [None]:
# Preview the first rows of the loaded holdout data
# (assumes df_test is a pandas DataFrame -- TODO confirm)
df_test.head()

### Lists Containing Names of Input Features and Label Columns <font color='red'>(**Modify base_cols to match your dataset's columns that aren't features**)</font>

In [None]:
# Total number of columns in the holdout data, before they are split into
# feature, label and base columns
len(df_test.columns)

In [None]:
# Obtain the input-feature names, the label column(s) and the non-feature base columns.
# NOTE(review): get_feature_cols is not defined in this notebook; presumably it
# comes from fico_functions -- confirm how it treats a file without tag columns.
feature_columns, label_column, base_cols = get_feature_cols(df_test, blindholdoutFile[0])

In [None]:
# Columns to keep in the saved output: the base columns first, then the input
# features, then 'y_preds' and 'score'
saveFeatures_blind = list(base_cols) + list(feature_columns) + ['y_preds', 'score']
print(f"Features to save: {saveFeatures_blind}")

In [None]:
# Hyper-parameters handed to the NNet constructor below.
# num_hidden_units is also part of the saved model's file name.
num_hidden_units = 4
dropout_rate = 0.2

In [None]:
# Import best LAUC Model
# File name pattern: model_best_valid_lauc.<num_hidden_units>nodes.<data>.pt
laucModel = os.path.join(mdlPath, f"model_best_valid_lauc.{num_hidden_units}nodes.{data}.pt")

# Build the network with the hyper-parameters defined above so the saved
# weights can be loaded into it
model_l_NNet = NNet(input_size=len(feature_columns), hidden_units=num_hidden_units, output_size=1, dropout=dropout_rate)

# map_location remaps the stored tensors onto the device selected earlier.
# Without it, a checkpoint saved on a GPU cannot be loaded on a CPU-only
# runtime, which this notebook explicitly supports as a fallback.
model_l_NNet.load_state_dict(torch.load(laucModel, map_location=device))
model_l_NNet.to(device)

# Put the model in evaluation mode before scoring; as the last expression of
# the cell, the notebook also displays the returned model
model_l_NNet.eval()

In [None]:
# Run the scoring helper on the blind holdout CSV, passing it the scaler file,
# the feature list, the device, the loaded model, the columns to keep and the
# output path.
# NOTE(review): blind_holdout_score_NNet is not defined in this notebook;
# presumably it comes from fico_functions and writes blindholdoutsaveCSV -- confirm.
df_blind_holdout = blind_holdout_score_NNet(
    blindholdoutCSV,
    scaleFile,
    feature_columns,
    device,
    model_l_NNet,
    saveFeatures_blind,
    blindholdoutsaveCSV,
)

# Preview the first rows of the returned result
df_blind_holdout.head()