# Credit Card Fraud Analysis

# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter


# Models 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost import XGBClassifier

# Data Processing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import entropy

# MLP
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Model Processing
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer

# Functions

In [None]:
def spot_check_models(X_train_scaled, X_test_scaled, y_train_encoded, y_test_encoded):
    """
    Fit a set of baseline classifiers and rank them by test-set accuracy.

    This is a quick way to spot check relevant algorithms to gain an
    understanding of the dataset and which models handle the distribution well.

    Args:
        X_train_scaled: 2-D training feature matrix; its ``shape[1]`` sets the
            input width of the neural network.
        X_test_scaled: Test feature matrix with the same columns as
            ``X_train_scaled``.
        y_train_encoded: Training labels. The neural network ends in a single
            sigmoid unit, so labels are expected to be binary 0/1.
        y_test_encoded: Test labels, encoded like ``y_train_encoded``.

    Returns:
        pd.DataFrame: One row per model with the columns "Model" and
        "Accuracy", sorted by accuracy in descending order.
    """
    models = {
        # Probabilistic / generative models
        "GaussianNB": GaussianNB(),
        "LDA": LinearDiscriminantAnalysis(),
        "GPC": GaussianProcessClassifier(),

        # Linear, kernel and instance-based models
        "LogisticRegression": LogisticRegression(),
        "SVC": SVC(),
        "KNeighborsClassifier": KNeighborsClassifier(),

        # Tree-based models and ensembles
        "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
        "RandomForestClassifier": RandomForestClassifier(random_state=42),
        "GradientBoostingClassifier": GradientBoostingClassifier(random_state=42),
        "XGB": XGBClassifier(),
    }

    # Collect one {"Model", "Accuracy"} record per fitted model.
    model_performance = []

    for name, model in models.items():
        model.fit(X_train_scaled, y_train_encoded)
        predictions = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test_encoded, predictions)
        model_performance.append({
            "Model": name,
            "Accuracy": accuracy
        })

    # Small fully connected network, evaluated alongside the classical models.
    sequential_model = Sequential()
    sequential_model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
    sequential_model.add(Dense(32, activation='relu'))
    sequential_model.add(Dense(1, activation='sigmoid'))
    # A single sigmoid output models P(class == 1), which pairs with binary
    # cross-entropy. Sparse categorical cross-entropy expects one output unit
    # per class and is not valid for a one-unit output layer.
    sequential_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    sequential_model.fit(X_train_scaled, y_train_encoded, epochs=50, batch_size=10, verbose=0)
    loss, accuracy = sequential_model.evaluate(X_test_scaled, y_test_encoded)
    model_performance.append({
        "Model": "Sequential",
        "Accuracy": accuracy
    })

    # Convert the collected records to a DataFrame, best model first.
    performance_df = pd.DataFrame(model_performance)
    return performance_df.sort_values(by="Accuracy", ascending=False)


def get_selected_models(names):
    """
    Look up freshly constructed estimators by their short codes.

    Args:
        names: Iterable of codes chosen from "LDA", "GPC", "GNB", "SVC",
            "LR", "KNN", "DTC", "GBC", "RFC" and "XGB".

    Returns:
        list: The estimators matching ``names``, in the order requested.

    Raises:
        KeyError: If a code is not one of the known keys.
    """
    # Every estimator is built up front; the lookup below only picks from them.
    registry = dict(
        LDA=LinearDiscriminantAnalysis(),
        GPC=GaussianProcessClassifier(),
        GNB=GaussianNB(),
        SVC=SVC(),
        LR=LogisticRegression(max_iter=1000),
        KNN=KNeighborsClassifier(),
        DTC=DecisionTreeClassifier(),
        GBC=GradientBoostingClassifier(),
        RFC=RandomForestClassifier(),
        XGB=XGBClassifier(),
    )

    return [registry[code] for code in names]

def evaluate_model(X, y, model, metric):
    """
    Score ``model`` with repeated stratified k-fold cross-validation.

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        y: Target labels used for fitting and for stratifying the folds.
        model: Estimator (or pipeline) to evaluate.
        metric: Value forwarded as ``scoring`` to ``cross_val_score``.

    Returns:
        The scores returned by ``cross_val_score``, one per split
        (10 folds repeated 3 times).
    """
    # Fixed random_state keeps the fold assignment identical between calls.
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # n_jobs=-1 spreads the folds over all available cores.
    return cross_val_score(model, X, y, scoring=metric, cv=splitter, n_jobs=-1)

def labels_to_probabilities(y):
    """
    Return the relative frequency of each distinct value in ``y``.

    Args:
        y: Array-like of labels.

    Returns:
        np.ndarray: Frequencies that sum to 1, ordered like the sorted unique
        values produced by ``np.unique``.
    """
    # Only the counts are needed; the unique values themselves are discarded.
    frequencies = np.unique(y, return_counts=True)[1]
    return frequencies / frequencies.sum()

def calculate_entropy(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the base-2 entropy of the value distribution of every column.

    Args:
        df (pd.DataFrame): Frame whose columns are analysed one at a time.

    Returns:
        pd.DataFrame: A single row labelled 'entropy' holding one value per
        column of ``df``.
    """
    # Each column maps to a nested {'entropy': value} dict so the resulting
    # frame keeps the input columns as columns and 'entropy' as the row label.
    per_column = {
        name: {'entropy': entropy(labels_to_probabilities(df[name]), base=2)}
        for name in df.columns
    }
    return pd.DataFrame(per_column)

# EDA Analysis

# Read in a subset of the data and try to match the class imbalance in the original dataset.

In [13]:
import pandas as pd
import numpy as np

# Target size of the subsample and the majority:minority ratio to reproduce.
total_samples = 10000
ratio = 258

# Per-class quotas: one positive for every `ratio` negatives.
positives_needed = total_samples // (ratio + 1)
negatives_needed = total_samples - positives_needed

# Running totals of rows collected so far for each class.
positives_count = 0
negatives_count = 0

sampled_data = []

chunk_size = 10000

# A single seeded generator drives every draw and the final shuffle, so the
# subsample is identical each time the cell is run.
rng = np.random.RandomState(42)

# Stream the CSV in chunks and fill each class quota from the chunks in file
# order. Rows are drawn at random within a chunk, but the quotas are filled
# greedily, so the subsample comes from the earliest chunks of the file.
for chunk in pd.read_csv('data.csv', chunksize=chunk_size):
    # Separate positive and negative cases
    positives = chunk[chunk['is_fraud'] == 1]
    negatives = chunk[chunk['is_fraud'] == 0]

    # Never request more rows than the chunk holds or than the quota still needs.
    positives_sample = positives.sample(
        n=min(len(positives), positives_needed - positives_count),
        random_state=rng,
    )
    negatives_sample = negatives.sample(
        n=min(len(negatives), negatives_needed - negatives_count),
        random_state=rng,
    )

    positives_count += len(positives_sample)
    negatives_count += len(negatives_sample)

    sampled_data.append(positives_sample)
    sampled_data.append(negatives_sample)

    # Break if we have enough samples
    if positives_count >= positives_needed and negatives_count >= negatives_needed:
        break

# Concatenate all sampled data into a single DataFrame
final_sample = pd.concat(sampled_data, ignore_index=True)

# Shuffle the final sample to mix positive and negative cases
df = final_sample.sample(frac=1, random_state=rng).reset_index(drop=True)

In [14]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [15]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [16]:
# Select the label column by name: this does not depend on column order and
# avoids converting the whole mixed-dtype frame to an object array.
target = df["is_fraud"].to_numpy()

# Tally how many rows belong to each class.
counter = Counter(target)

for k, v in counter.items():
    per = v / len(target) * 100
    print("Class=%d, Count=%d, Percentage=%.3f%%" % (k, v, per))

Class=0, Count=9962, Percentage=99.620%
Class=1, Count=38, Percentage=0.380%


# Important lesson for ML engineers: the class balance is 258:1 in the original dataset and 262:1 in our sample. This means that for every 258 examples of the majority class, there is only one from the minority class. Posting a 99% accuracy score shows a misunderstanding of the problem at hand: a model that always predicts the majority class reaches that accuracy without detecting a single fraud case. Because there are so few fraud examples, a model can easily fail to predict the minority class, which is costly to the business.

## Business Problem:

### Fraud cases are costly to banks; false negatives will be far more costly to their bottom line than false positives.

* Accuracy and F1 scores are not the right metrics in this instance
* Accuracy and F1 scores do not penalize false negatives more heavily than false positives
* F2 scores are what should be measured, since they weight recall more heavily than precision and therefore penalize false negatives more than false positives.

In [17]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                   float64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [None]:
# "Unnamed: 0" is presumably a leftover index column from the CSV export.
# errors="ignore" keeps this cell safe to re-run once the column is gone.
df.drop(columns="Unnamed: 0", inplace=True, errors="ignore")

In [30]:
# Encode gender as 0 for "F" and 1 for anything else. The dtype guard makes
# the cell safe to re-run: once the column is numeric, re-applying the
# comparison would turn every row into 1.
if not pd.api.types.is_numeric_dtype(df["gender"]):
    df["gender"] = (df["gender"] != "F").astype("int64")

# Parse day-first timestamps; errors='coerce' turns strings that do not match
# the format into NaT instead of raising.
df["trans_date_trans_time"] = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M', errors='coerce')

In [18]:
# Numeric columns (float64 / int64) are treated as continuous features.
continuous_features = df.select_dtypes(include=["float64", "int64"])
# Object-dtype columns are treated as categorical features.
categorical_features = df.select_dtypes(include=["object"])

In [31]:
# Display the current state of the DataFrame.
df

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,23/06/2020 16:19,4.471570e+12,fraud_Bednar Inc,travel,7.68,Dakota,Maldonado,1,369 Cochran Radial,Pelham,...,36.4899,-79.4736,3402,Insurance underwriter,24/10/1927,3b01a118df505bc2bb4875f343bd7e04,1372004387,37.249198,-79.646847,0
1,22/06/2020 15:28,2.131910e+14,fraud_Stiedemann Inc,misc_pos,2.02,Thomas,Sullivan,1,464 Newman Crossroad,Milwaukee,...,42.9676,-88.0434,817312,"Accountant, chartered public finance",18/03/2004,63fd118485a44a98b71536b7881a2d05,1371914904,42.608918,-88.576507,0
2,22/06/2020 05:59,2.131550e+14,"fraud_Langworth, Boehm and Gulgowski",shopping_net,83.65,Christopher,Sheppard,1,39218 Baker Shoals,Bristow,...,38.1981,-86.6821,965,Horticultural therapist,10/02/1982,a82c427805a1422d16d0a073ceb59a6a,1371880785,38.555339,-87.097891,0
3,22/06/2020 15:20,3.557440e+15,fraud_Homenick LLC,personal_care,4.49,Angela,Edwards,0,486 Wilkins Pines,Lithopolis,...,39.8013,-82.8125,460,"Lecturer, higher education",19/06/2004,d39334f392356facced64abef5261bb5,1371914426,40.408631,-83.006333,0
4,23/06/2020 04:07,2.131260e+14,fraud_Mayert Group,shopping_pos,5.05,Adam,Kirk,1,40847 Stark Junctions,Big Indian,...,42.0740,-74.4530,397,Psychiatrist,12/09/1931,a4fdbe2bc80040e9c4e362a38f0d7c38,1371960446,42.984988,-73.585238,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,21/06/2020 19:53,4.158950e+15,fraud_Graham and Sons,health_fitness,67.60,Justin,Bell,1,5323 Walker Island,Pittsburgh,...,40.4308,-79.9205,687276,"Scientist, marine",19/10/1973,70dc16ae615907de38e707cfa4f4a055,1371844381,40.944663,-80.589163,0
9996,22/06/2020 17:52,2.280870e+15,fraud_Schiller Ltd,personal_care,133.54,Katherine,Cooper,0,3854 Lauren Springs Suite 648,Oakford,...,40.0994,-89.9601,530,Transport planner,23/09/1967,3934214a2dbb60658d6983e8782f2867,1371923520,40.473806,-89.996660,0
9997,23/06/2020 04:43,4.079770e+12,"fraud_Kihn, Abernathy and Douglas",shopping_net,303.70,Eric,Preston,1,7020 Doyle Stream Apt. 951,Mesa,...,44.6255,-116.4493,129,Cartographer,15/12/1965,631b0a3824ade68c9b3b03c6ae0c7109,1371962594,43.829699,-115.465849,0
9998,22/06/2020 06:43,6.390770e+11,"fraud_Olson, Becker and Koch",gas_transport,69.94,Justin,Carter,1,853 Miller Bypass Suite 802,Broomfield,...,39.8854,-105.1139,92337,"Engineer, electronics",18/10/1967,e46d2832b94c793adc89c64753755358,1371883417,40.471440,-105.890639,0


In [27]:
# Print the distinct values of every categorical column, one array per column.
for column_name in categorical_features.columns:
    print(categorical_features[column_name].unique())

['23/06/2020 16:19' '22/06/2020 15:28' '22/06/2020 05:59' ...
 '22/06/2020 11:11' '22/06/2020 09:23' '21/06/2020 17:20']
['fraud_Bednar Inc' 'fraud_Stiedemann Inc'
 'fraud_Langworth, Boehm and Gulgowski' 'fraud_Homenick LLC'
 'fraud_Mayert Group' 'fraud_Cormier LLC' 'fraud_Hauck, Dietrich and Funk'
 'fraud_Botsford PLC' 'fraud_Goyette, Howell and Collier'
 'fraud_Padberg-Rogahn' 'fraud_Swaniawski, Nitzsche and Welch'
 'fraud_Bogisich-Weimann' 'fraud_Watsica, Haag and Considine'
 'fraud_Kulas Group' 'fraud_Okuneva, Schneider and Rau'
 'fraud_Jast-McDermott' 'fraud_Kuhic LLC' 'fraud_Johns Inc'
 'fraud_Denesik, Powlowski and Pouros' 'fraud_Durgan-Auer'
 'fraud_Adams, Kovacek and Kuhlman' 'fraud_Marks Inc'
 'fraud_Abbott-Steuber' 'fraud_Gislason Group' 'fraud_Gleason and Sons'
 'fraud_Eichmann, Bogan and Rodriguez' 'fraud_Bahringer Group'
 'fraud_Friesen-Stamm' 'fraud_Prohaska-Murray'
 'fraud_Hintz, Bauch and Smith' 'fraud_Breitenberg-Hermiston'
 'fraud_Torp-Labadie' 'fraud_Schuppe-Schuppe

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
226,10167,24/06/2020 02:56,180099000000000.0,fraud_Murray-Smitham,grocery_pos,287.38,Dennis,Davidson,M,08469 Trujillo Forge,...,42.52,-78.6847,7728,"Scientist, research (maths)",30/03/1959,a623c5c05b514e3d56ee333bd5565adc,1372042610,42.012741,-79.204228,1
361,1781,21/06/2020 22:37,6564460000000000.0,fraud_Nienow PLC,entertainment,620.33,Douglas,Willis,M,619 Jeremy Garden Apt. 681,...,42.5545,-90.3508,1306,Public relations officer,10/09/1958,47a9987ae81d99f7832a54b29a77bf4b,1371854247,42.771834,-90.158365,1
1084,18142,27/06/2020 19:47,3573390000000000.0,fraud_Volkman Ltd,misc_net,667.17,Stephanie,Murphy,F,526 Stacy Walks,...,45.7205,-98.5534,63,Systems developer,30/10/1969,276495f58de3af46843647fdd00843eb,1372362479,46.0403,-97.71046,1
1113,2495,22/06/2020 03:48,3524570000000000.0,fraud_Skiles-Ankunding,grocery_net,13.25,Ashley,Cabrera,F,94225 Smith Springs Apt. 617,...,27.633,-80.4031,105638,"Librarian, public",07/05/1986,d40a602f3d62fc3849bacd25c413952b,1371872900,27.874482,-80.381534,1
1230,11780,24/06/2020 23:15,180099000000000.0,"fraud_Baumbach, Hodkiewicz and Walsh",shopping_pos,741.77,Dennis,Davidson,M,08469 Trujillo Forge,...,42.52,-78.6847,7728,"Scientist, research (maths)",30/03/1959,b971b65d32c1d9b36223370bb5744a94,1372115728,43.225233,-77.814218,1
1359,14678,26/06/2020 09:31,6544730000000000.0,fraud_Kunze Inc,grocery_pos,309.04,John,Myers,M,701 Wilson Divide,...,41.55,-87.4569,23727,Community arts worker,08/02/1982,442a3b75f70a884b86275f9db67f17a5,1372239093,40.821649,-86.546862,1
1436,1784,21/06/2020 22:38,4005680000000000.0,"fraud_Heathcote, Yost and Kertzmann",shopping_net,1077.69,William,Perry,M,458 Phillips Island Apt. 768,...,30.459,-90.9027,71335,Herbalist,31/05/1994,fe956c7e4a253c437c18918bf96f7b62,1371854335,31.204974,-90.261595,1
1636,16231,26/06/2020 23:38,6544730000000000.0,fraud_Kerluke-Abshire,shopping_net,1025.61,John,Myers,M,701 Wilson Divide,...,41.55,-87.4569,23727,Community arts worker,08/02/1982,92cb4901841afb9c374df14d36dfac38,1372289890,41.802631,-88.389592,1
1697,14064,26/06/2020 01:51,213161000000000.0,fraud_Kilback LLC,grocery_pos,315.72,Alyssa,Morgan,F,622 Robin Run Suite 764,...,34.048,-85.9246,67082,Physiological scientist,09/02/1963,79f7c82fe50e90466309135ad1b793d9,1372211515,34.843287,-85.800046,1
1905,2026,21/06/2020 23:59,6564460000000000.0,fraud_Lemke-Gutmann,shopping_net,955.16,Douglas,Willis,M,619 Jeremy Garden Apt. 681,...,42.5545,-90.3508,1306,Public relations officer,10/09/1958,1799ffe421a2a05dc87903e379140f55,1371859145,43.027879,-90.493768,1
