In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import dask.dataframe as dd
from geopy.distance import geodesic
import string

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Relative path to a csv folder. NOTE(review): the name `dir` shadows Python's built-in dir(),
# and no cell visible in this review reads it — confirm whether later cells depend on it.
dir = '../csv/'
# Base file name (no extension); the load and export cells append their own suffixes to it.
fname = 'credit_card_fraud_sample_2019_Dec_100K'

In [None]:
# Load the prepared frame from '<fname>_prepared.pkl'; the path is relative, so it is
# resolved against the current working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source.
df_eda = pd.read_pickle(fname + '_prepared.pkl')
# Preview the first rows.
df_eda.head()

In [None]:
# Print the frame's column names, dtypes, non-null counts and memory usage.
df_eda.info()

*Functions*

In [None]:
def plot_categorical(df_in, col, target='is_fraud', top_n=10):
    """
    Draw horizontal count bars for the most frequent values of `col`, split by `target`.

    Parameters
    ----------
    df_in : DataFrame containing both `col` and `target`.
    col : name of the column whose categories are counted.
    target : column passed to seaborn as `hue` (default 'is_fraud').
    top_n : number of most frequent categories to display.

    The figure is shown with plt.show(); nothing is returned.
    """
    # value_counts() sorts by frequency, so the first top_n index entries are
    # the categories to display and also define the bar order.
    shown_categories = df_in[col].value_counts().index[:top_n]

    plt.figure(figsize=(10, 6))
    sns.countplot(y=col, hue=target, order=shown_categories, data=df_in)
    plt.title(f'Count of {col} with Fraud')
    plt.show()

def plot_numerical(df_in, col, target='is_fraud', bins=30):
    """
    Draw a histogram with a KDE overlay for `col`, coloured by `target`.

    Parameters
    ----------
    df_in : DataFrame containing both `col` and `target`.
    col : name of the numerical column to plot.
    target : column passed to seaborn as `hue` (default 'is_fraud').
    bins : number of histogram bins.

    The figure is shown with plt.show(); nothing is returned.
    """
    histogram_options = dict(x=col, hue=target, bins=bins, kde=True)

    plt.figure(figsize=(10, 6))
    sns.histplot(df_in, **histogram_options)
    plt.title(f'{col} Distribution with Fraud')
    plt.show()

def plot_categorical_percent(df_in, col, target='is_fraud', top_n=10):
    """
    Plot a bar chart of the percentage of rows held by the top_n categories in a column.

    Parameters
    ----------
    df_in : DataFrame containing `col`.
    col : name of the categorical column.
    target : accepted for signature parity with the other plot helpers; not used here.
    top_n : number of most frequent categories to display.

    The figure is shown with plt.show(); nothing is returned.
    """
    # Scale the share of rows to 0-100 so the bars match the "Percentage" title.
    # The denominator is every row of df_in; value_counts() skips missing values,
    # so rows with a missing `col` count towards the total but get no bar.
    percentages = df_in[col].value_counts() / len(df_in) * 100

    plt.figure(figsize=(10, 6))
    percentages.head(top_n).plot(kind='bar')
    plt.title(f'Percentage of {col}')
    plt.ylabel('Percent of rows')
    plt.show()

def plot_numerical_percent(df_in, col, target='is_fraud', bins=30):
    """
    Plot a histogram of a numerical column with bar heights in percent.

    Each bar shows the percentage of the non-missing values of `col` that fall
    into that bin, so the bars sum to 100.

    Parameters
    ----------
    df_in : DataFrame containing `col`.
    col : name of the numerical column.
    target : accepted for signature parity with the other plot helpers; not used here.
    bins : number of histogram bins.

    The figure is shown with plt.show(); nothing is returned.
    """
    # Drop missing values up front so the weights line up one-to-one with the data.
    values = df_in[col].dropna()
    n_values = len(values)
    # Weighting every observation by 100/n turns bin counts into percentages.
    # With no data there is nothing to weight, and the division is skipped.
    weights = np.full(n_values, 100.0 / n_values) if n_values else None

    plt.figure(figsize=(10, 6))
    values.plot(kind='hist', bins=bins, weights=weights)

    plt.title(f'{col} Distribution')
    # Replace the default y-axis label, which would still describe raw counts.
    plt.ylabel('Percent of non-missing values')
    plt.show()



***Descriptive***

In [None]:
# Summary statistics of the frame (pandas defaults decide which columns are included).
df_eda.describe()

In [None]:
# Number of rows per distinct value of the is_fraud column.
df_eda['is_fraud'].value_counts()

In [None]:
# List the column names available in the frame.
df_eda.columns

In [None]:
# Wider column selection. A later cell reassigns cols_of_interest to a
# shorter list before the name is used to build the subset.
cols_of_interest = [
    'cc_num',
    'gender',
    'state',
    'zip',
    'city_pop',
    'dob',
    'trans_num',
    'trans_date',
    'trans_time',
    'unix_time',
    'category',
    'amt',
    'is_fraud',
    'region',
    'trans_time_segment',
    'age_group',
    'cc_type',
    'area_cat',
]

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class

In [None]:
# Columns kept for the exported subset.
cols_of_interest = [
    'gender',
    'city_pop',
    'dob',
    'trans_date',
    'trans_time',
    'unix_time',
    'category',
    'amt',
    'is_fraud',
    'region',
    'trans_time_segment',
    'age_group',
    'cc_type',
    'area_cat',
]

# Select those columns and write them to '<fname>_subset.csv' without the index.
df_subset = df_eda[cols_of_interest]
df_subset.to_csv(f'{fname}_subset.csv', index=False)

# Preview the first rows of the subset.
df_subset.head()

In [None]:
## NOTE: Some feature engineering needs to be done before running AutoViz
# Create the AutoViz object used for the automated visualizations.
AV = AutoViz_Class()
# Generate automated visualizations from the CSV at fname + '_subset.csv',
# the same path the df_subset.to_csv(...) call writes to. The result is kept in dft.
# The commented-out keyword arguments below are optional settings left disabled.
dft = AV.AutoViz(
    filename = fname + '_subset.csv', 
    #dfte=df_subset,
    #depVar ='is_fraud',  # Target variable
    #chart_format = 'matplotlib',
    #max_rows_analyzed=10000,  # Limit for performance
    #max_cols_analyzed=15    
)

In [None]:
# Summary statistics and skewness of the distance column.
distance_col = df_eda['distance']
print(distance_col.describe())
# Author's recorded result: -0.24, largely symmetrical with a slight pull to the left.
print('skewness', distance_col.skew())

# Histogram of the distances with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.histplot(distance_col, kde=True, bins=30)

# Bucket edges and one label per bucket (the labels suggest kilometres — TODO confirm the unit).
distance_bins = [0, 20, 50, 100, 150]
distance_labels = [
    "local_0-20_km",
    "close_21-50_km",
    "moderate_51-100_km",
    "far_101-150_km",
]

# Add the bucket label as a new column. include_lowest=True puts an exact 0 in the
# first bucket; pd.cut leaves any value outside the 0-150 edges as NaN.
df_eda['distance_category'] = pd.cut(
    distance_col,
    bins=distance_bins,
    labels=distance_labels,
    include_lowest=True,
)