In [2]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

### Reading csv file and getting the dataframe

In [3]:
# Load the training transactions; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows `Unnamed: 0.1` / `Unnamed: 0` columns and `cc_num`
# in scientific notation (i.e. parsed as float) — confirm whether the CSV was re-exported
# with its index and whether the card numbers are still exact.
df = pd.read_csv('../csvs/fraudTrain.csv')
df

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,01/01/2019 00:00,2.703190e+15,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",09/03/1988,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,01/01/2019 00:00,6.304230e+11,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,21/06/1978,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,01/01/2019 00:00,3.885950e+13,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.2620,4154,Nature conservation officer,19/01/1962,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,01/01/2019 00:01,3.534090e+15,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,12/01/1967,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,01/01/2019 00:03,3.755340e+14,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,28/03/1986,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,1048570,10/03/2020 16:07,6.011980e+15,fraud_Fadel Inc,health_fitness,77.00,Haley,Wagner,F,05561 Farrell Crescent,...,39.0305,-76.5515,92106,"Accountant, chartered certified",28/05/1943,45ecd198c65e81e597db22e8d2ef7361,1362931649,38.779464,-76.317042,0
1048571,1048571,10/03/2020 16:07,4.839040e+15,"fraud_Cremin, Hamill and Reichel",misc_pos,116.94,Meredith,Campbell,F,043 Hanson Turnpike,...,41.1826,-92.3097,1583,Geochemist,28/06/1999,c00ce51c6ebb7657474a77b9e0b51f34,1362931670,41.400318,-92.726724,0
1048572,1048572,10/03/2020 16:08,5.718440e+11,"fraud_O'Connell, Botsford and Hand",home,21.27,Susan,Mills,F,005 Cody Estates,...,38.2507,-85.7476,736284,Engineering geologist,02/04/1952,17c9dc8b2a6449ca2473726346e58e6c,1362931711,37.293339,-84.798122,0
1048573,1048573,10/03/2020 16:08,4.646850e+18,fraud_Thompson-Gleason,health_fitness,9.52,Julia,Bell,F,576 House Crossroad,...,40.7320,-73.1000,4056,Film/video editor,25/06/1990,5ca650881b48a6a38754f841c23b77ab,1362931718,39.773077,-72.213209,0


In [None]:
# Class balance of the raw data: number of rows for each `is_fraud` value.
df['is_fraud'].value_counts()

In [None]:
# Parse the day-first timestamps (e.g. "01/01/2019 00:00") into datetime64.
# `errors='coerce'` turns anything that does not match the format into NaT, so
# report how many values were lost instead of letting them vanish silently.
raw_timestamps = df['trans_date_trans_time']
parsed_timestamps = pd.to_datetime(raw_timestamps, format='%d/%m/%Y %H:%M', errors='coerce')

n_unparsed = int((parsed_timestamps.isna() & raw_timestamps.notna()).sum())
if n_unparsed:
    print(f"Warning: {n_unparsed} timestamps did not match '%d/%m/%Y %H:%M' and were set to NaT")

df['trans_date_trans_time'] = parsed_timestamps


### Creating balanced dataframe (equal fraud and non-fraud)

In [None]:
# Split the transactions by label.
fraud_trans = df.loc[df['is_fraud'].eq(1)]
non_fraud_trans = df.loc[df['is_fraud'].eq(0)]

# Down-sample the non-fraud rows to the number of fraud rows. A seeded random
# sample is used; an earlier attempt,
# `new_non_fraud = non_fraud_trans.iloc[:len_fraud]`, simply took the first rows.
len_fraud = fraud_trans.shape[0]
rand_non_fraud = non_fraud_trans.sample(n=len_fraud, random_state=42)

# Stack both halves into one frame with equal class counts.
balanced_df = pd.concat([fraud_trans, rand_non_fraud])
balanced_df

In [None]:
# Put the balanced rows back in `unix_time` order and renumber the index from 0.
balanced_df = balanced_df.sort_values(by='unix_time', ignore_index=True)
balanced_df

In [None]:
# Sanity check: both `is_fraud` values should now have the same number of rows.
balanced_df['is_fraud'].value_counts()

### Analysing data types of balanced_df

In [None]:
# dtype of every column in the balanced frame.
balanced_df.dtypes

In [None]:
# Collect every column whose dtype is anything other than `object`
# (ints, floats, and the parsed datetime column) ...
non_objects = [col for col, dtype in balanced_df.dtypes.items() if dtype != object]
print(non_objects)

# ... and drop them, leaving only the object-typed columns. This frame is used
# to decide how those columns should be encoded in the real dataframe.
object_df = balanced_df.drop(columns=non_objects)
object_df

In [None]:
# Names of the object-typed columns that remain in `object_df`.
print(object_df.columns)

### Box plots of streets
The cells below build two arrays holding the number of times each street name appears (one for fraudulent and one for non-fraudulent transactions). Conclusion: there is not really anything to glean from them.

In [None]:
# Number of fraudulent transactions per street address.
# Counting on ['street', 'is_fraud'] as well added nothing: every row of
# `fraud_trans` has is_fraud == 1, and a cell only displays its last expression,
# so that result was computed and thrown away.
fraud_trans['street'].value_counts()

In [None]:
# Number of sampled non-fraudulent transactions per street address.
# Counting on ['street', 'is_fraud'] as well added nothing: every row of
# `rand_non_fraud` has is_fraud == 0, and a cell only displays its last
# expression, so that result was computed and thrown away.
rand_non_fraud['street'].value_counts()

In [None]:
# Occurrence count of each street among the fraudulent transactions, as a plain array.
fraud_streets = fraud_trans['street'].value_counts().to_numpy()

In [None]:
# Box plot of how often each street appears among the fraudulent transactions.
fig, ax = plt.subplots(figsize=(15, 15))
ax.boxplot(fraud_streets)
plt.show()

In [None]:
# Same per-street occurrence counts, this time over ALL non-fraudulent transactions.
non_fraud_streets = non_fraud_trans['street'].value_counts().to_numpy()
plt.boxplot(non_fraud_streets)
plt.show()

# Note: this plot cannot be compared directly with the fraud one, because there
# are far more non-fraudulent transactions. It is still better than using
# new_non_fraud, which is self-made.

### Analysing purpose of transaction
Might try bar charts? Or boxplots?
Have concluded from a bar chart that more fraud is under misc_net or grocery_pos.

However, the fraud figures are compared with a random sample of the non-fraud data points: is this OK?

In [None]:
# Transactions per category over all non-fraudulent rows.
non_fraud_trans['category'].value_counts()
# Alternative kept for reference: the same counts on the down-sampled subset.
# rand_non_fraud['category'].value_counts()

In [None]:
# Transactions per category over the fraudulent rows.
fraud_trans['category'].value_counts()
# Observations: the high number of online (_net) fraud transactions was expected.
# The highest count being grocery_pos was not (is this why I get stopped at the
# checkout so much?), nor was the low number of grocery_net.

In [None]:
# Grouped bar chart: share of transactions per category, non-fraud vs fraud.
categories = df['category'].unique().tolist()

# value_counts() orders categories by frequency, and that order differs between
# the two subsets (and from `categories`). Reindex both onto the shared
# `categories` order so every bar sits above its own tick label and each
# blue/red pair describes the same category. A category that is missing from a
# subset counts as 0.
non_fraud_cats = (
    non_fraud_trans['category'].value_counts().reindex(categories, fill_value=0)
    / len(non_fraud_trans) * 100
).to_numpy()
fraud_cats = (
    fraud_trans['category'].value_counts().reindex(categories, fill_value=0)
    / len(fraud_trans) * 100
).to_numpy()

x_axis = np.arange(len(categories))

fig = plt.figure(figsize=(20, 7))
# Two 0.4-wide bars per category, shifted left/right of the tick position.
plt.bar(x_axis - 0.2, non_fraud_cats, 0.4, label='Non-fraud', color='b')
plt.bar(x_axis + 0.2, fraud_cats, 0.4, label='Fraud', color='r')

plt.xticks(x_axis, categories)
plt.xlabel("Purpose of transaction")
plt.ylabel("Percentage")
plt.title("Percentages of transactions for different purposes")
plt.legend()
plt.show()

### Analysing amount spent

In [None]:
# Largest non-fraudulent transaction amount. Series.max() is vectorised and
# skips NaN, unlike the builtin max(), which loops in Python and whose result
# depends on where a NaN happens to sit.
non_fraud_trans['amt'].max()

In [None]:
# Largest fraudulent transaction amount. Series.max() is vectorised and skips
# NaN, unlike the builtin max(), which loops in Python and whose result depends
# on where a NaN happens to sit.
fraud_trans['amt'].max()

In [None]:
# Spread of transaction amounts across the non-fraudulent transactions.
non_fraud_amounts = non_fraud_trans['amt'].to_list()
plt.boxplot(non_fraud_amounts)
plt.show()

In [None]:
# Side-by-side box plots of transaction amounts: fraud first, then non-fraud.
fraud_amounts = fraud_trans['amt'].to_list()
non_fraud_amounts = non_fraud_trans['amt'].to_list()
amounts = [fraud_amounts, non_fraud_amounts]

plt.boxplot(
    amounts,
    tick_labels=['fraud', 'non-fraud'],
)
plt.show()

### Random analysis
Includes: analysing jobs, attempt at feature engineering

In [None]:
# Distinct job titles that appear on fraudulent transactions.
fraud_trans['job'].unique().tolist()

In [None]:
# Distinct job titles that appear on non-fraudulent transactions.
non_fraud_trans['job'].unique().tolist()

In [None]:
# Job titles seen on fraudulent and on non-fraudulent transactions respectively.
fraud_jobs = fraud_trans['job'].unique().tolist()
non_fraud_jobs = non_fraud_trans['job'].unique().tolist()

# Jobs present in both lists, kept in the order they occur in `fraud_jobs`.
both_jobs = [job for job in fraud_jobs if job in non_fraud_jobs]

both_jobs

In [None]:
# How many sampled non-fraudulent transactions each card number accounts for.
rand_non_fraud['cc_num'].value_counts()

In [None]:
# Histogram of transaction amounts, coloured by label, with a marginal box plot.
fig = px.histogram(
    balanced_df,
    x='amt',
    color='is_fraud',
    color_discrete_map={0: 'blue', 1: 'red'},
    marginal='box',
    title='Transaction Amount Distribution (Fraud vs Non-Fraud)',
)
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
# Box plot of transaction amount per category, split by label.
fig = px.box(
    balanced_df,
    x='category',
    y='amt',
    color='is_fraud',
    color_discrete_map={0: 'blue', 1: 'red'},
    title='Transaction Amount by Category (Fraud vs Non-Fraud)',
)
fig.show()


In [1]:
# Hour-of-day histogram, coloured by label, with a marginal box plot.
# This cell needs `balanced_df` (with the parsed `trans_date_trans_time`) from
# the cells above; running it on a fresh kernel raises NameError.
balanced_df['hour'] = balanced_df['trans_date_trans_time'].dt.hour
fig = px.histogram(
    balanced_df,
    x='hour',
    color='is_fraud',
    marginal='box',
    title='Fraud Transactions by Hour of the Day',
    # Same palette as the other charts (0 = blue, 1 = red); without it the
    # colours are assigned by order of appearance.
    color_discrete_map={0: 'blue', 1: 'red'},
)
fig.update_layout(bargap=0.1)
fig.show()

NameError: name 'balanced_df' is not defined

In [None]:
# One box of transaction amounts per label value.
axis_labels = {'is_fraud': 'Fraud Status', 'amt': 'Transaction Amount'}
fig = px.box(
    balanced_df, x='is_fraud', y='amt',
    color='is_fraud', color_discrete_map={0: 'blue', 1: 'red'},
    title='Transaction Amount by Fraud Status', labels=axis_labels,
)
fig.show()

In [None]:
# `amount_over_time` is not defined in any visible earlier cell, so this cell
# failed with NameError on Run All. Build it here with the columns the chart
# needs ('date', 'amt', 'is_fraud').
# NOTE(review): assumes the intended aggregation is the total amount per
# calendar day and label — confirm (mean or a different period may be wanted).
amount_over_time = (
    balanced_df
    .assign(date=balanced_df['trans_date_trans_time'].dt.floor('D'))
    .groupby(['date', 'is_fraud'], as_index=False)['amt']
    .sum()
)

fig = px.line(amount_over_time, x='date', y='amt', color='is_fraud',
              title='Amount Spent Over Time (Fraud vs Non-Fraud)',
              labels={'is_fraud': 'Fraud Status', 'amt': 'Amount Spent'})
fig.show()

In [None]:

# Map of cardholder locations coloured by label.
# `is_fraud` is numeric, so plotly drew it on a continuous scale; with 'RdBu'
# that painted fraud (1) blue and non-fraud (0) red, the reverse of every other
# chart. Plot a string copy of the label so the shared discrete palette applies.
# `balanced_df` itself is left untouched.
geo_df = balanced_df.assign(is_fraud=balanced_df['is_fraud'].astype(str))

fig = px.scatter_geo(
    geo_df,
    lat='lat',
    lon='long',
    color='is_fraud',
    hover_name='cc_num',
    color_discrete_map={'0': 'blue', '1': 'red'},
    title='Fraud vs Non-Fraud by Latitude and Longitude',
)
# Title and land display are each set once here; the extra update_layout call
# only repeated the same values.
fig.update_geos(showcoastlines=True, coastlinecolor='Black', showland=True, landcolor='lightgray')
fig.show()
