In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

### Reading csv file and getting the dataframe

In [None]:
# Load the training transactions; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../csvs/fraudTrain.csv')
# Bare last expression so the notebook renders the frame.
df

In [None]:
# Row count per is_fraud label in the raw data.
df['is_fraud'].value_counts()

In [None]:
# Convert the timestamp column from text to datetime64 using a day/month/year layout.
# NOTE(review): errors='coerce' turns every value that does not match the format
# into NaT without raising -- confirm the CSV really stores '%d/%m/%Y %H:%M'.
parsed_times = pd.to_datetime(
    df['trans_date_trans_time'],
    format='%d/%m/%Y %H:%M',
    errors='coerce',
)
df['trans_date_trans_time'] = parsed_times


### Creating balanced dataframe (equal fraud and non-fraud)

In [None]:
# Split the raw frame by label.
is_fraud_mask = df['is_fraud'] == 1
is_legit_mask = df['is_fraud'] == 0
fraud_trans = df[is_fraud_mask]
non_fraud_trans = df[is_legit_mask]

# Down-sample the non-fraud rows to the number of fraud rows. A seeded random
# sample is used instead of simply taking the first len_fraud rows
# (non_fraud_trans.iloc[:len_fraud]).
len_fraud = len(fraud_trans)
rand_non_fraud = non_fraud_trans.sample(n=len_fraud, random_state=42)

# Stack both halves: fraud rows first, then the sampled non-fraud rows.
balanced_df = pd.concat([fraud_trans, rand_non_fraud])
balanced_df

In [None]:
# Order the balanced rows by unix_time and renumber the index from 0
# (ignore_index=True discards the old row labels).
balanced_df = balanced_df.sort_values(by='unix_time', ignore_index=True)
balanced_df

In [None]:
# Row count per is_fraud label after balancing.
balanced_df['is_fraud'].value_counts()

### Analysing data types of balanced_df

In [None]:
# dtype of every column in the balanced frame.
balanced_df.dtypes

In [None]:
# Keep only the object-dtype columns (i.e. drop every column whose dtype is not
# object); this subset is inspected to decide how to encode the real frame.
non_objects = [
    column
    for column, column_dtype in balanced_df.dtypes.items()
    if column_dtype != object
]
print(non_objects)
object_df = balanced_df.drop(columns=non_objects)
object_df

In [None]:
# Names of the object-dtype columns kept in object_df.
print(object_df.columns)

### Box plots of streets
These are two arrays containing the number of times each street name is repeated. Conclusion: there is not really anything to glean from this.

In [None]:
# Number of fraud transactions per street.
# The earlier (street, is_fraud) count was removed: its result was never
# displayed (only a cell's last expression is rendered), and is_fraud is 1 for
# every row of fraud_trans, so it carried the same information as this count.
fraud_trans['street'].value_counts()

In [None]:
# Number of sampled non-fraud transactions per street.
# The earlier (street, is_fraud) count was removed: its result was never
# displayed (only a cell's last expression is rendered), and is_fraud is 0 for
# every row of rand_non_fraud, so it carried the same information as this count.
rand_non_fraud['street'].value_counts()

In [None]:
# Per-street fraud transaction counts as a plain NumPy array (street names dropped).
fraud_street_counts = fraud_trans['street'].value_counts()
fraud_streets = fraud_street_counts.to_numpy()

In [None]:
# Box plot of how many fraud transactions each street has.
fig, ax = plt.subplots(figsize=(15, 15))
ax.boxplot(fraud_streets)
plt.show()

In [None]:
# Same per-street count, but over every non-fraud transaction (not the sample).
non_fraud_street_counts = non_fraud_trans['street'].value_counts()
non_fraud_streets = np.array(non_fraud_street_counts)
ax = plt.gca()
ax.boxplot(non_fraud_streets)
plt.show()

# note: cannot compare these two boxplots, there are way too many non-fraudulent transactions. But this is better than using new_non_fraud, which is self-made

### Analysing purpose of transaction
Might try bar charts? Or boxplots?
Have concluded from a bar chart that more fraud is under misc_net or grocery_pos.

However, this is compared against a random sample of non-fraud data points: is this OK?

In [None]:
# Transactions per category over all non-fraud rows.
non_fraud_trans['category'].value_counts()
# Alternative using only the balanced sample: rand_non_fraud['category'].value_counts()

In [None]:
# Transactions per category over the fraud rows.
fraud_trans['category'].value_counts()
# Author's observations from this output: the expected high number of online (_net)
# fraud transactions is there, but grocery_pos being the highest was unexpected
# (is this why I get stopped at the checkout so much?), as was the low number of grocery_net.

In [None]:
# One shared category order, used for the tick labels AND for both bar series.
categories = df['category'].unique().tolist()

# value_counts() sorts by frequency, so each subset would otherwise come back in
# its own order and the bars would sit under the wrong tick labels. Reindexing to
# `categories` aligns every bar with its label; a category missing from a subset
# gets a count of 0.
non_fraud_counts = non_fraud_trans['category'].value_counts().reindex(categories, fill_value=0)
fraud_counts = fraud_trans['category'].value_counts().reindex(categories, fill_value=0)

# Share of each category within its own class, as a percentage.
non_fraud_cats = non_fraud_counts.to_numpy() / len(non_fraud_trans) * 100
fraud_cats = fraud_counts.to_numpy() / len(fraud_trans) * 100

x_axis = np.arange(len(categories))

fig = plt.figure(figsize=(20, 7))
# Two bars per category, offset left/right of the tick position.
plt.bar(x_axis - 0.2, non_fraud_cats, 0.4, label='Non-fraud', color='b')
plt.bar(x_axis + 0.2, fraud_cats, 0.4, label='Fraud', color='r')

plt.xticks(x_axis, categories)
plt.xlabel("Purpose of transaction")
plt.ylabel("Percentage")
plt.title("Percentages of transactions for different purposes")
plt.legend()
plt.show()

### Analysing amount spent

In [None]:
# Largest non-fraud transaction amount. Series.max() is vectorised and skips NaN,
# whereas the builtin max() walks the Series element by element in Python.
non_fraud_trans['amt'].max()

In [None]:
# Largest fraud transaction amount. Series.max() is vectorised and skips NaN,
# whereas the builtin max() walks the Series element by element in Python.
fraud_trans['amt'].max()

In [None]:
# Box plot of the amounts of all non-fraud transactions.
non_fraud_amounts = list(non_fraud_trans['amt'].tolist())
ax = plt.gca()
ax.boxplot(non_fraud_amounts)
plt.show()

In [None]:
# Side-by-side box plots of transaction amount: fraud first, non-fraud second.
amounts = [subset['amt'].tolist() for subset in (fraud_trans, non_fraud_trans)]
fraud_amounts, non_fraud_amounts = amounts
ax = plt.gca()
ax.boxplot(amounts, tick_labels=['fraud', 'non-fraud'])
plt.show()

### Random analysis
Includes: analysing jobs, attempt at feature engineering

In [None]:
# Distinct job values that appear in the fraud rows.
fraud_trans['job'].unique().tolist()

In [None]:
# Distinct job values that appear in the non-fraud rows.
non_fraud_trans['job'].unique().tolist()

In [None]:
# Jobs that occur in both the fraud and the non-fraud rows, kept in the order
# they first appear among the fraud rows.
fraud_jobs = fraud_trans['job'].unique().tolist()
non_fraud_jobs = non_fraud_trans['job'].unique().tolist()

# Set used only for membership checks; non_fraud_jobs itself stays a list.
non_fraud_job_lookup = set(non_fraud_jobs)
both_jobs = [job for job in fraud_jobs if job in non_fraud_job_lookup]

both_jobs

In [None]:
# Number of sampled non-fraud transactions per cc_num value.
rand_non_fraud['cc_num'].value_counts()

In [None]:
# Histogram of transaction amount coloured by class, with a marginal box plot.
fig = px.histogram(
    balanced_df,
    x='amt',
    color='is_fraud',
    color_discrete_map={0: 'blue', 1: 'red'},
    marginal='box',
    title='Transaction Amount Distribution (Fraud vs Non-Fraud)',
)
# Small gap between bars so neighbouring bins are distinguishable.
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
# One pair of box plots (fraud / non-fraud) of transaction amount per category.
fig = px.box(
    balanced_df,
    x='category',
    y='amt',
    color='is_fraud',
    color_discrete_map={0: 'blue', 1: 'red'},
    title='Transaction Amount by Category (Fraud vs Non-Fraud)',
)
fig.show()


In [None]:
# Hour of day (0-23) of each transaction, taken from the parsed timestamp.
# NOTE(review): rows whose timestamp failed to parse (NaT) get a missing hour.
balanced_df['hour'] = balanced_df['trans_date_trans_time'].dt.hour

# Both classes are plotted, so the title says so, and the same blue/red class
# colours as the other plots in this notebook are pinned explicitly.
fig = px.histogram(
    balanced_df,
    x='hour',
    color='is_fraud',
    marginal='box',
    title='Transactions by Hour of the Day (Fraud vs Non-Fraud)',
    color_discrete_map={0: 'blue', 1: 'red'},
)
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
# Box plot of transaction amount for each class, with readable axis labels.
axis_labels = {'is_fraud': 'Fraud Status', 'amt': 'Transaction Amount'}
fig = px.box(
    balanced_df,
    x='is_fraud',
    y='amt',
    color='is_fraud',
    title='Transaction Amount by Fraud Status',
    labels=axis_labels,
    color_discrete_map={0: 'blue', 1: 'red'},
)
fig.show()

In [None]:
# `amount_over_time` was never defined anywhere in the notebook, so this cell
# failed with NameError on a fresh kernel. Build it here: total amount per
# calendar day and class, using the columns the plot expects (date, amt, is_fraud).
# NOTE(review): daily sum is inferred from the plot title -- confirm the intended aggregation.
amount_over_time = (
    balanced_df
    .assign(date=balanced_df['trans_date_trans_time'].dt.normalize())  # midnight of each day
    .groupby(['date', 'is_fraud'], as_index=False)['amt']
    .sum()
)

fig = px.line(amount_over_time, x='date', y='amt', color='is_fraud',
              title='Amount Spent Over Time (Fraud vs Non-Fraud)',
              labels={'is_fraud': 'Fraud Status', 'amt': 'Amount Spent'})
fig.show()

In [None]:

# Map of transactions by the lat/long columns, coloured by is_fraud.
# NOTE(review): hover_name='cc_num' puts card numbers into the rendered figure --
# confirm that is acceptable before sharing the notebook output.
geo_title = 'Fraud vs Non-Fraud by Latitude and Longitude'
fig = px.scatter_geo(
    balanced_df,
    lat='lat',
    lon='long',
    color='is_fraud',
    hover_name='cc_num',
    color_continuous_scale='RdBu',
    title=geo_title,
)
# Base-map styling: coastlines in black, land filled light gray.
fig.update_geos(
    showcoastlines=True,
    coastlinecolor='Black',
    showland=True,
    landcolor='lightgray',
)
fig.update_layout(title=geo_title, geo={'showland': True})
fig.show()
