In [None]:
import json
from urllib.request import urlopen

import pandas as pd
import numpy as np
import polars as pl

from sklearn.preprocessing import LabelEncoder , StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Load the raw transaction table from the CSV export into a polars DataFrame.
df = pl.read_csv(source="transactions_data.csv")

In [None]:
# Load the MCC lookup table (merchant category code -> category description).
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying
# on the platform's default text encoding.
with open("mcc_codes.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Two parallel lists: [codes, descriptions]. dict.keys() / dict.values() iterate
# in the same order, so positions line up.
json_list = [list(json_data.keys()), list(json_data.values())]

# JSON object keys are always strings; the explicit schema together with
# strict=False asks polars to coerce them to Int64.
# NOTE(review): presumably this is so the join key matches an integer "mcc"
# column in the transactions frame - confirm.
df_mcc_codes = pl.DataFrame(
    {"mcc": json_list[0], "merchant_category": json_list[1]},
    schema={"mcc": pl.Int64, "merchant_category": pl.String},
    strict=False,
)

In [None]:
# Attach the merchant category description to every transaction.
# A left join keeps transactions whose "mcc" has no entry in the lookup table.
df = df.join(df_mcc_codes, on="mcc", how="left")

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.glimpse(max_items_per_column=10)

In [None]:
df.select(pl.all().n_unique())

In [None]:
# Frequency table of the "use_chip" values. It is computed once and kept in a
# variable so it can be reused for a visualization; the bare expression on the
# last line displays it.
df_chip_usage = df.get_column("use_chip").value_counts()
df_chip_usage

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
## Data Cleaning

In [None]:
### Add the names of the states (done later, in the choropleth section)

In [None]:
### Set data type of attribute "date"

In [None]:
# Parse the textual timestamp ("YYYY-MM-DD HH:MM:SS") into a Datetime column.
# The expression keeps its input name, so "date" is replaced in place.
df = df.with_columns(
    pl.col("date").str.strptime(dtype=pl.Datetime, format="%Y-%m-%d %H:%M:%S")
)

In [None]:
# Drop the time-of-day component: keep only the calendar date of each transaction.
df = df.with_columns(pl.col("date").dt.date())

In [None]:
### Cast attribute zip to integer

In [None]:
# Store the zip code as a 64-bit integer.
df = df.with_columns(pl.col("zip").cast(pl.Int64))

In [None]:
### Split amount into Debit and Credit

In [None]:
# Remove the currency symbol so the amount can be parsed as a number.
# literal=True makes "$" a plain character rather than a regex end-of-line anchor.
df = df.with_columns(
    pl.col("amount").str.replace_all(pattern="$", value="", literal=True)
)

In [None]:
# Convert the cleaned amount strings to 32-bit floats.
df = df.with_columns(pl.col("amount").cast(pl.Float32))

In [None]:
# Split "amount" into unsigned "debit" / "credit" columns, round everything to
# whole units, and flag the direction of each transaction.
# All expressions in one with_columns call read the ORIGINAL (unrounded)
# "amount", so the sign tests below are not affected by the rounding.
df = df.with_columns(
    # amount rounded to whole units (sign preserved)
    pl.col("amount").round(0),
    # debit: magnitude of negative amounts, 0 for everything else
    pl.when(pl.col("amount") < 0)
    .then(pl.col("amount") * -1)
    .fill_null(0)
    .round(0)
    .alias("debit"),
    # credit: non-negative amounts, 0 for everything else
    pl.when(pl.col("amount") >= 0)
    .then(pl.col("amount"))
    .fill_null(0)
    .round(0)
    .alias("credit"),
    # in_out_flag: 1 for non-negative amounts, 0 for negative or missing ones.
    # A when/then/otherwise chain never yields null, so a trailing fill_null
    # would be dead code; routing missing amounts through otherwise(0) gives
    # them the intended 0.
    pl.when(pl.col("amount") >= 0).then(1).otherwise(0).alias("in_out_flag"),
)

In [None]:
# Report how many transactions were rounded down to an amount of exactly 0
# (they are removed in the next step).
amount_of_0_transactions = df.filter(pl.col("amount") == 0).height
amount_of_total_transactions = df.height
lost_percentage = round(amount_of_0_transactions / amount_of_total_transactions * 100, 2)
print(f"Percentage of transactions lost due to removing decimals: {lost_percentage}%")

In [None]:
# Keep only transactions whose rounded amount is non-zero.
df = df.filter(pl.col("amount").ne(0))

In [None]:
# The monetary columns now hold whole numbers only; store them as 64-bit integers.
df = df.with_columns(pl.col("amount", "debit", "credit").cast(pl.Int64))

In [None]:
df.select(pl.col("debit")).filter(pl.col("debit")!= 0)

In [None]:
### Play around with removing nulls

In [None]:
# Remove the "errors" column first, then discard every row that still
# contains a null in any of the remaining columns.
df = df.drop("errors").drop_nulls()

In [None]:
# NOTE(review): 13305915 is a hard-coded row count, presumably the size of the
# raw CSV - confirm, and ideally derive it from the data instead.
remaining_percentage = round(df.shape[0] / 13305915 * 100, 1)
print(f"The percentage of entries remaining is: {remaining_percentage}%")

In [None]:
# Analysis

In [None]:
## Show Pairplot

In [None]:
# Pair plot of the cleaned transactions.
# No visible cell above creates a "clusters" column, so colouring by it
# unconditionally makes this cell fail. Colour by cluster only when such a
# column is actually present.
# NOTE(review): the full frame is converted and plotted; consider sampling
# rows first if this is too slow.
pairplot_frame = df.to_pandas()
pairplot_kwargs = {}
if "clusters" in pairplot_frame.columns:
    pairplot_kwargs = {"hue": "clusters", "palette": "viridis"}
sns.pairplot(pairplot_frame, **pairplot_kwargs)
plt.show()

In [None]:
### Show Distributions of payments in total

In [None]:
df.select(pl.col("amount").min())

In [None]:
df.select(pl.col("amount").max())

In [None]:
print(df.select(pl.col("amount").std()))

In [None]:
### Calculate Metrics

In [None]:
# Location measures (mean, median, mode) for credits; rows with a credit of 0
# are excluded first.
df_location_credit = df.filter(pl.col("credit") != 0).select("credit")
df_cred_mean = df_location_credit.select(pl.col("credit").mean().alias("mean_credit"))
df_cred_median = df_location_credit.select(pl.col("credit").median().alias("median_credit"))
df_cred_mode = df_location_credit.select(pl.col("credit").mode().alias("mode_credit"))

# Same measures for debits, again ignoring rows with a debit of 0.
df_location_debit = df.filter(pl.col("debit") != 0).select("debit")
df_deb_mean = df_location_debit.select(pl.col("debit").mean().alias("mean_debit"))
df_deb_median = df_location_debit.select(pl.col("debit").median().alias("median_debit"))
df_deb_mode = df_location_debit.select(pl.col("debit").mode().alias("mode_debit"))

In [None]:
# Put all location measures side by side in a single frame.
location_frames = [
    df_cred_mean,
    df_deb_mean,
    df_cred_median,
    df_deb_median,
    df_cred_mode,
    df_deb_mode,
]
df_location = pl.concat(location_frames, how="horizontal")
print(df_location)

In [None]:
## Distributions of Debit & Credit

In [None]:
#sns.histplot(data=df.filter(pl.col("credit")!=0), x="credit", bins=80)

In [None]:
#sns.histplot(data=df.filter(pl.col("credit")!=0), x="credit", log_scale=True , bins=80)

In [None]:
#sns.histplot(data=df.filter(pl.col("debit")!=0), x="debit", bins=80)

In [None]:
#sns.histplot(data=df.filter(pl.col("debit")!=0), x="debit" , log_scale=True, bins=80)

In [None]:
## Correlation and Covariance

In [None]:
### Feature encoding

In [None]:
df.head()

In [None]:
# Replace every nominal (string) feature by integer codes. One encoder object
# is re-fitted per column; the label -> code mapping of each column is stored
# in label_mappings so the codes can be translated back later.
enc_label = LabelEncoder()
feat_nominal = ["use_chip" , "merchant_city" , "merchant_state" , "merchant_category"]
label_mappings = {}

df_encode = df
for column_name in feat_nominal:
    codes = enc_label.fit_transform(df_encode[column_name].to_numpy())
    # A Series named like an existing column overwrites that column.
    df_encode = df_encode.with_columns(pl.Series(column_name, codes))
    label_mappings[column_name] = {
        label: code for code, label in enumerate(enc_label.classes_)
    }

In [None]:
print(df_encode.select(["use_chip" , "merchant_id" , "merchant_city" , "merchant_state", "zip" , "merchant_category" ,"debit" , "credit" , "amount"]))

In [None]:
# Fit a StandardScaler on the feature columns selected for the
# correlation / covariance analysis.
scale_columns = [
    "use_chip",
    "merchant_id",
    "merchant_city",
    "merchant_state",
    "zip",
    "merchant_category",
    "debit",
    "credit",
    "amount",
]
df_scale = df_encode.select(scale_columns)
transformer = StandardScaler()
transformer.fit(df_scale)
transformer

In [None]:
# Apply the fitted scaler to the selected feature columns.
trans_arr = transformer.transform(df_scale)
# Column names of the unscaled frame, reused for the scaled one.
df_scale_columns = df_scale.columns

# Rebuild a DataFrame from the scaled 2-D array. The array holds one row per
# transaction, so the orientation is stated explicitly instead of letting
# polars infer it from the array shape and the schema length.
df_scaled = pl.DataFrame(trans_arr, schema=df_scale_columns, orient="row")

In [None]:
print(df_scaled)

In [None]:
### Correlation Matrix

In [None]:
# Pairwise correlation of the standardized features. corr() is called on the
# frame directly; wrapping it in select(df_scaled) adds nothing.
cor_mat = df_scaled.corr()

# A 10x10 inch canvas is enough for this feature set. Tick labels are passed
# explicitly (same as for the covariance heatmap) so the axes show feature
# names instead of positional indices.
plt.figure(figsize=(10, 10), dpi=70)
ax = sns.heatmap(
    cor_mat,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt=".2f",
    xticklabels=df_scaled.columns,
    yticklabels=df_scaled.columns,
)
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
print(cor_mat)

In [None]:
# Covariance matrices of the analysed features.
# - covariance_matrix_unscaled: covariance on the original scale of the encoded
#   features. It is computed for reference only; its entries span very different
#   magnitudes, which makes a heatmap of it hard to read.
# - covariance_matrix: covariance of the standardized features, which is what
#   gets plotted (after standardization it is numerically close to the
#   correlation matrix).
# np.cov needs a purely numeric matrix, so the raw-scale version is restricted
# to the same feature columns that were standardized; the full encoded frame
# also carries non-feature columns such as "date".
feature_columns = df_scaled.columns
covariance_matrix_unscaled = np.cov(
    df_encode.select(feature_columns).to_numpy(), rowvar=False
)  # rowvar=False: columns are features, rows are observations
covariance_matrix = np.cov(df_scaled.to_numpy(), rowvar=False)

# Visualize the covariance matrix of the standardized features.
plt.figure(figsize=(10, 10), dpi=70)
ax = sns.heatmap(
    covariance_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    cbar=True,
    xticklabels=feature_columns,
    yticklabels=feature_columns,
)
plt.title("Covariance Matrix Heatmap")
plt.show()

In [None]:
## Choropleth map of payments per state

In [None]:
# Lookup table: two-letter state code -> full state name. It is joined onto
# the transactions so they can later be matched with the state names used in
# the GeoJSON file.
state_full_names = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'DC': 'District of Columbia',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
    'AA': 'Armed Forces Americas',
}

# Keys and values of a dict iterate in the same order, so the two lists stay aligned.
state_abbreviation_list = list(state_full_names.keys())
state_name_list = list(state_full_names.values())

df_states_junc = pl.DataFrame(
    {"merchant_state": state_abbreviation_list, "state_name": state_name_list}
)
print(df_states_junc)

# Left join: rows whose "merchant_state" is not in the lookup get a null state_name.
df = df.join(df_states_junc, on="merchant_state", how="left")

In [None]:
# Download the US-states GeoJSON and attach each state's GeoJSON feature id
# to the transactions.
with urlopen('https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json') as response:
    states_geojson = json.load(response)

# Every GeoJSON feature describes one state.
features = states_geojson['features']

# One (id, name) tuple per feature.
data = [(feature['id'], feature['properties']['name']) for feature in features]

# Each tuple is one ROW. State the orientation explicitly instead of letting
# polars infer it from the data dimensions.
df_state_ids = pl.DataFrame(data, schema=["state_id", "state_name"], orient="row")

# Left join on the full state name: transactions whose state_name has no
# matching feature keep a null state_id.
df = df.join(df_state_ids, on="state_name", how="left")

In [None]:
df.head()

In [None]:
df.group_by(pl.col("state_id")).agg(pl.col("amount").sum().alias("amount"))

In [None]:
# Choropleth of the total transaction volume per state.
# states_geojson is reused from the cell that builds df_state_ids, so the
# file is not downloaded again here.

# Sum of "amount" per state. Transactions without a state_id (no matching
# GeoJSON feature) cannot be drawn and are left out of the aggregation.
state_aggregated = (
    df.filter(pl.col("state_id").is_not_null())
    .group_by("state_id")
    .agg(pl.col("amount").sum().alias("amount"))
)

# Create the choropleth map
fig = px.choropleth_map(
    state_aggregated,
    geojson=states_geojson,
    locations='state_id',  # matched against the "id" of each GeoJSON feature
    color='amount',  # column to visualize
    color_continuous_scale="Viridis",
    range_color=(0, 60000000),  # fixed colour range; larger totals saturate at the top colour
    map_style="carto-positron",
    zoom=3,
    center={"lat": 37.0902, "lon": -95.7129},
    opacity=0.5,
    labels={'amount': 'Transaction Amount'},
)

# Remove the margins around the map.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [None]:
# Choropleth of the NUMBER of transactions per state.
# states_geojson is reused from the cell that builds df_state_ids, so the
# file is not downloaded again here.

# Transaction count per state. Rows without a state_id (no matching GeoJSON
# feature) are excluded: they cannot be drawn, and their group would otherwise
# distort the min/max used for the colour range below.
state_aggregated_count = (
    df.filter(pl.col("state_id").is_not_null())
    .group_by("state_id")
    .agg(pl.len().alias("transaction_count"))
)

# The colour range spans the observed counts.
count_min = state_aggregated_count["transaction_count"].min()
count_max = state_aggregated_count["transaction_count"].max()

# Create the choropleth map
fig = px.choropleth_map(
    state_aggregated_count,
    geojson=states_geojson,
    locations='state_id',  # matched against the "id" of each GeoJSON feature
    color='transaction_count',  # column to visualize
    color_continuous_scale="Viridis",
    range_color=(count_min, count_max),
    map_style="carto-positron",
    zoom=3,
    center={"lat": 37.0902, "lon": -95.7129},
    opacity=0.5,
    labels={'transaction_count': 'Transaction Count'},
)

# Remove the margins around the map.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [None]:
# Log-scaled variant of the total-volume choropleth: the colour encodes
# log10 of the summed amount per state.
# NOTE(review): the original author doubted that this view is meaningful - keep or drop deliberately.
# states_geojson is reused from the cell that builds df_state_ids, so the
# file is not downloaded again here.

state_aggregated = (
    # Rows without a state_id cannot be drawn and would distort the colour range.
    df.filter(pl.col("state_id").is_not_null())
    .group_by("state_id")
    .agg(pl.col("amount").sum().alias("amount"))
    # log10 is only defined for positive totals; zero or negative sums would
    # produce -inf / NaN and break the min/max used for the colour range.
    .filter(pl.col("amount") > 0)
    .with_columns(pl.col("amount").log10().alias("amount_standardized"))
)

# The colour range spans the observed log10 values.
log_min = state_aggregated["amount_standardized"].min()
log_max = state_aggregated["amount_standardized"].max()

# Create the choropleth map
fig = px.choropleth_map(
    state_aggregated,
    geojson=states_geojson,
    locations='state_id',  # matched against the "id" of each GeoJSON feature
    color='amount_standardized',  # column to visualize
    color_continuous_scale="Viridis",
    range_color=(log_min, log_max),
    map_style="carto-positron",
    zoom=3,
    center={"lat": 37.0902, "lon": -95.7129},
    opacity=0.5,
    # The label key must name the coloured column, otherwise it is not applied.
    labels={'amount_standardized': 'log10(Transaction Amount)'},
)

# Remove the margins around the map.
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [None]:
## Count the payments per day

In [None]:
df_nunique = df.select(pl.all().n_unique())
df_nunique.head()

In [None]:
# Add a constant column of ones; summing it per group yields a row count.
df_count_per_day = df.with_columns(counter=pl.lit(1))

In [None]:
df_count_per_day.head()

In [None]:
# Reduce "date" to a calendar date so transactions can be grouped per day.
# NOTE(review): the "Set data type of attribute date" section above already
# applies dt.date() to this column, so this step looks redundant - confirm and
# remove if so.
df_count_per_day = df_count_per_day.with_columns(
    pl.col("date").dt.date().alias("date")
)

In [None]:
df_count_per_day.head()

In [None]:
# Number of transactions per day. group_by does not guarantee any row order,
# so the result is sorted chronologically to make the output reproducible.
df_count_per_day.group_by("date").agg(pl.col("counter").sum()).sort("date")

In [None]:
print(df_count_per_day)