In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import RobustScaler# For outliers



In [2]:
df = pd.read_csv("data/all_transactions.csv")


In [3]:
df = df.drop(['ea_cardholderpresence', 'ea_merchant_mcc', 'ea_merchant_city', 'ea_merchant_country', 'created_date'], axis=1)

In [4]:
df=df[df.transactions_state == 'COMPLETED']

In [5]:
#transactions_type,transactions_currency,amount_usd,directions

In [6]:
df.head()

Unnamed: 0,transaction_id,transactions_type,transactions_currency,amount_usd,transactions_state,direction,user_id
0,transaction_0,TRANSFER,AED,4.55,COMPLETED,OUTBOUND,user_4368
1,transaction_1,CARD_PAYMENT,AED,15.5,COMPLETED,OUTBOUND,user_2355
2,transaction_2,CARD_PAYMENT,AED,43.4,COMPLETED,OUTBOUND,user_2355
3,transaction_3,TRANSFER,AED,10043.01,COMPLETED,OUTBOUND,user_2355
4,transaction_4,CARD_PAYMENT,AED,43.81,COMPLETED,OUTBOUND,user_14318


In [7]:
currency_ = {'AED' : 'Fiat', 'SEK' : 'Fiat', 'AUD' : 'Fiat', 'GBP' : 'Fiat', 'ETH' : 'Crypto', 'RUB' : 'Fiat', 'CHF' : 'Fiat', 'HRK' : 'Fiat', 'LTC' : 'Crypto', 'MAD' : 'Fiat', 'BTC' : 'Crypto', 'NZD' : 'Fiat', 'JPY' : 'Fiat', 'ILS' : 'Fiat', 'QAR' : 'Fiat', 'MXN' : 'Fiat', 'DKK' : 'Fiat', 'SGD' : 'Fiat', 'ZAR' : 'Fiat', 'BGN' : 'Fiat', 'USD' : 'Fiat', 'INR' : 'Fiat', 'THB' : 'Fiat', 'RON' : 'Fiat', 'HUF' : 'Fiat', 'TRY' : 'Fiat', 'XRP' : 'Crypto', 'PLN' : 'Fiat', 'EUR' : 'Fiat', 'BCH' : 'Crypto', 'CZK' : 'Fiat', 'CAD' : 'Fiat', 'NOK' : 'Fiat', 'HKD' : 'Fiat', 'SAR' : 'Fiat'}
df['transactions_currency'] = df['transactions_currency'].map(currency_)

# Prwta one hot encoding  kai meta na kanw sum ta mhdenika
#sum zeros
#sum 1

In [8]:
df = df.drop(['transactions_state','transaction_id'], axis=1)


In [9]:
#one hot encoding
dum = pd.get_dummies(df[['transactions_type','transactions_currency','direction']])
df = pd.concat([df, dum], axis=1)
df = df.drop(columns=['transactions_type', 'transactions_currency','direction'])
df.head()

Unnamed: 0,amount_usd,user_id,transactions_type_ATM,transactions_type_CARD_PAYMENT,transactions_type_CARD_REFUND,transactions_type_CASHBACK,transactions_type_EXCHANGE,transactions_type_FEE,transactions_type_REFUND,transactions_type_TAX,transactions_type_TOPUP,transactions_type_TRANSFER,transactions_currency_Crypto,transactions_currency_Fiat,direction_INBOUND,direction_OUTBOUND
0,4.55,user_4368,0,0,0,0,0,0,0,0,0,1,0,1,0,1
1,15.5,user_2355,0,1,0,0,0,0,0,0,0,0,0,1,0,1
2,43.4,user_2355,0,1,0,0,0,0,0,0,0,0,0,1,0,1
3,10043.01,user_2355,0,0,0,0,0,0,0,0,0,1,0,1,0,1
4,43.81,user_14318,0,1,0,0,0,0,0,0,0,0,0,1,0,1


In [10]:
#users_agg_df = df.groupby('user_id').agg({'transactions_currency_Fiat': 'sum','transactions_currency_Crypto':'sum'})
users_agg_df=df.groupby(['user_id']).agg({'transactions_currency_Crypto': "sum",'transactions_currency_Fiat':'sum',
                                          'direction_INBOUND':'sum','direction_OUTBOUND':'sum',
                                          'amount_usd':'mean',
                                        'transactions_type_ATM':'sum','transactions_type_CARD_PAYMENT':'sum','transactions_type_CARD_REFUND':'sum','transactions_type_CASHBACK':'sum','transactions_type_EXCHANGE':'sum','transactions_type_FEE':'sum',
                                        'transactions_type_REFUND':'sum','transactions_type_TAX':'sum','transactions_type_TOPUP':'sum','transactions_type_TRANSFER':'sum'}).reset_index()


#fiat = df.groupby(["user_id"]).agg({'transactions_currency_Fiat': 'sum'})
#crypto = df.groupby(["user_id"]).agg({'transactions_currency_Crypto': 'sum'})

In [11]:
users_agg_df.head()

Unnamed: 0,user_id,transactions_currency_Crypto,transactions_currency_Fiat,direction_INBOUND,direction_OUTBOUND,amount_usd,transactions_type_ATM,transactions_type_CARD_PAYMENT,transactions_type_CARD_REFUND,transactions_type_CASHBACK,transactions_type_EXCHANGE,transactions_type_FEE,transactions_type_REFUND,transactions_type_TAX,transactions_type_TOPUP,transactions_type_TRANSFER
0,user_0,0.0,520.0,163.0,357.0,13.317462,0,333.0,1,0.0,5.0,0,0,0,158.0,23.0
1,user_1,0.0,133.0,27.0,106.0,97.410075,4,78.0,0,0.0,0.0,0,0,0,24.0,27.0
2,user_10,1.0,122.0,52.0,71.0,44.218862,3,42.0,9,0.0,3.0,0,0,0,43.0,23.0
3,user_100,0.0,55.0,11.0,44.0,76.910909,6,38.0,0,0.0,0.0,0,0,0,11.0,0.0
4,user_1000,0.0,43.0,23.0,20.0,19.607442,1,16.0,0,16.0,0.0,2,0,0,7.0,1.0


In [12]:
cluster_set = df.drop(['user_id'], axis=1)


In [13]:
from sklearn import preprocessing

In [14]:
X = preprocessing.normalize(cluster_set)

In [17]:
#                   CODE:                                      # -------------------------------------------
cluster_scores = []
for k in range(2, 10):
    km = KMeans(k, random_state=77)
    km.fit(X)
    cluster_scores.append(km.inertia_)
#                  PLOTTING:                                   #-------------------------------------------- 
# Create figure fig = plt.figure(figsize=(7, 5)) ax = plt.subplot(111)
# Plot total inertia for all values of k ax.plot(range(2, 101), cluster_scores)
# Aesthetic parameters ax.set_xlabel('k') ax.set_ylabel('inertia')
ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left') ax.xaxis.set_ticks_position('bottom')
ax.set_title('Inertia for different values of k')


SyntaxError: invalid syntax (<ipython-input-17-ef427277d412>, line 11)

In [15]:
#cluster_scaled = RobustScaler().fit_transform(cluster_set)

In [None]:
matrix = cluster_set
for k in range(2, 5):
    print("------------------------------------------")
    kmeans = KMeans(n_clusters=k,random_state=23,n_jobs=-1)
    kmeans.fit(matrix)
    clusters = kmeans.predict(matrix)
    silhouette_avg = silhouette_score(matrix, clusters)
    print("For k =", k, "The average silhouette_score is :", silhouette_avg)

In [None]:
#inertia ---> Sum of squared distances of samples to their closest cluster center.
distortions = []
K = range(1, 10)
for k in K:
    print("------------------------------------------")
    kmeansModel = KMeans(n_clusters = k, n_init=30, random_state = 23,n_jobs=-1)
    kmeansModel.fit(cluster_scaled)
    distortions.append(kmeansModel.inertia_)
    
plt.plot(K, distortions)
plt.title("The Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("Residual sum of Squares")
plt.show()