**Merging transactions**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
'''Only to be used if "all_transactions.csv" does not exist.'''
# df = pd.read_csv("data/transactions_1.csv")
# df2 = pd.read_csv("data/transactions_2.csv")
# df3 = pd.read_csv("data/transactions_3.csv")

# #combining datasets in one CSV
# transactions=pd.concat([df,df2,df3], ignore_index = True)

# #saving to directory
# transactions.to_csv(r'data/all_transactions.csv', index = False)

**Merging dataframes**

In [None]:
# Raw inputs: transactions (df), users (df2) and devices (df3).
df = pd.read_csv("data/all_transactions.csv")
df2 = pd.read_csv("data/users.csv")
df3 = pd.read_csv("data/devices.csv")

# Attach a device row to each user, ignoring devices whose brand is "Unknown",
# then attach every transaction to its user. Both joins are inner joins on
# user_id, so users without a matching device or transaction are dropped.
known_devices = df3.loc[df3["brand"] != "Unknown"]
df4 = df2.merge(known_devices, on="user_id")
data = df4.merge(df, on="user_id")

**Renaming columns**

In [None]:
# The merge suffixed the two clashing created_date columns with _x (left
# frame) and _y (right frame); give them and the country column explicit names.
column_renames = {
    "created_date_x": "user_created_date",
    "created_date_y": "transaction_created_date",
    "country": "user_country",
}
data = data.rename(columns=column_renames)

**Removing unnecessary columns**

In [None]:
# Columns that are not used anywhere further down in the notebook.
unused_columns = [
    'city',
    'attributes_notifications_marketing_push',
    'attributes_notifications_marketing_email',
    'num_successful_referrals',
    'ea_merchant_mcc',
    'ea_merchant_city',
    'ea_merchant_country',
    'num_referrals',
]
data = data.drop(columns=unused_columns)

**Column Transformations**

In [None]:
# Binary encoding of the plan: STANDARD -> 0, SILVER and GOLD -> 1.
# Series.map turns any plan value missing from the dict into NaN.
plan_ = dict(STANDARD=0, SILVER=1, GOLD=1)
data['plan'] = data['plan'].map(plan_)

In [None]:
# Flag each transaction currency: the five codes in crypto_codes map to 1,
# every other listed code maps to 0. Unlisted codes become NaN via Series.map.
crypto_codes = ('ETH', 'LTC', 'BTC', 'XRP', 'BCH')
other_codes = (
    'AED', 'SEK', 'AUD', 'GBP', 'RUB', 'CHF', 'HRK', 'MAD', 'NZD', 'JPY',
    'ILS', 'QAR', 'MXN', 'DKK', 'SGD', 'ZAR', 'BGN', 'USD', 'INR', 'THB',
    'RON', 'HUF', 'TRY', 'PLN', 'EUR', 'CZK', 'CAD', 'NOK', 'HKD', 'SAR',
)
currency_ = {code: 0 for code in other_codes}
currency_.update({code: 1 for code in crypto_codes})
data['transactions_currency'] = data['transactions_currency'].map(currency_)

In [None]:
# Encode the transaction state: COMPLETED -> 1, REVERTED -> 2, and FAILED,
# DECLINED, PENDING and CANCELLED -> 0. States missing from the dict become NaN.
# The mapping has its own name so it no longer overwrites the currency mapping.
transaction_state_ = {
    'FAILED': 0,
    'DECLINED': 0,
    'COMPLETED': 1,
    'REVERTED': 2,
    'PENDING': 0,
    'CANCELLED': 0,
}
data['transactions_state'] = data['transactions_state'].map(transaction_state_)

In [None]:
# Encode the device brand: Android -> 0, Apple -> 1.
# Any other brand value becomes NaN via Series.map.
brands = dict(Android=0, Apple=1)
data['brand'] = data['brand'].map(brands)

In [None]:
# Region codes used as an intermediate step:
#   SWEU --> South Western Europe
#   SEU  --> presumably Southern Europe (only used for GR) -- TODO confirm
#   EEU  --> Eastern Europe
#   CEU  --> Central Europe
#   WEU  --> Western Europe
#   NEU  --> Northern Europe
#   AS   --> Asia
#   NA   --> North America
#   AF   --> Africa
#   OC   --> Oceania
#
# Final grouping:
#   rest_of_the_world: OC, NA, AF, SEU, AS
#   Central_Europe:    SWEU, CEU, NEU, EEU
#   Western_Europe:    WEU

# ISO 3166-1 alpha-2 country code -> region code.
# 'CH' is Switzerland (China is 'CN'); it was previously mapped to 'AS', which
# pushed Swiss users into rest_of_the_world. It is mapped to 'CEU' instead.
# NOTE(review): GR is the only 'SEU' country and therefore ends up in
# rest_of_the_world -- confirm this is intended.
continents_ = {
    'ES': 'SWEU', 'LT': 'EEU', 'IE': 'WEU', 'GB': 'WEU', 'MT': 'SWEU',
    'FR': 'SWEU', 'RO': 'SWEU', 'PL': 'EEU', 'AT': 'CEU', 'IT': 'SWEU',
    'SI': 'CEU', 'CZ': 'CEU', 'BE': 'WEU', 'BG': 'SWEU', 'GI': 'SWEU',
    'CH': 'CEU', 'GG': 'SWEU', 'CY': 'SWEU', 'DE': 'CEU', 'SK': 'CEU',
    'GR': 'SEU', 'DK': 'NEU', 'PT': 'SWEU', 'LU': 'NEU', 'HR': 'SWEU',
    'NL': 'NEU', 'SE': 'NEU', 'LV': 'EEU', 'IM': 'NEU', 'FI': 'NEU',
    'EE': 'EEU', 'JE': 'SWEU', 'HU': 'CEU', 'NO': 'NEU', 'RE': 'AF',
    'IS': 'NEU', 'GP': 'NA', 'AU': 'OC', 'LI': 'EEU', 'MQ': 'NA',
}

# Region code -> final group used as a model feature.
new_continents_ = {
    'OC': 'rest_of_the_world', 'NA': 'rest_of_the_world', 'AF': 'rest_of_the_world',
    'SEU': 'rest_of_the_world', 'AS': 'rest_of_the_world',
    'SWEU': 'Central_Europe', 'CEU': 'Central_Europe', 'NEU': 'Central_Europe',
    'EEU': 'Central_Europe',
    'WEU': 'Western_Europe',
}

# Compose the two lookups so the column is mapped in a single pass.
# Countries missing from continents_ become NaN, exactly as with two passes.
country_to_group = {
    country: new_continents_[region] for country, region in continents_.items()
}
data['user_country'] = data['user_country'].map(country_to_group)

# Renaming Column
data = data.rename(columns={"user_country": "continent"})

In [None]:
# Keep only the rows whose encoded transaction state is 1 or 2.
data = data[data["transactions_state"].isin([1, 2])]

In [None]:
# Number of remaining rows per continent group (NaN is not counted).
data.loc[:, "continent"].value_counts()

In [None]:
# Number of rows whose continent is NaN.
data.loc[:, "continent"].isnull().sum()

**Plots**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Number of users per country, taken from the users table (df2).
carrier_count = df2['country'].value_counts()

sns.set(style="darkgrid")
# Create the figure explicitly; the earlier Series.plot(figsize=...) call was
# only there to size the figure and drew a stray line underneath the bars.
fig, ax = plt.subplots(figsize=(20, 15))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9, ax=ax)

plt.title('Users Per Country',fontsize=19)
plt.ylabel('Number of Users', fontsize=19)
plt.xlabel('Country', fontsize=14)
plt.xticks(rotation=60,fontsize=14)

plt.show()

In [None]:
# Number of transactions (rows of data) per continent group.
carrier_count = data['continent'].value_counts()

sns.set(style="darkgrid")
# Create the figure explicitly; the earlier Series.plot(figsize=...) call was
# only there to size the figure and drew a stray line underneath the bars.
fig, ax = plt.subplots(figsize=(10, 10))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9, ax=ax)

plt.title('Transactions Per Continent',fontsize=19)
plt.ylabel('Number of transactions', fontsize=19)
plt.xticks(fontsize=14)

plt.show()

**Aggregating**

In [None]:
def age_bands(year, reference_year=2020):
    """Map a birth year to an ordinal age band.

    The age is computed as ``reference_year - year`` and bucketed as:

        0: under 25    1: 25-34    2: 35-44
        3: 45-54       4: 55-64    5: 65 and over

    Parameters
    ----------
    year : int or float
        Birth year.
    reference_year : int, optional
        Year the age is measured against. Defaults to 2020, the value that
        was previously hard-coded.

    Returns
    -------
    int
        Band index from 0 to 5.

    Notes
    -----
    A NaN birth year fails every ``<`` comparison and therefore falls through
    to band 5, as it did before the reference year became a parameter.
    """
    age = reference_year - year
    # Exclusive upper bound of bands 0..4; anything at or above the last is band 5.
    upper_bounds = (25, 35, 45, 55, 65)
    for band, upper in enumerate(upper_bounds):
        if age < upper:
            return band
    return len(upper_bounds)

In [None]:
# Demographics: one row per user, keeping the first value of each attribute.
demographic_columns = [
    'birth_year',
    'continent',
    'user_created_date',
    'user_settings_crypto_unlocked',
    'plan',
    'num_contacts',
    'brand',
]
users_agg_df = (
    data.groupby('user_id')
    .agg(dict.fromkeys(demographic_columns, 'first'))
    .reset_index()
)
# From here on the birth_year column holds the age band, not the year.
users_agg_df["birth_year"] = users_agg_df["birth_year"].apply(age_bands)
users_agg_df

In [None]:
# Earliest and latest transaction_created_date per user, built from a
# single grouped view of the column.
transaction_dates_by_user = data.groupby("user_id")["transaction_created_date"]
first_transactions_date_agg = (
    transaction_dates_by_user.min().rename("first_transaction_date").reset_index()
)
last_transactions_date_agg = (
    transaction_dates_by_user.max().rename("last_transaction_date").reset_index()
)
transactions_date_agg = first_transactions_date_agg.merge(
    last_transactions_date_agg, on="user_id"
)
transactions_date_agg

In [None]:
# Number of transactions per user (non-null transaction_created_date values).
user_transactions = (
    data.groupby("user_id")["transaction_created_date"]
    .count()
    .rename("n_transactions")
    .reset_index()
)
user_transactions

In [None]:
# Net amount per user: sum of amount_usd over state-1 rows minus the sum over
# state-2 rows.
state_1_rows = data.loc[data["transactions_state"] == 1]
state_2_rows = data.loc[data["transactions_state"] == 2]

state_1_totals = (
    state_1_rows.groupby("user_id")["amount_usd"]
    .sum()
    .rename("amount_total")
    .reset_index()
)
transactions_total_rev = (
    state_2_rows.groupby("user_id")["amount_usd"]
    .sum()
    .rename("amount_reverted")
    .reset_index()
)

# The left join keeps only users that have at least one state-1 row; users
# with no state-2 rows get NaN for amount_reverted, which fillna turns into 0.
transactions_total_agg = (
    state_1_totals.merge(transactions_total_rev, on="user_id", how="left")
    .fillna(0.0)
    .assign(total_amount=lambda totals: totals["amount_total"] - totals["amount_reverted"])
    .loc[:, ["user_id", "total_amount"]]
)
transactions_total_agg

In [None]:
# Final frame for classification: join the per-user aggregates on user_id
# (inner joins, so a user must be present in every aggregate).
classification_df = (
    users_agg_df
    .merge(transactions_date_agg, on='user_id')
    .merge(transactions_total_agg, on='user_id')
    .merge(user_transactions, on='user_id')
)

# One indicator column per continent group, appended next to the other features.
dum = pd.get_dummies(classification_df['continent'])
classification_df = (
    pd.concat([classification_df, dum], axis=1)
    # The encoded plan is the prediction target.
    .assign(label=lambda frame: frame['plan'])
    .drop(columns=['continent', 'plan'])
    # birth_year holds the age band at this point, so name it accordingly.
    .rename(columns={'birth_year': 'age_band'})
)
classification_df

**Train/test splitting**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix and target. 'Central_Europe' used to be listed twice, which
# duplicated that column and left 'Western_Europe' out of the features; each
# continent indicator is now selected exactly once.
feature_columns = [
    'age_band',
    'user_settings_crypto_unlocked',
    'num_contacts',
    'brand',
    'total_amount',
    'n_transactions',
    'Central_Europe',
    'Western_Europe',
    'rest_of_the_world',
]
x = classification_df[feature_columns]
y = classification_df['label']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=10,
    test_size=0.2,
)

**Scaling**

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
# Fit the scaler on the training features only.
scaler = RobustScaler()
scaler = scaler.fit(x_train)

In [None]:
# Apply the scaler fitted on the training set to both splits.
x_train_scaled, x_test_scaled = [
    scaler.transform(split) for split in (x_train, x_test)
]

**Balancing**

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Class counts of the training labels before oversampling.
n_plan_0_pre = int((y_train == 0).sum())
n_plan_1_pre = int((y_train == 1).sum())
print(f'Number of users with plan 0 pre-balancing: {n_plan_0_pre} and with plan 1: {n_plan_1_pre}')

In [None]:
# Oversample the scaled training set with SMOTE; the test split is left untouched.
sm = SMOTE(random_state=10)
(x_train_res, y_train_res) = sm.fit_resample(
    x_train_scaled,
    y_train,
)

In [None]:
# Class counts of the training labels after oversampling.
n_plan_0_post = int((y_train_res == 0).sum())
n_plan_1_post = int((y_train_res == 1).sum())
print(f'Number of users with plan 0 post-balancing: {n_plan_0_post} and with plan 1: {n_plan_1_post}')

**Models**

In [None]:
from ml import MLModels

In [None]:
# The training features (x_train_res) were scaled before resampling, so the
# model must be evaluated on the scaled test features as well. Passing the raw
# x_test would score the model on data in a different feature space.
# NOTE(review): MLModels comes from the project's ml module, which is not shown
# here; the argument order is kept exactly as in the existing call.
lr_model = MLModels(x_train_res, x_test_scaled, y_train_res, y_test)
lr_model.logistic_regression()
lr_model.grid_search()

**Tests and raw code**

In [None]:
# Number of rows with transaction state 2 and direction "INBOUND".
state_2_inbound = (data["transactions_state"] == 2) & (data["direction"] == "INBOUND")
int(state_2_inbound.sum())

In [None]:
# Row counts for every transaction type / transaction state combination.
pd.crosstab(data["transactions_type"], data["transactions_state"])

In [None]:
# Row counts for every transaction type / direction combination.
pd.crosstab(data["transactions_type"], data["direction"])

In [None]:
# Row counts for every direction / transaction state combination.
pd.crosstab(data["direction"], data["transactions_state"])

In [None]:
# Spot check: user_7's INBOUND rows in state 2 with an amount of exactly 8.51 USD.
user_7_spot_check = (
    data["user_id"].eq("user_7")
    & data["transactions_state"].eq(2)
    & data["amount_usd"].eq(8.51)
    & data["direction"].eq("INBOUND")
)
data[user_7_spot_check]

In [None]:
# All rows of user_12038, ordered by transaction creation date (ascending).
user_12038_rows = data.loc[data["user_id"] == "user_12038"]
user_12038_rows.sort_values("transaction_created_date")

In [None]:
# Row counts for every transaction type / ea_cardholderpresence combination.
pd.crosstab(data["transactions_type"], data["ea_cardholderpresence"])

In [None]:
# Summary statistics of num_contacts over the rows of data.
data.loc[:, "num_contacts"].describe()

In [None]:
# Frequency of each ea_cardholderpresence value (NaN is not counted).
data.loc[:, "ea_cardholderpresence"].value_counts()