In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
from datetime import date

%matplotlib inline

In [None]:
# Load the three raw test tables: transactions, users and devices.
df = pd.read_csv("test/transactions_test.csv")
df2 = pd.read_csv("test/users_test.csv")
df3 = pd.read_csv("test/devices_test.csv")

# Attach device rows whose brand is not "Unknown" to the users, then attach the
# transactions. Both joins use pd.merge's default (inner) on user_id, so a user
# without a matching device row or without any transaction is dropped.
known_brand = df3["brand"].ne("Unknown")
df4 = df2.merge(df3.loc[known_brand], on="user_id")
data = df4.merge(df, on="user_id")

In [None]:
# The last merge suffixed the clashing created_date columns: _x comes from the
# left (user/device) frame, _y from the right (transactions) frame.
data = data.rename(
    columns={
        "created_date_x": "user_created_date",
        "created_date_y": "transaction_created_date",
        "country": "user_country",
    }
)

In [None]:
# Remove the columns that the feature-building cells below do not read.
unused_columns = [
    'city',
    'attributes_notifications_marketing_push',
    'attributes_notifications_marketing_email',
    'num_successful_referrals',
    'ea_merchant_mcc',
    'ea_merchant_city',
    'ea_merchant_country',
    'num_referrals',
]
data = data.drop(columns=unused_columns)

In [None]:
# Integer code per transaction state: 1 = COMPLETED, 2 = REVERTED, 0 = every other
# state listed here. (Despite its name, `currency_` holds state codes.)
currency_ = {
    'COMPLETED': 1,
    'REVERTED': 2,
    'FAILED': 0,
    'DECLINED': 0,
    'PENDING': 0,
    'CANCELLED': 0,
}
# Series.map produces NaN for any state that is not a key of the dict.
data['transactions_state'] = data['transactions_state'].map(currency_)

In [None]:
# Encode the device brand as an integer: Android -> 0, Apple -> 1.
# Series.map produces NaN for any brand that is not a key of this dict.
brands = {"Android": 0,"Apple": 1}
data['brand'] = data['brand'].map(brands)

In [None]:
# First lookup: country code -> fine-grained region label.
continents_ = {
    'ES': 'SWEU', 'LT': 'EEU', 'IE': 'WEU', 'GB': 'WEU', 'MT': 'SWEU',
    'FR': 'SWEU', 'RO': 'SWEU', 'PL': 'EEU', 'AT': 'CEU', 'IT': 'SWEU',
    'SI': 'CEU', 'CZ': 'CEU', 'BE': 'WEU', 'BG': 'SWEU', 'GI': 'SWEU',
    'CH': 'AS', 'GG': 'SWEU', 'CY': 'SWEU', 'DE': 'CEU', 'SK': 'CEU',
    'GR': 'SEU', 'DK': 'NEU', 'PT': 'SWEU', 'LU': 'NEU', 'HR': 'SWEU',
    'NL': 'NEU', 'SE': 'NEU', 'LV': 'EEU', 'IM': 'NEU', 'FI': 'NEU',
    'EE': 'EEU', 'JE': 'SWEU', 'HU': 'CEU', 'NO': 'NEU', 'RE': 'AF',
    'IS': 'NEU', 'GP': 'NA', 'AU': 'OC', 'LI': 'EEU', 'MQ': 'NA',
    'GF': 'LA',
}

# Second lookup: region label -> one of three coarse buckets.
new_continents_ = {
    'OC': 'rest_of_the_world',
    'NA': 'rest_of_the_world',
    'AF': 'rest_of_the_world',
    'SEU': 'rest_of_the_world',
    'AS': 'rest_of_the_world',
    'SWEU': 'Central_Europe',
    'CEU': 'Central_Europe',
    'NEU': 'Central_Europe',
    'EEU': 'Central_Europe',
    'WEU': 'Western_Europe',
    'LA': 'rest_of_the_world',
}

# Apply both lookups back to back; a value missing from either dict ends up as NaN.
data['user_country'] = data['user_country'].map(continents_).map(new_continents_)

# The column now holds the bucket rather than a country, so rename it.
data = data.rename(columns={"user_country": "continent"})

In [None]:
# Keep only rows whose state code is 1 or 2; every other row (code 0 or NaN) is dropped.
data = data[data['transactions_state'].isin([1, 2])]

In [None]:
# Number of remaining transaction rows per continent bucket.
carrier_count = data['continent'].value_counts()

# Set the theme before creating the figure so it applies to this plot, and size the
# figure explicitly instead of drawing a throw-away line plot just to get a canvas.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(10, 10))

# x / y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9, ax=ax)

ax.set_title('Transactions Per Continent', fontsize=19)
ax.set_ylabel('Number of transactions', fontsize=19)
ax.tick_params(axis='x', labelsize=14)

plt.show()

In [None]:
def age_bands(year, reference_year=2020):
    """Map a birth year to an ordinal age-band code.

    The age is ``reference_year - year`` and is bucketed as follows:
    0 for under 25, 1 for 25-34, 2 for 35-44, 3 for 45-54, 4 for 55-64, 5 for 65 and over.

    Parameters
    ----------
    year : int or float
        Birth year.
    reference_year : int, optional
        Year the age is measured against. Defaults to 2020, the value that was
        previously hard-coded, so existing single-argument calls are unchanged.

    Returns
    -------
    int
        Band code in the range 0..5. A NaN birth year compares false against every
        threshold and therefore falls through to band 5.
    """
    age = reference_year - year
    # Exclusive upper bounds of bands 0..4; anything at or above the last bound is band 5.
    for band, upper_bound in enumerate((25, 35, 45, 55, 65)):
        if age < upper_bound:
            return band
    return 5

In [None]:
# Demographics
# Collapse the transaction-level frame to one row per user, taking the first
# non-null value of each attribute within the user's group.
user_attribute_columns = [
    'birth_year',
    'continent',
    'user_created_date',
    'user_settings_crypto_unlocked',
    'num_contacts',
    'brand',
]
users_agg_df = (
    data.groupby(['user_id'])
    .agg(dict.fromkeys(user_attribute_columns, 'first'))
    .reset_index()
)
# Replace the birth year with its age-band code; the column keeps its old name here.
users_agg_df["birth_year"] = users_agg_df["birth_year"].apply(age_bands)
users_agg_df

In [None]:
# Earliest and latest transaction timestamp per user.
# NOTE(review): the column has not been parsed with to_datetime at this point, so
# min/max presumably compare raw strings — fine for ISO-formatted dates; confirm.
txn_dates_by_user = data.groupby(["user_id"])["transaction_created_date"]
first_transactions_date_agg = (
    txn_dates_by_user.min().rename("first_transaction_date").reset_index()
)
last_transactions_date_agg = (
    txn_dates_by_user.max().rename("last_transaction_date").reset_index()
)
transactions_date_agg = first_transactions_date_agg.merge(last_transactions_date_agg, on="user_id")
transactions_date_agg

In [None]:
# Number of transactions per user, counted as non-null transaction timestamps.
user_transactions = (
    data.groupby(["user_id"])["transaction_created_date"]
    .count()
    .rename("n_transactions")
    .reset_index()
)
user_transactions

In [None]:
# Per-user sum of amount_usd over rows with state code 1.
amount_by_user_completed = (
    data[data.transactions_state == 1]
    .groupby(["user_id"])["amount_usd"]
    .sum()
    .rename("amount_total")
    .reset_index()
)
# Per-user sum of amount_usd over rows with state code 2.
transactions_total_rev = (
    data[data.transactions_state == 2]
    .groupby(["user_id"])["amount_usd"]
    .sum()
    .rename("amount_reverted")
    .reset_index()
)

# Left join from the state-1 totals: users without any state-2 rows get NaN, which
# is replaced by 0.0. Users that have only state-2 rows are not part of the result.
# total_amount is the state-1 sum minus the state-2 sum.
transactions_total_agg = (
    pd.merge(amount_by_user_completed, transactions_total_rev, on="user_id", how="left")
    .fillna(0.0)  # converting NaN to 0
    .assign(total_amount=lambda totals: totals["amount_total"] - totals["amount_reverted"])
    [["user_id", "total_amount"]]
)
transactions_total_agg

In [None]:
# Assemble the per-user feature table: demographics + first/last transaction dates
# + net amount + transaction count. Every join is an inner join on user_id.
classification_df = (
    users_agg_df
    .merge(transactions_date_agg, on='user_id')
    .merge(transactions_total_agg, on='user_id')
    .merge(user_transactions, on='user_id')
)
classification_df.head()

In [None]:
# Parse both transaction-date columns to datetimes so they can be subtracted.
for date_col in ('first_transaction_date', 'last_transaction_date'):
    classification_df[date_col] = pd.to_datetime(classification_df[date_col])

# Whole days between a user's first and last transaction.
delta = classification_df['last_transaction_date'] - classification_df['first_transaction_date']
classification_df['Days_Since_First_Transaction'] = delta.dt.days.astype(int)

In [None]:
# Most recent transaction timestamp across all users. Series.max() is vectorised and
# skips missing values, unlike the builtin max(), which loops in Python and gives
# order-dependent results when NaT is present.
last_day = classification_df['last_transaction_date'].max()

In [None]:
# Whole days between the most recent transaction in the data set (last_day) and each
# user's own last transaction.
delta = last_day - classification_df['last_transaction_date']
classification_df['Days_Since_Last_Transaction'] = delta.dt.days.astype(int)

In [None]:
# Preview the first rows, including the day-count columns added above.
classification_df.head()

In [None]:
# Parse the user creation timestamp so it can be subtracted from the transaction dates.
classification_df['user_created_date'] = pd.to_datetime(classification_df['user_created_date'])

# Whole days between the user's creation date and their last transaction.
delta = classification_df['last_transaction_date']-classification_df['user_created_date']
classification_df['Days_User_is_active'] = delta.dt.days.astype(int)

In [None]:
# Inspect the column dtypes of the feature table.
classification_df.dtypes

In [None]:
# One-hot encode the continent bucket.
# get_dummies only emits a column for each bucket that occurs in this data set, so the
# result is reindexed to the three bucket columns that the feature selection reads.
# A bucket with no users becomes an all-zero column instead of a KeyError later on.
continent_buckets = ['Central_Europe', 'Western_Europe', 'rest_of_the_world']
dum = pd.get_dummies(classification_df.continent).reindex(columns=continent_buckets, fill_value=0)
classification_df = pd.concat([classification_df, dum], axis=1)
# The raw bucket column is no longer needed; birth_year already holds the age-band code.
classification_df = classification_df.drop(columns=['continent']).rename(columns={'birth_year': 'age_band'})
classification_df.head()

In [None]:
import pickle

In [None]:
# Feature columns, in the order they are handed to the scaler and the models.
feature_columns = [
    'total_amount',
    'n_transactions',
    'age_band',
    'user_settings_crypto_unlocked',
    'Days_Since_First_Transaction',
    'Days_Since_Last_Transaction',
    'Days_User_is_active',
    'Central_Europe',
    'Western_Europe',
    'rest_of_the_world',
]
x = classification_df[feature_columns]
# Kept separately so predictions can be paired with their user ids when exporting.
users = classification_df['user_id']

In [None]:
# Load the pickled scaler object; the context manager closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code — only load scaler.pkl from a trusted source.
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [None]:
# Apply the loaded scaler to the selected feature columns.
# NOTE(review): presumably the scaler was fitted on these columns in this order — confirm against the training code.
x_scaled = scaler.transform(x)

In [None]:
# Display the scaled feature matrix.
x_scaled

In [None]:
# Load the pickled model from rf.pkl; the context manager closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code — only load rf.pkl from a trusted source.
with open('rf.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Predict on the scaled features with `model`; the bare name on the last line displays the result.
y_predictions = model.predict(x_scaled)
y_predictions

In [None]:
# Count how often each distinct predicted value occurs and print the result as a
# two-row array: first row the unique values, second row their counts.
unique_elements, counts_elements = np.unique(y_predictions, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

In [None]:
# Load the second pickled model from gnb.pkl; the context manager closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code — only load gnb.pkl from a trusted source.
with open('gnb.pkl', 'rb') as model1_file:
    model1 = pickle.load(model1_file)

# Predict on the same scaled features and print the unique predicted values (first row)
# with their counts (second row).
y_predictions1 = model1.predict(x_scaled)
unique_elements1, counts_elements1 = np.unique(y_predictions1, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements1, counts_elements1)))

In [None]:
# Pair every user id with the prediction stored in y_predictions and write the table
# to CSV without the index column.
df_pred = users.to_frame(name='user_id').assign(prediction=y_predictions)
df_pred.to_csv('predictions/test_preds.csv', index=False)

In [None]:
# Same export for the predictions stored in y_predictions1.
df_pred1 = users.to_frame(name='user_id').assign(prediction=y_predictions1)
df_pred1.to_csv('predictions/test_preds1.csv', index=False)