# 0.0. Understanding the problem

--Goal:
    
    --1.0 Predict the first destination that the new user will choose
    
    --Why?
        --What is the business model of Airbnb?
            --Marketplace (connects people who offer accommodation to those who are seeking a place)
            --Offer
                --Portfolio size
                --Portfolio diversity and density
                --Average price
                
            --Demand
                --Number of users
                --LTV (lifetime value)
                --CAC (client acquisition cost)
                
                Gross Revenue = (Fee*Number of clients) - CAC
                
--Solution:
    
    --Predictive model for the first destination of new user
    --1.0 Predictions saved in a database
    --2.0 API
        --Input: User and features
        --Output: User and features with the destination prediction
        

# Module 1

## 0.0. Imports

In [171]:
#!pip install keras
#!pip install tensorflow
#!pip install scikit-plot
#!pip install imblearn
#!pip install delayed
#!pip install pandas-profiling
#!pip install matplotlib

import random
import pickle

import numpy   as np
import pandas  as pd
import seaborn as sns

from sklearn    import model_selection as ms
from sklearn    import preprocessing   as pp
from sklearn    import metrics         as m
from scikitplot import metrics         as mt
from scipy      import stats           as ss
from imblearn   import under_sampling  as us
from imblearn   import over_sampling   as oversamp
from imblearn   import combine         as c
from keras      import models          as ml
from keras      import layers          as l
from matplotlib import pyplot          as plt

from pandas_profiling import ProfileReport

### 0.1. Helper Functions

In [172]:
def cramer_v( x, y ):
    """Return the bias-corrected Cramer's V between two categorical variables.

    Parameters
    ----------
    x, y : pandas.Series (or array-like accepted by ``pd.crosstab``)
        Categorical observations of equal length.

    Returns
    -------
    float
        Association strength, 0 for no association and 1 for a perfect one.
        ``nan`` when the statistic is undefined (fewer than two observations,
        a variable with a single level, or as many levels as observations).
    """
    cm = pd.crosstab( x, y ).values
    n = cm.sum()
    r, k = cm.shape

    # with one observation or a one-level variable there is nothing to associate
    if n <= 1 or min( r, k ) < 2:
        return np.nan

    chi2 = ss.chi2_contingency( cm )[0]

    # the bias correction applies to phi^2 = chi2/n, not to chi2 itself;
    # subtracting it from chi2 under-corrects and can yield values above 1
    phi2 = chi2 / n
    phi2corr = max( 0, phi2 - (k-1)*(r-1)/(n-1) )

    kcorr = k - (k-1)**2/(n-1)
    rcorr = r - (r-1)**2/(n-1)

    # the corrected dimension collapses when a variable has as many levels as rows
    denom = min( kcorr-1, rcorr-1 )
    if denom <= 0:
        return np.nan

    return np.sqrt( phi2corr / denom )

### 0.2. Loading Data

In [173]:
df_raw = pd.read_csv('dataset/train_users_2.csv', low_memory=True)
df_raw.shape

(213451, 16)

In [None]:
df_sessions = pd.read_csv('dataset/sessions.csv', low_memory=True)
df_sessions.shape

# 1.0. Data Description

In [None]:
df1 = df_raw.copy()

## 1.1. Data Dimensions

In [None]:
print('Number of rows: {}'.format(df1.shape[0]))
print('Number of columns: {}'.format(df1.shape[1]))

In [None]:
print('Number of rows: {}'.format(df_sessions.shape[0]))
print('Number of columns: {}'.format(df_sessions.shape[1]))

## 1.2. Data Type

In [None]:
df1.dtypes

In [None]:
df_sessions.dtypes

## 1.3. NA Check

In [None]:
df1.isna().sum()/len(df1)

In [None]:
df_sessions.isna().sum()/len(df_sessions)

In [None]:
# ========== Users =================
# date_first_booking: missing dates are replaced by the latest booking date
# found in the column, written as a 'YYYY-MM-DD' string
booking_dates = pd.to_datetime(df1['date_first_booking'])
date_first_booking_max = booking_dates.max().strftime('%Y-%m-%d')
df1['date_first_booking'] = df1['date_first_booking'].fillna(date_first_booking_max)

# age: keep only rows with 15 < age < 120, then fill what is left with the mean
# NOTE(review): NaN fails both comparisons, so rows with a missing age are
# already removed by this filter and the fillna below has nothing left to
# fill - confirm that dropping those users is intended.
age_in_range = df1['age'].gt(15) & df1['age'].lt(120)
df1 = df1[age_in_range]
avg_age = df1['age'].mean()
df1['age'] = df1['age'].fillna(avg_age)

# first_affiliate_tracked: drop rows without a value
df1 = df1[df1['first_affiliate_tracked'].notna()]


# ========== Sessions =================
# a session row is kept only when none of these columns is missing
required_session_cols = ['user_id', 'action', 'action_type', 'action_detail', 'secs_elapsed']
df_sessions = df_sessions[df_sessions[required_session_cols].notna().all(axis=1)]

## 1.4. Change Data Type

In [None]:
 df1.dtypes

In [None]:
# Convert the raw date columns to datetime64 and age to an integer.

#date_account_created
df1['date_account_created'] = pd.to_datetime(df1['date_account_created'])

#timestamp_first_active - stored as a compact YYYYmmddHHMMSS stamp, hence the explicit format
df1['timestamp_first_active'] = pd.to_datetime(df1['timestamp_first_active'], format='%Y%m%d%H%M%S')

#date_first_booking
df1['date_first_booking'] = pd.to_datetime(df1['date_first_booking'])

#age
# NOTE(review): `int` follows the platform default integer width (32-bit on
# Windows with NumPy < 2), while later cells pick numeric columns by 'int64'.
# The drop of 'age' from cat_attributes in 1.6.3 suggests age was not int64
# when this was run - confirm, and consider an explicit 'int64'.
df1['age'] = df1['age'].astype(int)

## 1.5. Check Balanced Data

In [None]:
df1['country_destination'].value_counts(normalize=True)

## 1.6. Descriptive Analysis

In [None]:
#Users
num_attributes = df1.select_dtypes(include=['int64','float64'])
cat_attributes = df1.select_dtypes(exclude=['int64','float64','datetime64[ns]'])
time_attributes = df1.select_dtypes(include=['datetime64[ns]'])

#Sessions
num_attributes_sessions = df_sessions.select_dtypes(include=['int64','float64'])
cat_attributes_sessions = df_sessions.select_dtypes(exclude=['int64','float64','datetime64[ns]'])
time_attributes_sessions = df_sessions.select_dtypes(include=['datetime64[ns]'])

### 1.6.1 Numerical - Users

In [None]:
# Summary table of the numeric user attributes: one row per attribute,
# one column per statistic. Each helper frame below is a single row
# (statistic) by attribute, obtained by transposing the applied Series.

#Central Tendency - Mean, Median
ct1 = pd.DataFrame(num_attributes.apply(np.mean)).T
ct2 = pd.DataFrame(num_attributes.apply(np.median)).T

#Dispersions - Std, Min, Max, Range, Skew, Kurtosis
d1 = pd.DataFrame(num_attributes.apply(np.std)).T
d2 = pd.DataFrame(num_attributes.apply(min)).T
d3 = pd.DataFrame(num_attributes.apply(max)).T
d4 = pd.DataFrame(num_attributes.apply(lambda x: x.max() - x.min())).T
d5 = pd.DataFrame(num_attributes.apply(lambda x: x.skew())).T
d6 = pd.DataFrame(num_attributes.apply(lambda x: x.kurtosis())).T

#Concat - stack the statistics, then transpose so attributes become rows;
# the concat order must match the column names assigned right after
ct = pd.concat([d2, d3, d4, ct1, ct2, d1, d5, d6]).T.reset_index()
ct.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
ct

### 1.6.2 Numerical - Sessions

In [None]:
# Summary table of the numeric session attributes: one row per attribute,
# one column per statistic. Reuses (and overwrites) the ct*/d* helper names
# of the users table.

#Central Tendency - Mean, Median
ct1 = pd.DataFrame(num_attributes_sessions.apply(np.mean)).T
ct2 = pd.DataFrame(num_attributes_sessions.apply(np.median)).T

#Dispersions - Std, Min, Max, Range, Skew, Kurtosis
d1 = pd.DataFrame(num_attributes_sessions.apply(np.std)).T
d2 = pd.DataFrame(num_attributes_sessions.apply(min)).T
d3 = pd.DataFrame(num_attributes_sessions.apply(max)).T
d4 = pd.DataFrame(num_attributes_sessions.apply(lambda x: x.max() - x.min())).T
d5 = pd.DataFrame(num_attributes_sessions.apply(lambda x: x.skew())).T
d6 = pd.DataFrame(num_attributes_sessions.apply(lambda x: x.kurtosis())).T

#Concat - stack the statistics, then transpose so attributes become rows;
# the concat order must match the column names assigned right after
ct = pd.concat([d2, d3, d4, ct1, ct2, d1, d5, d6]).T.reset_index()
ct.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
ct

### 1.6.3. Categorical - Users

In [None]:
# 'id' is an identifier and 'age' is numeric, so neither belongs in the
# categorical summary. 'age' is only present in cat_attributes when its dtype
# is not int64/float64 (it depends on what astype(int) produced), so a missing
# label must not raise.
cat_attributes.drop(columns=['id','age'], errors='ignore').describe()

### 1.6.4 - Categorical - Sessions

In [None]:
cat_attributes_sessions.drop('user_id', axis=1).describe()

In [None]:
#list of attributes for Cramer's V correlation
cat_attributes_list = cat_attributes_sessions.drop('user_id', axis=1).columns.tolist()

# corr_dict maps each reference attribute to its Cramer's V against every
# attribute, in the order of cat_attributes_list.
# `ref` and `feat` are plain loop variables (not comprehension variables)
# because the following cells read them after the loop.
corr_dict = {}
for ref in cat_attributes_list:
    corr_list = []
    for feat in cat_attributes_list:
        # correlation between the reference attribute and the compared one
        corr = cramer_v(cat_attributes_sessions[ref], cat_attributes_sessions[feat])
        corr_list.append(corr)

    # store the row of correlations of this reference attribute
    corr_dict[ref] = corr_list

In [None]:
corr_dict[feat]

In [None]:
corr_dict[ref]

In [None]:
d = pd.DataFrame(corr_dict)
d = d.set_index(d.columns)
sns.heatmap(d, annot=True)

# 2.0. Feature Engineering

In [None]:
df2 = df1.copy()

## 2.1. Create New Features

In [None]:
# calendar day of the first activity (time of day dropped)
df2['first_active'] = df2['timestamp_first_active'].dt.normalize()

# whole days elapsed between the user milestones
#days from first active up to first booking
df2['days_from_first_active_until_booking'] = (df2['date_first_booking'] - df2['first_active']).dt.days

#days from first active up to account created
df2['days_from_first_active_until_account_created'] = (df2['date_account_created'] - df2['first_active']).dt.days

#days from account created up to first booking
df2['days_from_account_created_until_first_booking'] = (df2['date_first_booking'] - df2['date_account_created']).dt.days

# NOTE(review): the booking-date features are built from a column whose
# missing values were filled in 1.3; presumably the date is unknown for users
# who have not booked, which would make these features leak the target - confirm.

# calendar parts of each date, as (feature suffix, source column);
# the order fixes the order of the created columns
date_sources = [
    ('first_active',    'first_active'),
    ('first_booking',   'date_first_booking'),
    ('account_created', 'date_account_created'),
]

for suffix, date_col in date_sources:
    dates = df2[date_col].dt

    df2['year_' + suffix] = dates.year
    df2['month_' + suffix] = dates.month
    df2['day_' + suffix] = dates.day
    df2['day_of_week_' + suffix] = dates.dayofweek

    # isocalendar() yields a nullable integer column; store a plain int64 instead
    df2['week_of_year_' + suffix] = np.asarray(dates.isocalendar().week).astype(np.int64)

In [None]:
df2.dtypes

# 3.0. Data Filtering

In [None]:
df3 = df2.copy()

In [None]:
df2.head().T

## 3.1. Filtering Rows

In [None]:
# Filtering rows:
# age - keep users older than 15 and younger than 120 - there are few people over 120 years old
df3 = df3[( df3['age'] > 15 ) & ( df3['age'] < 120 )]

## 3.2. Columns Selection

In [None]:
# original datetime columns: their information is already carried by the
# numeric features derived from them in 2.1
cols = ['date_account_created', 'date_first_booking', 'timestamp_first_active', 'first_active']
df3 = df3.drop(cols, axis=1)

# 4.0. Balanced Dataset

In [None]:
df4 = df3.copy()
#df4.shape

In [None]:
# Build the all-numeric matrix df42 consumed by the samplers below:
# numeric columns as they are plus one-hot encoded categorical columns.

#Encoder Categorical Variables
# kept in `ohe` because section 4.3 calls ohe.inverse_transform on the
# resampled data; note that section 7.2 rebinds the same name
ohe = pp.OneHotEncoder()

#Numerical
col_num = df4.select_dtypes(include=['int64', 'float64']).columns.tolist()

#Categorical - everything that is not int64/float64/datetime, except the id and the target
col_cat = df4.select_dtypes(exclude=['int64', 'float64', 'datetime64[ns]']).drop(['id', 'country_destination'], axis=1).columns.tolist()

#encoding - no column names are passed, so the dummy columns get positional integer labels;
# the original index is kept so the concat below aligns row by row
df4_dummy = pd.DataFrame(ohe.fit_transform( df4[ col_cat]).toarray(), index=df4.index)

#join numerical and categorical
df42 = pd.concat([df4[col_num], df4_dummy], axis=1)
df42.shape

## 4.1. Random Undersampling

In [None]:
#ratio_balanced
ratio_balanced = {'NDF': 10000}

#define sampler
undersampling = us.RandomUnderSampler(sampling_strategy=ratio_balanced, random_state=32)

#apply sampler
X_under, y_under = undersampling.fit_resample(df42, df4['country_destination'])

In [None]:
df4['country_destination'].value_counts()

In [None]:
y_under.value_counts()

## 4.2. Random Oversampling

In [None]:
#define sampler
oversampling = oversamp.RandomOverSampler(sampling_strategy='all', random_state=32)

#apply sampler
X_over, y_over = oversampling.fit_resample(df42, df4['country_destination'])

In [None]:
df4['country_destination'].value_counts()

In [None]:
y_over.value_counts()

## 4.3. SMOTE + TOMEKLINK

In [None]:
ratio_balanced =  {'NDF': 54852,
                   'US':  48057,
                   'other': 6*7511,
                   'FR': 12*3669,
                   'IT': 20*2014,
                   'GB': 30*1758,
                   'ES': 30*1685,
                   'CA': 40*1064,
                   'DE': 45*841,
                   'NL': 80*595,
                   'AU': 85*433,
                   'PT': 300*157}

In [None]:
#define sampler
smt = c.SMOTETomek(sampling_strategy=ratio_balanced, random_state=32, n_jobs=-1)

In [None]:
#apply sampler
#X_smt, y_smt = smt.fit_resample(df42, df4['country_destination'])

In [None]:
#pickle.dump(X_smt, open('X_smt.pkl','wb'))
#pickle.dump(y_smt, open('y_smt.pkl','wb'))

In [None]:
df4['country_destination'].value_counts()

In [None]:
y_over.value_counts()

In [None]:
# Load the SMOTE + Tomek-links resampled data saved by a previous run.
# Context managers make sure both file handles are closed after reading.
# NOTE(review): these absolute paths only exist on one machine, while the
# (commented-out) dump above writes to the working directory - confirm and
# consider relative paths.
# Only unpickle files you produced yourself: pickle.load can execute
# arbitrary code stored in the file.
with open('C:/Users/Henrique/repos/Airbnb/airbnb_predict_first_booking/X_smt.pkl', 'rb') as fh:
    X_smt = pickle.load(fh)

with open('C:/Users/Henrique/repos/Airbnb/airbnb_predict_first_booking/y_smt.pkl', 'rb') as fh:
    y_smt = pickle.load(fh)

In [None]:
#numerical data
df43 = X_smt[ col_num ]

#categorical data
df44 = X_smt.drop(col_num, axis=1)
df45 = pd.DataFrame(ohe.inverse_transform(df44), columns=col_cat, index=df44.index)

#join numerical categorical
df46 = pd.concat([df43, df45], axis=1)
df46['country_destination'] = y_smt

# 5.0. Exploratory Data Analysis

In [None]:
 df51 = df46.copy() #balanced dataset

In [None]:
 df52 = df4.copy() #unbalanced dataset

In [None]:
# NOTE(review): in the visible part of the file aux03 is only assigned in a
# commented-out line of a later cell (section 5.1, H01), so on a clean
# top-to-bottom run this cell raises NameError - confirm whether it should
# be removed or moved after that assignment.
aux03.sum()/len(aux03)

In [None]:
df52.dtypes.T

## 5.1. Hypothesis Validation - Unbalanced Dataset

**H01.** Em todos os destinos, os usuários levam 15 dias, em média, para fazer a primeira reserva no Airbnb, desde sua primeira ativação.

**Verdadeira.** Em todos os destinos, os usuários levam até 6 dias para reservar o primeiro Airbnb.

In [None]:
plt.figure(figsize=(20, 12))
plt.subplot(3, 1, 1)
aux01 = df52[['days_from_first_active_until_booking', 'country_destination']].groupby('country_destination').median().reset_index()
sns.barplot(x='country_destination', y='days_from_first_active_until_booking', 
             data=aux01.sort_values('days_from_first_active_until_booking'))

# remove outlier
plt.subplot(3, 1, 2)
aux02 = df52[(df52['country_destination'] != 'NDF') & (df52['country_destination'] != 'other')]
aux02 = aux02[['days_from_first_active_until_booking', 'country_destination']].groupby('country_destination').median().reset_index()
sns.barplot( x='country_destination', y='days_from_first_active_until_booking', 
             data=aux02.sort_values('days_from_first_active_until_booking'))

In [None]:
#aux03 = df52[(df52['days_from_first_active_until_booking'] < 10) & (df52['country_destination'] == 'US')]['days_from_first_active_until_booking']
#plt.boxplot(aux03)

**H02.** Em todos os destinos, os usuários levam 3 dias, em média, para fazer o cadastro no site.

**Verdadeira.** Em todos os destinos, os usuários levam até 2 dias para finalizar o cadastro

In [None]:
plt.figure(figsize=(20, 12))
aux01 = df52[['days_from_first_active_until_account_created', 'country_destination']].groupby('country_destination').mean().reset_index()
sns.barplot(x='country_destination', y='days_from_first_active_until_account_created', 
             data=aux01.sort_values('days_from_first_active_until_account_created'))

**H03.** O volume anual de reservas feitas durante o verão aumentou 20% para destinos dentro dos USA.

**Falsa.** O volume de reservas aumenta durante o verão entre os anos de 2010 e 2013.

In [None]:
aux01 = df52[['year_first_booking', 'month_first_booking', 'country_destination']].\
                groupby(['year_first_booking', 'month_first_booking', 'country_destination']). \
                size().reset_index().rename(columns={0:'count'})

# select only summer
aux01 = aux01[(aux01['month_first_booking'].isin([7, 8, 9])) & (aux01['country_destination'] == 'US')]

aux02 = aux01[['year_first_booking', 'count']].groupby('year_first_booking').sum().reset_index()

aux02['delta'] = 100*aux02['count'].pct_change().fillna(0)

plt.figure(figsize=(20,12))
sns.barplot(x='year_first_booking', y='delta', data=aux02)

In [None]:
# year-over-year change (%) of the summer booking count, keyed by the later year
perc_dict = {}

print(len(aux02['count']))

# pair every year with the following one; the last year has no successor,
# so the loop stops one row early whatever the number of years
for i in range(len(aux02['count']) - 1):
    print(i)
    y = aux02['year_first_booking'].iloc[i+1]
    perc = ((aux02['count'].iloc[i+1]*100)/(aux02['count'].iloc[i]))-100
    perc_dict[y] = perc

In [None]:
perc_dict

In [None]:
proof = ProfileReport(df51)
proof.to_notebook_iframe()

# 5.0. Data Preparation

In [None]:
df5 = df4.copy()

In [None]:
#dummy variable
df5_dummy = pd.get_dummies(df5.drop(['id','country_destination'], axis=1))

#join id and country destination
df5 = pd.concat([df5[['id','country_destination']], df5_dummy], axis=1)

In [None]:
df5.shape

# 6.0. Feature Selection

In [None]:
# original date columns; section 3.2 already removes them, so labels that are
# no longer present are ignored instead of raising
cols_drop = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 'first_active'] # original dates
df6 = df5.drop(cols_drop, axis=1, errors='ignore')

In [None]:
X = df6.drop(['id', 'country_destination'], axis=1)
y = df6['country_destination'].copy()

In [None]:
#split dataset into train and test
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=32)

# 7.0. Machine Learning

## 7.1. Baseline Model

In [None]:
country_destination_list = df1['country_destination'].drop_duplicates().sort_values().tolist()
k_num = y_test.shape[0]
country_destination_weights = df1['country_destination'].value_counts(normalize=True).sort_index().tolist()

yhat_random = random.choices(population=country_destination_list, 
                             weights=country_destination_weights,
                             k=k_num)

### 7.1.1. Baseline Performance

In [None]:
#accuracy
acc_random = m.accuracy_score(y_test, yhat_random)
print('Accuracy: {}'.format(acc_random))

#balanced accuracy
balanced_acc_random = m.balanced_accuracy_score(y_test, yhat_random)
print('Balanced Accuracy: {}'.format(balanced_acc_random))

#Kappa metrics
kappa_random = m.cohen_kappa_score(y_test, yhat_random)
print('Kappa Accuracy: {}'.format(kappa_random))

#Classification report
print( m.classification_report( y_test, yhat_random ) )

#Confusion Matrix
mt.plot_confusion_matrix( y_test, yhat_random, normalize=False, figsize=(12,12))

## 7.2. Neural Network - MLP

In [None]:
ohe = pp.OneHotEncoder()
y_train_nn = ohe.fit_transform(y_train.values.reshape(-1, 1)).toarray()

In [None]:
X_train.shape

In [None]:
# model definition
model = ml.Sequential()
model.add(l.Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(l.Dense(12, activation='softmax'))

# model compile
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train model
model.fit(X_train, y_train_nn, epochs=100)

### 7.2.1. NN Performance

In [None]:
#prediction
pred_nn = model.predict(X_test)

#invert prediction
yhat_nn = ohe.inverse_transform(pred_nn)

#prediction prepare
y_test_nn = y_test.to_numpy()
yhat_nn = yhat_nn.reshape(1, -1)[0]

In [None]:
#accuracy
acc_nn = m.accuracy_score(y_test_nn, yhat_nn)
print('Accuracy: {}'.format(acc_nn))

#balanced accuracy
balanced_acc_nn = m.balanced_accuracy_score(y_test_nn, yhat_nn)
print('Balanced Accuracy: {}'.format(balanced_acc_nn))

#Kappa metrics
kappa_nn = m.cohen_kappa_score(y_test_nn, yhat_nn)
print('Kappa Accuracy: {}'.format(kappa_nn))

#confusion matrix
mt.plot_confusion_matrix(y_test_nn, yhat_nn, normalize=False, figsize=(12,12))

### 7.2.2. NN Performance - Cross-Validation

In [None]:
# Stratified k-fold cross-validation of the MLP on the training split:
# a fresh network is trained per fold and scored on the held-out fold.

# generate k-fold
num_folds = 5
kfold = ms.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=32)

# one score per fold
balanced_acc_list = []
kappa_acc_list = []

for i, (train_ix, val_ix) in enumerate(kfold.split(X_train, y_train), start=1):
    print('Fold Number: {}/{}'.format(i, num_folds))

    #get fold
    x_train_fold = X_train.iloc[train_ix]
    y_train_fold = y_train.iloc[train_ix]

    x_val_fold = X_train.iloc[val_ix]
    y_val_fold = y_train.iloc[val_ix]

    #target one-hot encoding, fitted on the training part of the fold only
    ohe = pp.OneHotEncoder()
    y_train_fold_nn = ohe.fit_transform(y_train_fold.values.reshape(-1,1)).toarray()

    #model definition - rebuilt every fold so no weights carry over;
    #12 output units, one per destination class
    model = ml.Sequential()
    model.add(l.Dense(256, input_dim=X_train.shape[1], activation='relu'))
    model.add(l.Dense(12, activation='softmax'))

    #compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    #training model
    model.fit(x_train_fold, y_train_fold_nn, epochs=100, batch_size=32, verbose=0)

    #prediction - class probabilities mapped back to destination labels
    pred_nn = model.predict(x_val_fold)
    yhat_nn = ohe.inverse_transform(pred_nn)

    #prepare data - flatten both to 1-D label arrays
    y_test_nn = y_val_fold.to_numpy()
    yhat_nn = yhat_nn.reshape(1, -1)[0]

    #metrics
    ##Balanced Accuracy - the score of THIS fold must be the value appended;
    ##a misspelled target name here would append the stale hold-out score
    balanced_acc_nn = m.balanced_accuracy_score(y_test_nn, yhat_nn)
    balanced_acc_list.append(balanced_acc_nn)

    ##Kappa Metrics
    kappa_acc_nn = m.cohen_kappa_score(y_test_nn, yhat_nn)
    kappa_acc_list.append(kappa_acc_nn)


In [None]:
# persist the per-fold scores; the context managers flush and close each
# file as soon as it is written
with open('balanced_acc_list.pkl','wb') as fh:
    pickle.dump(balanced_acc_list, fh)

with open('kappa_acc_list.pkl','wb') as fh:
    pickle.dump(kappa_acc_list, fh)

In [None]:
# cross-validation summary: mean (2 decimals) +/- standard deviation (4 decimals)
print('Avg Balanced Accuracy: {} +/- {}'.format(np.round(np.mean(balanced_acc_list),2),
                                                np.round(np.std(balanced_acc_list),4)))
# the 4 belongs inside np.round; outside it the std is rounded to an integer
print('Avg Kappa Accuracy: {} +/- {}'.format(np.round(np.mean(kappa_acc_list),2),
                                             np.round(np.std(kappa_acc_list),4)))