In [None]:
import sys
print(sys.version)

In [None]:


%time

import numpy as np
import pandas as pd
import seaborn as sbn
import matplotlib
from matplotlib import pyplot
import matplotlib.pyplot as plt
import os
import json
from pandas.io.json import json_normalize
import datetime
from datetime import datetime
from ast import literal_eval
import copy
import pydot
import warnings
import gc

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn import neighbors
from sklearn.exceptions import DataConversionWarning

from keras import models
from keras import optimizers
from keras.models import *
from keras.layers.recurrent import LSTM
from keras.layers import Input, Dense, Activation, Reshape, Concatenate, Flatten, concatenate
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import plot_model

from bokeh.core.properties import value
from bokeh.io import show, output_file, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, LinearInterpolator, CategoricalColorMapper
from bokeh.transform import dodge
from bokeh.resources import INLINE
TOOLS = 'crosshair,save,pan,box_zoom,reset,wheel_zoom'
output_notebook()

from plotly import __version__
import plotly.offline
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode()

print('Package import complete')

In [None]:

DIR = '../input/'

In [None]:
gc.enable()

# Columns kept from the flattened export; load_df() selects exactly these
# from every chunk it reads.
features = [
    'channelGrouping',
    'date',
    'fullVisitorId',
    'visitId',
    'visitNumber',
    'visitStartTime',
    'device.browser',
    'device.deviceCategory',
    'device.isMobile',
    'device.operatingSystem',
    'geoNetwork.city',
    'geoNetwork.continent',
    'geoNetwork.country',
    'geoNetwork.metro',
    'geoNetwork.networkDomain',
    'geoNetwork.region',
    'geoNetwork.subContinent',
    'totals.bounces',
    'totals.hits',
    'totals.newVisits',
    'totals.pageviews',
    'totals.transactionRevenue',
    'trafficSource.adContent',
    'trafficSource.campaign',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.medium',
    'trafficSource.referralPath',
    'trafficSource.source',
    'customDimensions',
]

def load_df(csv_path):
    """Read a CSV in chunks, flatten its JSON columns and keep `features`.

    The file is read 100000 rows at a time. In each chunk the four JSON
    columns are parsed with ``json.loads``, expanded with ``json_normalize``
    into ``"<column>.<subcolumn>"`` columns and merged back on the row index.
    Only the module-level ``features`` columns are kept.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        All chunks stacked with a fresh 0..n-1 index; an empty DataFrame
        when the reader yields no chunk.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    reader = pd.read_csv(csv_path, sep=',',
            converters={column: json.loads for column in JSON_COLUMNS},
            dtype={'fullVisitorId': 'str'}, # Important!! ids must be read as text
            chunksize=100000)

    # Collect the reduced chunks and concatenate once at the end: growing a
    # DataFrame with concat inside the loop re-copies all earlier rows each time.
    kept_chunks = []
    for df in reader:
        # Chunks carry the file-wide row index; json_normalize returns a
        # 0-based one, so realign before the index-on-index merge below.
        df.reset_index(drop=True, inplace=True)
        for column in JSON_COLUMNS:
            column_as_df = json_normalize(df[column])
            column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
            df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

        kept_chunks.append(df[features])
        # Release the wide chunk before the next one is parsed.
        del df
        gc.collect()

    if not kept_chunks:
        return pd.DataFrame()
    return pd.concat(kept_chunks, axis=0).reset_index(drop=True)

In [None]:
%%time
#Import training data
csvfile=DIR +'train_v2.csv'
df_train=load_df(csvfile)
df_train.shape

In [None]:
%%time
#Import test data
csvfile=DIR +'test_v2.csv'
df_test=load_df(csvfile)
df_test.shape

In [None]:

train_backup = df_train
test_backup = df_test

print('Train dimension: ',df_train.shape)
print('Test  dimension: ',df_test.shape)

In [None]:

def format_data(df):
    """Clean a flattened visits frame and derive calendar columns.

    Steps, in order:
      1. placeholder strings such as '(not set)' become NaN;
      2. 'totals_transactionRevenue' is created from
         'totals.transactionRevenue' with NaN replaced by 0;
      3. selected numeric columns get NaN -> 0, selected categorical
         columns get NaN -> 'Unknown';
      4. a missing-data table is printed and every column with more than
         50% missing values is dropped;
      5. 'Date_time', 'Hour', 'Day', 'Day_of_week', 'Month', 'Week_number'
         and 'Year' are derived from 'visitStartTime' (epoch seconds);
      6. 'date' and 'visitStartTime' are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named above. It is not modified: the
        first step produces a new frame.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Replace uninformative placeholder words with NaN
    word_replace=['(not set)', '(none)', '(direct)', '(not provided)','not available in demo dataset', 'unknown.unknown']
    df = df.replace(word_replace, np.nan)

    # Missing revenue means no transaction: store it as 0 in a new column
    df['totals_transactionRevenue']=df['totals.transactionRevenue'].fillna(0)

    # Fill NaN with 0 in numeric columns and with 'Unknown' in categorical ones
    fillna_num = ['totals.bounces',
              'totals.newVisits',
              'totals.pageviews',
              'trafficSource.isTrueDirect']

    fillna_cat = ['trafficSource.keyword',
            'trafficSource.referralPath',
            'trafficSource.adContent',
            'trafficSource.source',
            'trafficSource.medium',
            'device.operatingSystem',
            'geoNetwork.networkDomain',
            'geoNetwork.subContinent',
            'geoNetwork.country',
            'geoNetwork.continent',
            'device.browser']

    for col in fillna_num:
        df[col] = df[col].fillna(0)

    for col in fillna_cat:
        df[col] = df[col].fillna('Unknown')

    # Report the number and percentage of NaN per column
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    count = null_counts.sort_values(ascending = False)
    percentage = ((null_counts/null_mask.count())*100).sort_values(ascending = False)
    unique = df.nunique()
    missing_data = pd.concat([count, percentage, unique], axis=1, keys=['Count', 'Percentage', 'Unique'], sort=False)
    print('Tableau indiquant le pourcentage des données manquantes: \n',missing_data, '\n')

    # Drop the columns with more than 50% missing data. `columns=` is used
    # because the positional axis argument of drop() was removed in pandas 2.0.
    mostly_missing = missing_data[missing_data['Percentage'] > 50].index
    df = df.drop(columns=mostly_missing)

    # Derive calendar columns from the visit start time (epoch seconds)
    df['Date_time'] = pd.to_datetime(df['visitStartTime'].astype(int), unit='s')
    df['Hour'] = df['Date_time'].dt.hour
    df['Day'] = df['Date_time'].dt.day
    df['Day_of_week'] = df['Date_time'].dt.dayofweek
    df['Month'] = df['Date_time'].dt.month
    # Kept as a zero-padded string here; cast to int by the caller if needed
    df['Week_number'] = df['Date_time'].dt.strftime('%V')
    df['Year'] = df['Date_time'].dt.year

    columns_to_drop = ['date','visitStartTime']

    df = df.drop(columns_to_drop, axis=1)

    print('Les colonnes contenant plus de 50% des données manquantes ont été supprimées\n')
    print('Shape  dataframe: ',df.shape, '\n')
    print('Nombre le plus élevé de valeurs NULL dans la trame de dataframe: ', max(df.isnull().sum()))

    return df

In [None]:
%%time

df_train = format_data(train_backup)

In [None]:
%%time

df_test = format_data(test_backup)

In [None]:
df_train.to_csv('df_train_clean.csv',header = True, index=False)
print('df_train export complete')

In [None]:
df_test.to_csv('df_test_clean.csv',header = True, index=False)
print('df_test export complete')

In [None]:
#Vérifier si les mêmes colonnes apparaissent dans les deux cadres de données
(df_train.columns.intersection(df_test.columns)).shape

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
# Cast boolean/object columns to int

col_to_int = ['device.isMobile', 'Week_number', 'totals.bounces','totals.hits', 'totals.newVisits', 'totals.pageviews']

for frame in (df_train, df_test):
    frame[col_to_int] = frame[col_to_int].astype(int)

# Cast totals_transactionRevenue to float ahead of the log transform
for frame in (df_train, df_test):
    frame['totals_transactionRevenue'] = frame['totals_transactionRevenue'].astype('float64')



In [None]:
print(df_train.info())
print(df_test.info())

In [None]:
#Combinez les données de train et de test pour une analyse exploratoire
df_combined = pd.concat([df_train, df_test], axis=0)


In [None]:
df_combined.head()

In [None]:
#Explore revenue distribution across months over the years
columns = ['Year', 'Month', 'totals_transactionRevenue']
group = df_combined[columns].groupby(['Year', 'Month']).sum().reset_index()

print(group)

plot = sbn.catplot(x = 'Month', y = 'totals_transactionRevenue', hue = 'Year', 
                       palette = ["#3498db","#2ecc71","#f1c40f"], data = group, kind = "bar",
                      height = 6, aspect = 2)

plot.set(xlabel='mois', ylabel='Sum de Revenue', title = 'Répartition des revenus sur plusieurs mois au fil des ans')

plot

In [None]:
columns3 = ['device.browser','device.isMobile']

# Number of rows per (browser, isMobile) pair. size() is used instead of
# aggregating 'device.browser' with 'count': that column is itself a grouping
# key, and selecting a key inside aggregate() is not reliable across pandas
# versions. The resulting column names are the same as before.
group3 = (df_combined[columns3]
          .groupby(columns3)
          .size()
          .reset_index(name='browser.count'))
group3 = group3.sort_values(by = 'browser.count', ascending = False).head(10)

print('Table showing the top 10 browsers preferred by users:\n')
print(group3)

#Plot top 10 browsers by usage
plot3 = sbn.catplot(x = 'device.browser', y = 'browser.count', hue = 'device.isMobile',
                       palette = ["#3498db","#f1c40f"], data = group3, kind = "bar",
                      height = 6, aspect = 2)

plot3.set(xlabel='Device Browser', ylabel='nombre d utilisation ', title = 'Top 10 des navigateurs préférés des utilisateurs')

print(plot3)

In [None]:
def generate_label(label,id_dfx):
    """Build the per-visit target table for a set of visitor ids.

    Rows of `label` whose 'fullVisitorId' is in `id_dfx` are kept, reduced
    to the columns 'fullVisitorId' and 'totals_transactionRevenue'. Every
    id of `id_dfx` that has no such row is appended with a revenue of 0.

    Parameters
    ----------
    label : pandas.DataFrame
        Visits of the label window; must contain 'fullVisitorId' and
        'totals_transactionRevenue'. It is not modified.
    id_dfx : list-like of visitor ids
        Ids of the visitors present in the training window.

    Returns
    -------
    pandas.DataFrame
        Matching label rows first (original order), then one zero row per
        unmatched id in first-appearance order of `id_dfx`, with a fresh
        0..n-1 index.
    """
    col_label=['fullVisitorId','totals_transactionRevenue']

    # Unique ids in first-appearance order. A set difference would return
    # them in hash order, which differs between kernels for string ids.
    visitor_ids = pd.Index(id_dfx).unique()

    # Keep only visits made by known visitors, and only the two label columns
    # (in the order they already have in `label`).
    kept = label[label.fullVisitorId.isin(visitor_ids)]
    kept = kept[[c for c in kept.columns if c in col_label]].reset_index(drop=True)

    # Visitors with no visit in the label window get an explicit 0 revenue.
    not_in_label = list(visitor_ids[~visitor_ids.isin(kept.fullVisitorId)])
    zeros = [0] * len(not_in_label)
    df_label_0 = pd.DataFrame(list(zip(not_in_label, zeros)), columns=col_label)

    return pd.concat([kept, df_label_0]).reset_index(drop=True)

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
def three_month_after(train_begin,train_end,label_begin,label_end,verbose=False):
    """Shift a train/label window by three months and slice `df_combined`.

    All four arguments are 'YYYY-MM-DD' strings. Each is moved forward by
    three calendar months; the shifted bounds (inclusive, compared as date
    strings against `df_combined.Date_time`) select the feature rows and
    the label rows. Labels are then built with `generate_label` for the
    visitor ids found in the feature rows.

    Returns
    -------
    (df_x, label) : tuple of pandas.DataFrame
        Feature rows with a fresh index, and the generated label table.
    """
    date_fmt = '%Y-%m-%d'
    quarter = relativedelta(months=3)

    # Order: train begin, train end, label begin, label end
    initial = [datetime.strptime(text, date_fmt)
               for text in (train_begin, train_end, label_begin, label_end)]
    shifted = [(moment + quarter).strftime(date_fmt) for moment in initial]
    train_lo, train_hi, label_lo, label_hi = shifted

    if verbose:
        sections = (("train---------------------------------", (0, 1)),
                    ("label---------------------------------", (2, 3)))
        for banner, positions in sections:
            print(banner)
            for pos in positions:
                print( 'intial:', initial[pos])
                print( 'After 3 Month:', shifted[pos])

    stamps = df_combined.Date_time

    # Feature window
    in_train = (stamps >= train_lo) & (stamps <= train_hi)
    df_x = df_combined[in_train].copy()
    df_x = df_x.reset_index(drop=True)

    # Label window
    in_label = (stamps >= label_lo) & (stamps <= label_hi)
    label = df_combined[in_label].copy()

    id_train = df_x.fullVisitorId.drop_duplicates()
    label = generate_label(label, id_train).copy()

    return df_x,label

In [None]:
# Test window: 2018-05-01 .. 2018-10-15 (bounds compared as date strings)
in_test_window = (df_combined.Date_time >= "2018-05-01") & (df_combined.Date_time <= "2018-10-15")
df_test_x = df_combined.loc[in_test_window].copy().reset_index(drop=True)
# df_test_y = "sample submission"

# Fold 1 ---------------------------------------------------------------
col = ['fullVisitorId', 'totals_transactionRevenue']

# Features: everything up to 2017-01-15
before_mid_january = df_combined.Date_time <= "2017-01-15"
df_train1_x = df_combined.loc[before_mid_january].copy().reset_index(drop=True)

# Labels: visits from 2017-03-01 to 2017-04-30
in_label_window = (df_combined.Date_time >= "2017-03-01") & (df_combined.Date_time <= "2017-04-30")
label_1 = df_combined.loc[in_label_window].copy()
id_train1 = df_train1_x.fullVisitorId.drop_duplicates()

# Build the label table for the fold-1 visitors
label_1 = generate_label(label_1, id_train1).copy()

In [None]:
df_train2_x,label_2=three_month_after("2016-08-01","2017-01-15","2017-03-01","2017-04-30",verbose=True)

In [None]:
df_train3_x,label_3=three_month_after("2016-11-01","2017-04-15","2017-06-01","2017-07-30",verbose=True)

In [None]:
df_train4_x,label_4=three_month_after("2017-02-01","2017-07-15","2017-09-01","2017-10-30",verbose=True)

In [None]:
#train5 
df_train5_x,label_5=three_month_after("2017-05-01","2017-10-15","2017-12-01","2018-01-30",verbose=True)

In [None]:
df_train6_x,label_6=three_month_after("2017-08-01","2018-01-15","2018-03-01","2018-04-30",verbose=True)


In [None]:
def group_by_fullVistorsId(df_x,y):
    """Aggregate visit rows and label rows to one row per visitor.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Visit-level features. Must contain 'fullVisitorId', 'Date_time',
        the categorical columns listed below and the "totals" columns.
    y : pandas.DataFrame
        Label rows with 'fullVisitorId' and 'totals_transactionRevenue'.

    Returns
    -------
    (df_x_agg, y_agg) : tuple of pandas.DataFrame
        Both sorted by 'fullVisitorId' with a fresh index. In `df_x_agg`
        every column whose name contains "totals" is summed per visitor and
        the categorical columns and 'Date_time' take the value returned by
        ``groupby().last()``. In both frames 'totals_transactionRevenue' is
        the ``log1p`` of the per-visitor sum.
    """
    cat_cols=['channelGrouping',
       'device.browser', 'device.deviceCategory',
       'device.operatingSystem', 'geoNetwork.continent', 'geoNetwork.country',
       'geoNetwork.networkDomain', 'geoNetwork.subContinent',
       'trafficSource.adContent', 'trafficSource.isTrueDirect',
       'trafficSource.keyword', 'trafficSource.medium',
       'trafficSource.referralPath', 'trafficSource.source',
       'customDimensions']

    # Categorical columns: keep the last value per visitor.
    # NOTE(review): "last" follows the row order of df_x, which is not sorted
    # by Date_time here - confirm that this is the intended notion of "last".
    last_cols = cat_cols + ["fullVisitorId","Date_time"]
    df_x_agg_last = (df_x[last_cols]
                     .groupby("fullVisitorId",as_index=False).last()
                     .sort_values("fullVisitorId").reset_index(drop=True))

    # Numeric columns: read them from the frame being aggregated rather than
    # from a notebook-level frame, so the function depends only on its inputs.
    num_cols = [item for item in df_x.columns if "totals" in item]
    sum_cols = num_cols + ["fullVisitorId"]
    df_x_agg_sum = (df_x[sum_cols]
                    .groupby("fullVisitorId",as_index=False).sum()
                    .sort_values("fullVisitorId").reset_index(drop=True))
    y_agg = (y[["fullVisitorId",'totals_transactionRevenue']]
             .groupby("fullVisitorId",as_index=False).sum()
             .sort_values("fullVisitorId").reset_index(drop=True))

    # Revenue is modelled on a log scale: log1p of the per-visitor sum
    df_x_agg_sum['totals_transactionRevenue'] = np.log1p(df_x_agg_sum['totals_transactionRevenue'])
    y_agg['totals_transactionRevenue'] = np.log1p(y_agg['totals_transactionRevenue'])

    # One row per visitor: summed numerics joined with last categoricals
    df_x_agg = (pd.merge(df_x_agg_sum,df_x_agg_last, how='left',on="fullVisitorId")
                .sort_values("fullVisitorId").reset_index(drop=True))
    return df_x_agg,y_agg

In [None]:
df_train1_agg,label1_agg=group_by_fullVistorsId(df_train1_x,label_1)

In [None]:
df_train2_agg,label2_agg=group_by_fullVistorsId(df_train2_x,label_2)

In [None]:
df_train3_agg,label3_agg=group_by_fullVistorsId(df_train3_x,label_3)
df_train4_agg,label4_agg=group_by_fullVistorsId(df_train4_x,label_4)
df_train5_agg,label5_agg=group_by_fullVistorsId(df_train5_x,label_5)
df_train6_agg,label6_agg=group_by_fullVistorsId(df_train6_x,label_6)

In [None]:
# Columns that are label-encoded and fed to the embedding inputs
cat_cols = [
    'channelGrouping',
    'fullVisitorId',
    'device.browser',
    'device.deviceCategory',
    'device.operatingSystem',
    'geoNetwork.continent',
    'geoNetwork.country',
    'geoNetwork.networkDomain',
    'geoNetwork.subContinent',
    'trafficSource.adContent',
    'trafficSource.isTrueDirect',
    'trafficSource.keyword',
    'trafficSource.medium',
    'trafficSource.referralPath',
    'trafficSource.source',
    'customDimensions',
]

# Columns that are min-max scaled and fed to the dense input
cont_cols = ['totals.bounces', 'totals.hits', 'totals.newVisits', 'totals.pageviews']

target = ['totals_transactionRevenue']

print(f'Number of categorical variable columns = {len(cat_cols)}')
print(f'Number of continuous variable columns = {len(cont_cols)}')

In [None]:
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
def conv_tr_data(df):
    """Min-max scale the `cont_cols` columns of `df`.

    A new MinMaxScaler is fitted on the given frame itself. The result is a
    DataFrame with the `cont_cols` column names and a default integer index.
    """
    raw_values = df[cont_cols].values
    scaled_values = preprocessing.MinMaxScaler().fit_transform(raw_values)
    return pd.DataFrame(scaled_values, columns = cont_cols)
def conv_te_data(df):
    """Min-max scale the `cont_cols` columns of `df` (test-side twin).

    A new MinMaxScaler is fitted on the given frame itself, so the scaling
    does not reuse any statistics from a training frame. The result is a
    DataFrame with the `cont_cols` column names and a default integer index.
    """
    raw_values = df[cont_cols].values
    scaled_values = preprocessing.MinMaxScaler().fit_transform(raw_values)
    return pd.DataFrame(scaled_values, columns = cont_cols)

In [None]:
df_train_cont1=conv_tr_data(df_train1_agg)


In [None]:
#df_train_cont1=conv_tr_data(df_train1_agg)
df_train_cont2=conv_tr_data(df_train2_agg)
df_train_cont3=conv_tr_data(df_train3_agg)
df_train_cont4=conv_tr_data(df_train4_agg)
df_train_cont5=conv_tr_data(df_train5_agg)
df_train_cont6=conv_tr_data(df_train6_agg)

In [None]:
df_test_cont2=conv_te_data(df_train2_agg)
df_test_cont3=conv_te_data(df_train3_agg)
df_test_cont4=conv_te_data(df_train4_agg)
df_test_cont5=conv_te_data(df_train5_agg)
df_test_cont6=conv_te_data(df_train6_agg)

In [None]:
#Define function for label encoding

def data_le(df):
    """Label-encode every column of `cat_cols` in `df`.

    A separate LabelEncoder is fitted per column on the values of `df`.
    The class count of a column is taken as its largest code plus one and
    is printed as the columns are processed.

    Returns
    -------
    (les, les_num_classes) : tuple of lists
        The encoded arrays and the class counts, both in `cat_cols` order.
    """
    print('nombre de calsse dans dataframe: \n')

    les, les_num_classes = [], []
    for column in cat_cols:
        codes = LabelEncoder().fit_transform(df[column])
        les.append(codes)

        class_count = np.max(codes)+1
        les_num_classes.append(class_count)

        print('Colonne '+str(column)+' nombre des classes -->'+str(class_count))

    return les, les_num_classes

In [None]:
def train_cat(df_train_agg):
    """Label-encode the categorical columns of an aggregated training frame.

    Returns
    -------
    (df_train_cat, les_num_classes_train) : tuple of pandas.DataFrame
        An int32 frame with one column per entry of `cat_cols`, and an
        int32 single-column frame ('Num_classes') of class counts.
    """
    encoded_columns, class_counts = data_le(df_train_agg)

    # data_le returns one array per column; transpose to one column each.
    df_train_cat = pd.DataFrame(encoded_columns).astype('int32').T
    df_train_cat.columns = cat_cols

    les_num_classes_train = pd.DataFrame(class_counts, columns = ['Num_classes'])
    les_num_classes_train = les_num_classes_train.astype('int32')
    return df_train_cat,les_num_classes_train

In [None]:
df_train_cat1,les_num_classes_train1=train_cat(df_train1_agg)
df_train_cat2,les_num_classes_train2=train_cat(df_train2_agg)
df_train_cat3,les_num_classes_train3=train_cat(df_train3_agg)
df_train_cat4,les_num_classes_train4=train_cat(df_train4_agg)
df_train_cat5,les_num_classes_train5=train_cat(df_train5_agg)
df_train_cat6,les_num_classes_train6=train_cat(df_train6_agg)

In [None]:
def test_cat(df_train_agg):
    """Label-encode the categorical columns of a validation-side frame.

    Same processing as the training-side helper: the encoders are fitted on
    the frame passed in.

    Returns
    -------
    (df_test_cat, les_num_classes_test) : tuple of pandas.DataFrame
        An int32 frame with one column per entry of `cat_cols`, and an
        int32 single-column frame ('Num_classes') of class counts.
    """
    encoded_columns, class_counts = data_le(df_train_agg)

    # data_le returns one array per column; transpose to one column each.
    df_test_cat = pd.DataFrame(encoded_columns).astype('int32').T
    df_test_cat.columns = cat_cols

    les_num_classes_test = pd.DataFrame(class_counts, columns = ['Num_classes'])
    les_num_classes_test = les_num_classes_test.astype('int32')
    return df_test_cat,les_num_classes_test

In [None]:
df_test_cat2,les_num_classes_test2=test_cat(df_train2_agg)
df_test_cat3,les_num_classes_test3=test_cat(df_train3_agg)
df_test_cat4,les_num_classes_test4=test_cat(df_train4_agg)
df_test_cat5,les_num_classes_test5=test_cat(df_train5_agg)
df_test_cat6,les_num_classes_test6=test_cat(df_train6_agg)

In [None]:
    #Prepare input dimension to be used in the embedding layer
def input_dim(les_num_classes_train,les_num_classes_test):
    """Return, per row, the larger of the two class counts.

    The two frames are placed side by side and the row-wise maximum is
    returned as a NumPy array. It is used as the vocabulary size of each
    embedding layer.
    """
    side_by_side = pd.concat([les_num_classes_train, les_num_classes_test], axis = 1)
    row_maximum = side_by_side.max(axis = 1)
    return np.array(row_maximum)

In [None]:
input_dimension1=input_dim(les_num_classes_train1,les_num_classes_test2)
input_dimension2=input_dim(les_num_classes_train2,les_num_classes_test3)
input_dimension3=input_dim(les_num_classes_train3,les_num_classes_test4)
input_dimension4=input_dim(les_num_classes_train4,les_num_classes_test5)
input_dimension5=input_dim(les_num_classes_train5,les_num_classes_test6)


In [None]:
#df_train_cat3,les_num_classes_train3=train_cat(df_train3_agg)

In [None]:
def seg_train(df_train_cont,df_train_cat,label_agg):
    """Bundle the training inputs with their target and print the shapes.

    Returns the continuous frame and the categorical frame unchanged,
    followed by `label_agg[target]`.
    """
    y_train = label_agg[target]

    for part_name, part in (('X_train_cont', df_train_cont),
                            ('X_train_cat', df_train_cat)):
        print('Shape of ' + part_name + ': ' + str(part.shape))
    print('Shape of y_train: '+str(y_train.shape), '\n')

    return df_train_cont,df_train_cat,y_train
#""""""""""""""""""""""""""""""""""""""""""""""
def seg_test(df_test_cont,df_test_cat,label_agg=None):
    """Bundle the test inputs (and target, if any) and print the shapes.

    Parameters
    ----------
    df_test_cont, df_test_cat
        Continuous and categorical inputs; returned unchanged.
    label_agg : optional
        Frame holding the `target` column(s). It used to be read from an
        undefined name, which made every call fail with NameError; it is now
        an optional argument because a test set may have no labels.

    Returns
    -------
    (X_test_cont, X_test_cat, y_test)
        `y_test` is `label_agg[target]`, or None when no labels are given.
    """
    X_test_cont = df_test_cont
    X_test_cat = df_test_cat
    y_test = None if label_agg is None else label_agg[target]
    print('Shape of X_test_cont: '+str(X_test_cont.shape))
    print('Shape of X_test_cat: '+str(X_test_cat.shape))
    if y_test is not None:
        print('Shape of y_test: '+str(y_test.shape), '\n')
    return X_test_cont,X_test_cat,y_test


In [None]:
X_train_cont1,X_train_cat1,y_train1=seg_train(df_train_cont1,df_train_cat1,label1_agg)
X_train_cont2,X_train_cat2,y_train2=seg_train(df_train_cont2,df_train_cat2,label2_agg)
X_train_cont3,X_train_cat3,y_train3=seg_train(df_train_cont3,df_train_cat3,label3_agg)
X_train_cont4,X_train_cat4,y_train4=seg_train(df_train_cont4,df_train_cat4,label4_agg)
X_train_cont5,X_train_cat5,y_train5=seg_train(df_train_cont5,df_train_cat5,label5_agg)
X_train_cont6,X_train_cat6,y_train6=seg_train(df_train_cont6,df_train_cat6,label6_agg)


In [None]:
    #Create the model
from keras import backend
def rmse(y_true, y_pred):
    """Root-mean-squared error over the whole batch (Keras metric).

    The mean is taken over every element of the batch before the square
    root. Averaging over the last axis only, as before, reduces a
    single-output model to sqrt(err**2) = |err| per sample, which Keras then
    averages - i.e. the value reported was a mean absolute error.
    """
    return backend.sqrt(backend.mean(backend.square(y_pred - y_true)))
callbacks_list=[]
def archi(input_dimension):
    """Build and compile the dense + embeddings regression network.

    The model has one 4-feature continuous input and one single-integer
    input per entry of `input_dimension`. Each integer input goes through
    a 1-dimensional Embedding whose vocabulary size is the matching entry
    of `input_dimension`. The continuous input goes through
    Dense(500, relu) -> Dense(1, linear). Everything is concatenated and
    fed to Dense(1000, relu) -> Dropout(0.2) -> Dense(500, relu) ->
    Dense(1, linear). The model is compiled with MSE loss, the Adam
    optimizer and the `rmse` metric.

    Side effect: the module-level `callbacks_list` is replaced by a list
    holding a ModelCheckpoint that saves the best 'val_rmse' weights to
    "weights.best.hdf5". It was previously assigned to a local variable,
    so the list used for training stayed empty.

    Parameters
    ----------
    input_dimension : sequence of int
        Vocabulary size of each categorical input, in input order
        (16 entries in this notebook).

    Returns
    -------
    keras Model
        The compiled model; inputs are ordered continuous first, then the
        categorical inputs.
    """
    global callbacks_list

    n_categorical = len(input_dimension)

    # Inputs: the continuous block first, then one scalar per categorical column
    input_cont = Input(shape=(4,))
    cat_inputs = [Input(shape=(1,)) for _ in range(n_categorical)]
    deep_inputs = [input_cont] + cat_inputs

    # One 1-d embedding per categorical input, then flatten (batch, 1, 1) -> (batch, 1)
    embedded = [
        Embedding(output_dim = 1, input_dim = input_dimension[position], input_length = 1)(cat_input)
        for position, cat_input in enumerate(cat_inputs)
    ]
    embedded = [Flatten()(layer) for layer in embedded]

    # Compress the continuous features to a single value
    cont_branch = Dense(500, activation = 'relu')(input_cont)
    cont_branch = Dense(1, activation = 'linear')(cont_branch)

    # Merge the continuous branch with the embedded categorical values
    merged_layer = concatenate([cont_branch] + embedded)

    # Hidden layers
    hidden1 = Dense(1000, activation = 'relu')(merged_layer)
    hidden1 = Dropout(0.2)(hidden1)
    hidden2 = Dense(500, activation = 'relu')(hidden1)

    # Output layer
    output = Dense(1, activation = 'linear')(hidden2)

    nnembedding_model = Model(inputs = deep_inputs, outputs = output)

    nnembedding_model.compile(loss='mse',
                           optimizer='adam',
                           metrics=[rmse])

    # Checkpoint the best validation score. The EarlyStopping callback that
    # was created here was never added to the list, so it has been removed.
    filepath="weights.best.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_rmse', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    return nnembedding_model

In [None]:
nnembedding_model=archi(input_dimension1)

In [None]:
nnembedding_model.summary()

In [None]:
   
def model_(nnembedding_model,X_train_cont,X_train_cat,y_train,X_val_cont,X_val_cat,y_val):
    """Fit the model on one fold and validate on another.

    The network receives the continuous block followed by one categorical
    column per embedding input, in the fixed order listed below. Training
    runs for 10 epochs with batches of 128 and uses the module-level
    `callbacks_list`.

    Returns
    -------
    The History object returned by ``fit``.
    """
    embedding_columns = (
        'channelGrouping',
        'fullVisitorId',
        'device.browser',
        'device.deviceCategory',
        'device.operatingSystem',
        'geoNetwork.continent',
        'geoNetwork.country',
        'geoNetwork.networkDomain',
        'geoNetwork.subContinent',
        'trafficSource.adContent',
        'trafficSource.isTrueDirect',
        'trafficSource.keyword',
        'trafficSource.medium',
        'trafficSource.referralPath',
        'trafficSource.source',
        'customDimensions',
    )

    def _as_model_inputs(cont_part, cat_part):
        # Continuous block first, then the categorical columns in model order
        return [cont_part] + [cat_part[name] for name in embedding_columns]

    train_inputs = _as_model_inputs(X_train_cont, X_train_cat)
    val_inputs = _as_model_inputs(X_val_cont, X_val_cat)

    history = nnembedding_model.fit(train_inputs,
                                    y_train,
                                    epochs=10, batch_size=128, verbose = 1,
                                    callbacks=callbacks_list,
                                    validation_data = (val_inputs, y_val))
    return history

In [None]:
nnembedding_model=archi(input_dimension1)
history1=model_(nnembedding_model,X_train_cont1,X_train_cat1,y_train1,X_train_cont2,X_train_cat2,y_train2)
pyplot.plot(history1.history['val_rmse'])
pyplot.show()

In [None]:
min(history1.history['val_rmse'])

In [None]:
nnembedding_model=archi(input_dimension2)
history2=model_(nnembedding_model,X_train_cont2,X_train_cat2,y_train2,X_train_cont3,X_train_cat3,y_train3)
pyplot.plot(history2.history['val_rmse'])
pyplot.show()

In [None]:
min(history2.history['val_rmse'])

In [None]:
nnembedding_model=archi(input_dimension3)
history3=model_(nnembedding_model,X_train_cont3,X_train_cat3,y_train3,X_train_cont4,X_train_cat4,y_train4)
pyplot.plot(history3.history['val_rmse'])
pyplot.show()

In [None]:
min(history3.history['val_rmse'])

In [None]:
nnembedding_model=archi(input_dimension4)
history4=model_(nnembedding_model,X_train_cont4,X_train_cat4,y_train4,X_train_cont5,X_train_cat5,y_train5)
pyplot.plot(history4.history['val_rmse'])
pyplot.show()

In [None]:
min(history4.history['val_rmse'])

In [None]:
nnembedding_model=archi(input_dimension5)
history5=model_(nnembedding_model,X_train_cont5,X_train_cat5,y_train5,X_train_cont6,X_train_cat6,y_train6)
pyplot.plot(history5.history['val_rmse'])
pyplot.show()

In [None]:
min(history5.history['val_rmse'])