Predicting new observations using pytorch model
==========

In [52]:
from torch import nn, optim
from torch.autograd import Variable
import torch
import torch.nn.functional as F
import torch.utils.data
import torch.optim as optim

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
import os

## clear GPU memory to optimize capacity
torch.cuda.empty_cache()

os.chdir(r'C:\Users\User\Documents\Data_Science_Projects\food-expo-attendee-prediction-project')

##### Pre-process the data to match the requirement of the model

In [53]:
# encoding = latin-1 was used here due to the characters that are unreadable when using the standard utf-8
data = pd.read_csv(r'.\data\gfd2019.csv', encoding='latin-1')

# add show columns
data['show'] = 'gfd'

  interactivity=interactivity, compiler=compiler, result=result)


In [54]:
data['Date Created']

0             1/19/2019
1             1/19/2019
2             1/19/2019
3        2/11/2019 0:00
4        2/11/2019 0:00
5        2/11/2019 0:00
6             2/19/2019
7             1/27/2019
8             1/27/2019
9        1/11/2019 0:00
10       1/11/2019 0:00
11       1/11/2019 0:00
12       1/11/2019 0:00
13       1/11/2019 0:00
14       1/11/2019 0:00
15       1/11/2019 0:00
16       1/11/2019 0:00
17       1/11/2019 0:00
18       1/11/2019 0:00
19       1/11/2019 0:00
20       1/11/2019 0:00
21       1/11/2019 0:00
22       1/11/2019 0:00
23       1/11/2019 0:00
24       1/11/2019 0:00
25       1/11/2019 0:00
26       1/11/2019 0:00
27       1/11/2019 0:00
28       1/11/2019 0:00
29       1/11/2019 0:00
              ...      
70014         1/21/2019
70015         1/21/2019
70016         1/21/2019
70017         1/21/2019
70018         1/21/2019
70019     2/8/2019 0:00
70020     2/8/2019 0:00
70021     1/9/2019 0:00
70022    2/11/2019 0:00
70023         1/18/2019
70024         2/

In [55]:
pd.to_datetime(data['Date Created'])

0       2019-01-19
1       2019-01-19
2       2019-01-19
3       2019-02-11
4       2019-02-11
5       2019-02-11
6       2019-02-19
7       2019-01-27
8       2019-01-27
9       2019-01-11
10      2019-01-11
11      2019-01-11
12      2019-01-11
13      2019-01-11
14      2019-01-11
15      2019-01-11
16      2019-01-11
17      2019-01-11
18      2019-01-11
19      2019-01-11
20      2019-01-11
21      2019-01-11
22      2019-01-11
23      2019-01-11
24      2019-01-11
25      2019-01-11
26      2019-01-11
27      2019-01-11
28      2019-01-11
29      2019-01-11
           ...    
70014   2019-01-21
70015   2019-01-21
70016   2019-01-21
70017   2019-01-21
70018   2019-01-21
70019   2019-02-08
70020   2019-02-08
70021   2019-01-09
70022   2019-02-11
70023   2019-01-18
70024   2019-02-19
70025   2019-02-06
70026   2019-01-08
70027   2019-01-08
70028   2019-01-03
70029   2019-02-06
70030   2019-01-22
70031   2019-02-05
70032   2019-02-05
70033   2019-02-08
70034   2019-02-16
70035   2019

In [56]:
## merge and drop columns
def pre_process(data, merge_columns, show_date): # merge_columns and drop_columns are relative to the show
    data['10 Digit Card Number'] = pd.to_numeric(data['10 Digit Card Number'])
    data['merged'] = data[merge_columns].apply(lambda x: ']' + x.astype(str), axis=1).apply(lambda x: r''.join(x.astype(str)), axis=1).str.replace(r' ', r'')
    data = data[['10 Digit Card Number', 'State', 'Country', 'Date Created', 'Email',
       'Website', 'show', 'merged']]
    data['show_date'] = pd.to_datetime(show_date)
    return data

# merge and drop for GFD
merge_columns = ['CATEGORY',
       'Company Industry*', 'Job Function*', 'Company\'s Main Activity',
       'Company\'s size (No. of employees)*', 'Job/Purchasing Role*',
       'I am interested in the following food/beverage products',
       'I am interested in the following food/beverage products - BAKERY',
       'I am interested in the following food/beverage products - BEVERAGES',
       'I am interested in the following food/beverage products - CHILLED & FRESH FOOD\r\n',
       'I am interested in the following food/beverage products - CONFECTIONARY\r\n',
       'I am interested in the following food/beverage products - DAIRY\r\n',
       'I am interested in the following food/beverage products - FROZEN FOOD',
       'I am interested in the following food/beverage products - GRAINS / CEREALS / FLOURS\r\n',
       'I am interested in the following food/beverage products - HEALTH, WELLNESS & FREE-FROM PRODUCTS\r\n',
       'I am interested in the following food/beverage products - MEAT & POULTRY\r\n',
       'I am interested in the following food/beverage products - SEAFOOD\r\n',
       'I am interested in the following food/beverage products - SNACKS ',
       'I am interested in the following food/beverage products - SPECIALITY FOOD',
       'I am interested in the following food/beverage products - FEDERAL GOVERNMENT',
       'I am interested in the following food/beverage products - LOCAL GOVERNMENT\r\n',
       'I am interested in the following food/beverage products - NON-PROFIT\r\n',
       'I am interested in the following food/beverage products - PROFESSIONAL SERVICES\r\n',
       'I am interested to attend the Gulfood Innovation Summit? ',
       'I want to receive a free ticket to attend PRIME - Private Label & Licensing Middle East (29-31 OCT 2019 | Dubai)',
       'Exhibitor I-Invite Product Sector', 'Gender', 'Terms & Conditions',
       'REGISTRATION TYPE', 'NATIONALITY - DWTC', 'Country of Company',
       'Language Code', 'UPLOAD CODES', 'Email Broadcast', 'PROMO CODE',
       'DWTC Booking Platform', 'Exhibition Packages', 'Payment Stages',
       'PAYMENT CHOICE', 'PAYMENT STATUS', 'PAYMENT MODE', 'PAYMENT DETAILS ',
       'UTM Values', 'DATABASE MANAGEMENT', 'DROP OFF MANAGEMENT',
       'DTCM - DEFAULT CODES', 'DTCM Basket Codes', 'Exhibitor Platform',
       'API Codes', 'AGE GROUP - Exhibitor',
       'Attendance Status (Telemarketing Codes)',
       'Reason for not Attending (Telemarketing Codes)',
       'Registration status (Telemarketing Codes)',
       'Actual Registration status (Telemarketing Codes) ***Automated - Please don\'t update!!!',
       'Exhibitor I-Invite (Code)', 'Recommend a Colleague Codes',
       'MAJLIS Management', 'MAJLIS Nominating Company']

data = pre_process(data, merge_columns=merge_columns, show_date='2019-02-17')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [57]:
'''def replace_codes(data, show):
    data['merged'] = r']'+data['merged']
    data['merged'] = data['merged'].str.replace(r']', r']' + show)
    return data
data = replace_codes(data, show='GFD19')'''

def with_website(data):
    if 'Website' in data.columns.values:
        data.loc[data['Website']==" ", 'with_website'] = 0
        data.loc[data['Website']!=" ", 'with_website'] = 1
        return data
    else:
        print('There is no website column')

def with_email(data):
    if 'Email' in data.columns.values:
        data.loc[data['Email']==" ", 'with_email'] = 0
        data.loc[data['Email']!=" ", 'with_email'] = 1
        return data
    else:
        print('There is no Email column')

data = with_website(data)
#data = with_email(data)
data = data.drop(['Email', 'Website'], axis=1)

In [58]:
def days_to_go_reg(data):
    if 'Date Created' in data.columns.values:
        data['Date Created'] = pd.to_datetime(data['Date Created'])
        difference = data['show_date'] - data['Date Created']
        return difference
    else:
        print('There is no Date Created column')

difference = days_to_go_reg(data)
data['days_to_go'] = difference.dt.days
data['weeks_to_go'] = round(data['days_to_go']/7)
data = data.drop(['Date Created', 'show_date'], axis=1)

In [59]:
# Load regions data
region = pd.read_excel(r'.\data\region.xlsx')

In [60]:
def cleanup_country(data, region):
    if 'Country' in data.columns.values:
        data.loc[pd.isnull(data['Country']), 'Country'] = 'United Arab Emirates' # replace blank countries with UAE
        data.loc[data['Country']=='', 'Country'] = 'United Arab Emirates' # replace blank countries with UAE
        data.loc[data['Country']==' ', 'Country'] = 'United Arab Emirates' # replace blank countries with UAE
        
        data = data.merge(region, left_on = 'Country', right_on = 'country', how = 'left')
        no_region = data.loc[pd.isnull(data['region_2'])]
        data = data.drop('Country', axis=1)
        return data, no_region
    else:
        print('There is no Country column')

data, no_region = cleanup_country(data, region)

In [61]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

distance = pd.Series([])
for i in range(len(data.index)):
    lon1 = 53.847818
    lat1 = 23.424076
    lon2 = data.loc[i,['longitude']]
    lat2 = data.loc[i,['latitude']]
    dist = pd.Series(haversine(lon1, lat1, lon2, lat2))
    distance = distance.append(dist, ignore_index=True)
    
data['distance'] = distance

In [62]:
def create_state_group(data):
    if 'State' in data.columns.values:
        data['State'] = data['State'].str.lower().str.replace(r' ', r'_')
        data.loc[data['country']!= 'United Arab Emirates', 'State'] = 'international_state'
        data.loc[data['State']== r' ', 'State'] = 'dubai'
        data.loc[data['State']== r'_', 'State'] = 'dubai'
        data.loc[data['State']== r'', 'State'] = 'dubai'
        return data
    else:
        print('There is no Country column')
data_1 = create_state_group(data)

In [63]:
# Load the codes data
codes = pd.read_excel(r'.\data\codes.xlsx')

In [64]:
def dummify_responses(data, codes):
    columns_to_check = {'merged', '10 Digit Card Number'}
    for cols in columns_to_check:
        if cols not in data.columns.values:
            print('There is no ',cols,' column')
            break
    else:
        responses = data['merged'].str.split(r']', expand=True)
        responses['10 Digit Card Number'] = data['10 Digit Card Number']
        responses = responses.melt(id_vars=['10 Digit Card Number'], value_name = 'code')
        responses['value'] = 1
        
        responses = responses.merge(codes, left_on = 'code', right_on = 'code', how = 'left')
        responses = responses.loc[responses['included'] == 'Y']
        responses = responses.drop(['show', 'question', 'code', 'text_answer', 'included'], axis=1)
        responses = responses.pivot_table(index = '10 Digit Card Number', columns = 'decode', values = 'value', aggfunc = 'max')
        #responses.loc[responses['Attended']!=1, 'Attended'] = 0
        return responses

responses = dummify_responses(data_1, codes)
data_1 = data_1.merge(responses, on='10 Digit Card Number', how = 'left').drop('merged', axis=1)

In [65]:
def dummify_columns(data, columns):
    columns_to_check = {'10 Digit Card Number'}
    for cols in columns_to_check:
        if cols not in data.columns.values:
            print('There is no ',cols,' column')
            break
    data_1 = data[columns]
    data_1 = pd.get_dummies(data_1[columns])
    data_1['10 Digit Card Number'] = data['10 Digit Card Number']
    data = data.merge(data_1, on='10 Digit Card Number', how = 'left')
    data = data.drop(columns, axis=1)
    return data

#data = data.drop('country', axis=True)
columns = ['State', 'country', 'region_1', 'region_2']
data_1 = dummify_columns(data_1, columns)
data_1 = data_1.fillna(0)

In [66]:
## count registrants per company
gfd = pd.read_csv(r'.\data\gfd2019.csv', encoding='latin-1')
gfd = gfd[['10 Digit Card Number', 'Company Name', 'Country', 'Website']]

def company_reg_counts(data):
    data['country_company'] = data[['Company Name', 'Country']].apply(lambda x: r''.join(x.astype(str)), axis=1).str.lower().str.replace(r' ', r'')
    count_per_company = data[data['Company Name']!=' '].country_company.value_counts().rename_axis('x').reset_index(name='count_per_company')
    data['country_website'] = data[['Website', 'Country']].apply(lambda x: r''.join(x.astype(str)), axis=1).str.lower().str.replace(r' ', r'')
    count_per_comp_website = data[data['Website']!=' '].country_company.value_counts().rename_axis('y').reset_index(name='count_per_comp_website')
    count_per_website = data[data['Website']!=' '].Website.value_counts().rename_axis('z').reset_index(name='count_per_website')
    data = data.merge(count_per_company, left_on = 'country_company', right_on = 'x', how = 'left')
    data = data.merge(count_per_comp_website, left_on = 'country_website', right_on = 'y', how = 'left')
    data = data.merge(count_per_website, left_on = 'Website', right_on = 'z', how = 'left')
    data = data[['10 Digit Card Number', 'count_per_company', 'count_per_comp_website', 'count_per_website']].fillna(1)
    return data

gfd_count = company_reg_counts(gfd)

data_1 = data_1.merge(gfd_count, on='10 Digit Card Number', how='left')
ids = data_1['10 Digit Card Number']

  interactivity=interactivity, compiler=compiler, result=result)


In [67]:
'''def preprocess_data(data):
    data = data.drop('show', axis=True)
    summary = data.describe().transpose()
    cols = data[summary[summary['max']>1].reset_index()['index'].tolist()]
    cols = cols.columns[1:]

    # First, scale the Data - only those numerical/non-categorical
    names = data.columns
    scaler = preprocessing.StandardScaler()
    # Fit your data on the scaler object
    scaled_data = scaler.fit_transform(data)
    scaled_data = pd.DataFrame(scaled_data, columns=names)
    scaled_data = scaled_data[cols] ###------------------->> cols are non-categorical columns
    data = data.drop(cols, axis=1)
    data = pd.concat([scaled_data, data], axis=1)
    return data
data = preprocess_data(data)'''

"def preprocess_data(data):\n    data = data.drop('show', axis=True)\n    summary = data.describe().transpose()\n    cols = data[summary[summary['max']>1].reset_index()['index'].tolist()]\n    cols = cols.columns[1:]\n\n    # First, scale the Data - only those numerical/non-categorical\n    names = data.columns\n    scaler = preprocessing.StandardScaler()\n    # Fit your data on the scaler object\n    scaled_data = scaler.fit_transform(data)\n    scaled_data = pd.DataFrame(scaled_data, columns=names)\n    scaled_data = scaled_data[cols] ###------------------->> cols are non-categorical columns\n    data = data.drop(cols, axis=1)\n    data = pd.concat([scaled_data, data], axis=1)\n    return data\ndata = preprocess_data(data)"

In [68]:
def match_columns(data):
    columns_1 = pd.read_pickle(r'.\data\output\columns_used_for_model.pkl')
    columns_1 = np.array(columns_1['cols'])
    columns_2 = data.columns
    newcols = list(set(columns_1) - set(columns_2))
    for cols in newcols:
        data[cols] = 0
    delCols = list(set(columns_2) - set(columns_1))
    data = data.drop(delCols, axis=1)
    return data
data_1 = match_columns(data_1)
data_1['10 Digit Card Number'] = ids

In [69]:
def scale_data(data):
    cols = pd.read_csv(r'.\data\output\mean_std_scaler.csv')
    for i in range(len(cols)):
        column = cols['feature'][i]
        mean = cols['mean'][i]
        std = cols['std'][i]
        data[column] = (data[column]-mean)/std
    return data

data_1 = scale_data(data_1)

In [70]:
data_1.to_pickle(r'.\data\output\dataForPreds.pkl')

In [71]:
data_1.columns

Index(['show', 'with_website', 'days_to_go', 'weeks_to_go', 'latitude',
       'longitude', 'distance', 'arabic_page', 'bakery', 'bar_cafe_restaurant',
       ...
       'country_Svalbard And Jan Mayen Islands', 'country_Taiwan', 'exhibitor',
       'country_British Indian Ocean Territory', 'country_Guam', 'female',
       'country_Tonga', 'male', 'warehousing_distribution',
       '10 Digit Card Number'],
      dtype='object', length=278)

In [72]:
a = data_1.describe().stack()['max']
a = pd.DataFrame(a)
a.to_csv(r'.\data\output\a.csv')

In [73]:
data_1['days_to_go']

0        29
1        29
2        29
3         6
4         6
5         6
6        -2
7        21
8        21
9        37
10       37
11       37
12       37
13       37
14       37
15       37
16       37
17       37
18       37
19       37
20       37
21       37
22       37
23       37
24       37
25       37
26       37
27       37
28       37
29       37
         ..
70122    27
70123    27
70124    27
70125    27
70126    27
70127     9
70128     9
70129    39
70130     6
70131    30
70132    -2
70133    11
70134    40
70135    40
70136    45
70137    11
70138    26
70139    12
70140    12
70141     9
70142     1
70143    14
70144    52
70145    52
70146    10
70147    10
70148    -2
70149    37
70150    37
70151    37
Name: days_to_go, Length: 70152, dtype: int64