Data preparation process for events show
==================
**Prepared by** : Grej - Mar 11, 2019

###### Overview: 
The data is filtered to pre-registrants only.  
This means that none of the files uploaded by the organizers were included.


In [339]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import geopy.distance
import math
import gc

In [340]:
# Count the number of registrations per company; the counts are used as new features.

# Registrants are counted per three keys: company name + country, website + country, and website only.

def company_reg_counts(data, data_type):
    """Count how many registrants share a company, using three grouping keys.

    The raw registration CSV for ``data_type`` is re-read from disk, so the
    ``data`` argument is accepted only for backward compatibility and is
    not used.

    Grouping keys (values are concatenated as strings, lower-cased and
    stripped of spaces):
      * ``count_per_company``      - company name + country
      * ``count_per_comp_website`` - website + country
      * ``count_per_website``      - website only (raw value)

    Rows whose company name / website is a single blank (' ') are left out
    of the respective counts; registrants without a matching count get 1.

    Side effects: changes the working directory to the project folder and
    writes the result to ``.\\data\\output\\company_reg_count.pkl``.

    Returns:
        DataFrame with '10 Digit Card Number' and the three count columns.
    """
    os.chdir(r'C:\Users\User\Documents\Data_Science_Projects\middle-east-event-show-prediction-project')
    data_path = data_type+'_raw_data.csv'
    # encoding = latin-1 was used here due to the characters that are unreadable when using the standard utf-8
    data = pd.read_csv(r'.\\data\\'+data_path, encoding='latin-1')
    # .copy() so the helper key columns are added to an independent frame
    # rather than to a slice of the frame that was just read.
    data = data[['10 Digit Card Number', 'Company Name', 'Country', 'Website']].copy()

    def make_key(first, second):
        # Concatenate two columns into a normalised, space-free lookup key.
        joined = data[first].astype(str) + data[second].astype(str)
        return joined.str.lower().str.replace(r' ', r'')

    data['country_company'] = make_key('Company Name', 'Country')
    data['country_website'] = make_key('Website', 'Country')

    has_company = data['Company Name'] != ' '
    has_website = data['Website'] != ' '

    count_per_company = (data.loc[has_company, 'country_company'].value_counts()
                         .rename_axis('x').reset_index(name='count_per_company'))
    # Fix: count the website+country key. The previous code counted the
    # company+country key here, which never matched the merge key below,
    # so this feature was always filled with 1.
    count_per_comp_website = (data.loc[has_website, 'country_website'].value_counts()
                              .rename_axis('y').reset_index(name='count_per_comp_website'))
    count_per_website = (data.loc[has_website, 'Website'].value_counts()
                         .rename_axis('z').reset_index(name='count_per_website'))

    data = data.merge(count_per_company, left_on = 'country_company', right_on = 'x', how = 'left')
    data = data.merge(count_per_comp_website, left_on = 'country_website', right_on = 'y', how = 'left')
    data = data.merge(count_per_website, left_on = 'Website', right_on = 'z', how = 'left')
    # Registrants excluded from a count (blank company / website) default to 1.
    company_reg_counts_data = data[['10 Digit Card Number', 'count_per_company', 'count_per_comp_website', 'count_per_website']].fillna(1)
    company_reg_counts_data.to_pickle(r'.\data\output\company_reg_count.pkl')
    return company_reg_counts_data

In [341]:
def load_data(data_type):
    """Read the raw registration CSV for ``data_type`` and tag it with show info.

    Changes the working directory to the project folder, reads
    ``<data_type>_raw_data.csv`` from the data folder, adds the constant
    ``show`` and ``show_date`` columns and renames ``Responses`` to ``merged``.
    """
    project_dir = r'C:\Users\User\Documents\Data_Science_Projects\middle-east-event-show-prediction-project'
    os.chdir(project_dir)

    # latin-1 is required: the file contains characters utf-8 cannot decode.
    csv_path = r'.\\data\\' + data_type + '_raw_data.csv'
    raw = pd.read_csv(csv_path, encoding='latin-1')

    # Constant show columns, then the column name the later steps expect.
    raw['show'] = 'mese'
    raw['show_date'] = pd.to_datetime('4/22/2018')
    return raw.rename(columns={'Responses': 'merged'})

In [342]:
# Disabled helper, kept as a bare string literal so that it is never executed.
# The quoted code would read every CSV under .\data\adestra, flag each e-mail
# as having opened a broadcast, count the opens per e-mail, de-duplicate and
# pickle the result. Because the string is the cell's last expression, the
# notebook echoes it back as the cell output.
# NOTE(review): the quoted code relies on DataFrame.append, which was removed
# in pandas 2.0 - port it to pd.concat before re-enabling.
'''def append_all_files(files):
    
    adestra = pd.DataFrame()
    for file in files:
        path = str('.\\data\\adestra\\' + file)
        data = pd.read_csv(path)
        adestra = adestra.append(data, ignore_index=True)
    adestra['opened_email_broadcast'] = 1
    adestra = adestra[['email', 'opened_email_broadcast']].rename(columns={'email': 'Email'})
    count_number_email_opened = adestra.Email.value_counts().rename_axis('Email').reset_index(name='count_number_email_opened')
    adestra = adestra.merge(count_number_email_opened, on='Email', how='left')
    adestra = adestra.drop_duplicates(subset=['Email', 'opened_email_broadcast'], keep='first').reset_index().drop('index', axis=True)
    adestra.to_pickle(r'.\data\output\adestra_data.pkl')
    return adestra'''

"def append_all_files(files):\n    \n    adestra = pd.DataFrame()\n    for file in files:\n        path = str('.\\data\\adestra\\' + file)\n        data = pd.read_csv(path)\n        adestra = adestra.append(data, ignore_index=True)\n    adestra['opened_email_broadcast'] = 1\n    adestra = adestra[['email', 'opened_email_broadcast']].rename(columns={'email': 'Email'})\n    count_number_email_opened = adestra.Email.value_counts().rename_axis('Email').reset_index(name='count_number_email_opened')\n    adestra = adestra.merge(count_number_email_opened, on='Email', how='left')\n    adestra = adestra.drop_duplicates(subset=['Email', 'opened_email_broadcast'], keep='first').reset_index().drop('index', axis=True)\n    adestra.to_pickle(r'.\\data\\output\x07destra_data.pkl')\n    return adestra"

##### Dummify all responses

In [343]:
def dummify_responses(data, codes):
    """One-hot encode the ']'-separated response codes of every registrant.

    ``data`` must contain the columns 'merged' (the raw response string) and
    '10 Digit Card Number'. ``codes`` is the lookup table joined on 'code';
    only rows flagged ``included == 'YES'`` are kept, and the result is
    pivoted to one row per card number and one column per 'decode' value.

    Prints a row-count checkpoint after each step. If a required column is
    missing, a message is printed and None is returned.
    """
    for required in {'merged', '10 Digit Card Number'}:
        if required not in data.columns.values:
            print('There is no ',required,' column')
            return None

    card_col = '10 Digit Card Number'

    def checkpoint(label, frame):
        # Progress trace: step label plus the current number of rows.
        print('checkpoint ' + label + ' -- ' + str(len(frame)))

    # One column per code position in the response string.
    wide_codes = data['merged'].str.split(r']', expand=True)
    checkpoint('1', wide_codes)

    wide_codes[card_col] = data[card_col]
    checkpoint('2', wide_codes)

    # Long format: one row per (registrant, code position).
    long_codes = wide_codes.melt(id_vars=[card_col], value_name='code')
    long_codes['value'] = 1
    checkpoint('3', long_codes)

    decoded = long_codes.merge(codes, on='code', how='left')
    checkpoint('4', decoded)
    print(decoded.columns)
    print(decoded['included'].value_counts())

    kept = decoded.loc[decoded['included'] == 'YES']
    checkpoint('4.1', kept)
    lookup_only_columns = ['show', 'question', 'code', 'code_2', 'text_answer', 'included', 'job_rank']
    kept = kept.drop(lookup_only_columns, axis=1)
    checkpoint('5', kept)

    dummies = kept.pivot_table(index=card_col, columns='decode', values='value', aggfunc='max')
    checkpoint('6', dummies)
    return dummies


Feature Engineering
======================

##### Add email and website features
This part is a feature engineering step.  
The reasoning is that whether a registrant entered a website and an email address might be correlated with attendance.  
One possible reason is that registrants who provide a website and an email address are more interested, and their companies are more active in the industry.

The distance of the registrant's country from the UAE might also be a factor.  
The hypothesis is that those who come from farther away are less likely to attend.

In [344]:
# calculate distance of country from UAE.

def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance in kilometres between two points on the earth.

    Args:
        lon1, lat1: longitude and latitude of the first point, decimal degrees.
        lon2, lat2: longitude and latitude of the second point, decimal degrees.

    Returns:
        The distance in kilometres as a float.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make asin raise a domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r


In [345]:
def with_website(data):
    """Add a ``with_website`` flag column to ``data`` in place.

    The flag is 0 where 'Website' is exactly a single blank (" ") and 1
    everywhere else. Returns the same frame, or prints a message and
    returns None when there is no 'Website' column.
    """
    if 'Website' not in data.columns.values:
        print('There is no website column')
        return None

    blank_website = data['Website'] == " "
    data.loc[blank_website, 'with_website'] = 0
    data.loc[~blank_website, 'with_website'] = 1
    return data

# Disabled helper, kept as a bare string literal so that it is never executed.
# The quoted code would left-join the pickled Adestra e-mail data on 'Email'
# and add a `with_email` flag (0 where 'Email' is a single blank, 1 otherwise).
# Because the string is the cell's last expression, the notebook echoes it
# back as the cell output.
'''def with_email(data):
    if 'Email' in data.columns.values:
        adestra = pd.read_pickle(r'.\data\output\adestra_data.pkl')
        data = data.merge(adestra, on='Email', how='left')
        data.loc[data['Email']==" ", 'with_email'] = 0
        data.loc[data['Email']!=" ", 'with_email'] = 1
        return data
    else:
        print('There is no Email column')
'''

'def with_email(data):\n    if \'Email\' in data.columns.values:\n        adestra = pd.read_pickle(r\'.\\data\\output\x07destra_data.pkl\')\n        data = data.merge(adestra, on=\'Email\', how=\'left\')\n        data.loc[data[\'Email\']==" ", \'with_email\'] = 0\n        data.loc[data[\'Email\']!=" ", \'with_email\'] = 1\n        return data\n    else:\n        print(\'There is no Email column\')\n'

##### Add days_to_go and weeks_to_go feature
The hypothesis is that those who register close to the date of the show are more likely to attend.

In [346]:
def days_to_go_reg(data):
    """Return the time between registration and the show for every row.

    Converts 'Date Created' to datetime in place on ``data`` and returns
    ``show_date - Date Created`` as a timedelta Series. Prints a message and
    returns None when there is no 'Date Created' column.
    """
    if 'Date Created' not in data.columns.values:
        print('There is no Date Created column')
        return None

    created_on = pd.to_datetime(data['Date Created'])
    data['Date Created'] = created_on
    return data['show_date'] - created_on


##### Create groupings for UAE States

In [347]:
def create_state_group(data):
    """Normalise the 'State' column of ``data`` in place and print a summary.

    States are lower-cased with spaces turned into underscores; rows whose
    'country' is not 'United Arab Emirates' become 'international_state',
    and blank states become 'dubai'. Returns the same frame, or prints a
    message and returns None when there is no 'State' column.
    """
    if 'State' not in data.columns.values:
        print('There is no State column')
        return None

    data['State'] = data['State'].str.lower().str.replace(r' ', r'_')

    outside_uae = data['country'] != 'United Arab Emirates'
    data.loc[outside_uae, 'State'] = 'international_state'

    # A blank state shows up as ' ' or, after the replacement above, as '_'.
    blank_state = data['State'].isin([r' ', r'_'])
    data.loc[blank_state, 'State'] = 'dubai'

    print('\n State Summary\n')
    print(data['State'].value_counts())
    return data

##### Cleanup the country and group into regions, add the distance of countries from UAE

In [348]:
def cleanup_country(data, region):
    """Fill blank countries with the UAE and attach the region lookup.

    Missing, empty ('') and single-blank (' ') values of 'Country' are
    replaced in place with 'United Arab Emirates'. ``region`` is then
    left-joined on Country == country.

    Returns a tuple ``(merged, no_region)`` where ``no_region`` holds the
    rows without a 'region_2' match. Prints a region summary. When there is
    no 'Country' column a message is printed and None is returned.
    """
    if 'Country' not in data.columns.values:
        print('There is no Country column')
        return None

    country = data['Country']
    # replace blank countries with UAE
    blank_country = pd.isnull(country) | (country == '') | (country == ' ')
    data.loc[blank_country, 'Country'] = 'United Arab Emirates'

    merged = data.merge(region, left_on='Country', right_on='country', how='left')
    unmatched = merged.loc[pd.isnull(merged['region_2'])]

    print('\n Region Summary\n')
    print(merged['region_2'].value_counts())
    print('\nNumber of No Region -- '+ str(len(unmatched)))
    return merged, unmatched


In [349]:
def mean_encode(data, columns, target='attended'):
    """Add a mean-encoded feature for each of ``columns`` to ``data`` in place.

    For every column ``c`` a new column ``mean_encode_<c>`` is created that
    holds, for each row, the mean of ``target`` over all rows sharing that
    row's value of ``c``. A summary of each new column is printed.

    Args:
        data: frame holding the categorical columns and the target column.
        columns: names of the columns to encode.
        target: name of the target column (default 'attended').

    Returns:
        The same ``data`` frame with the new columns added.

    Raises:
        KeyError: if ``target`` is not a column of ``data``.
    """
    if target not in data.columns:
        # Fail up front with an actionable message rather than from inside
        # the groupby of the first encoded column.
        raise KeyError("Target column '" + str(target) + "' not found; cannot mean-encode")

    # NOTE(review): the means are computed on the same rows they are mapped
    # back onto - confirm this is acceptable for the downstream model.
    for col in columns:
        encoded_name = 'mean_encode_' + col
        data[encoded_name] = data[col].map(data.groupby(col)[target].mean())
        print('\nMean Encoder Summary -- '+col)
        print(data[encoded_name].describe())
    return data

##### Dummify all other categorical variables not included in responses

In [350]:
def dummify_columns(data, columns):
    """Replace the categorical ``columns`` of ``data`` with one-hot columns.

    Args:
        data: frame that contains ``columns`` and '10 Digit Card Number'.
        columns: list of column names to one-hot encode with pd.get_dummies.

    Returns:
        A new frame (fresh 0..n-1 index) with ``columns`` removed and the
        dummy columns appended; the number of rows is unchanged.

    Raises:
        KeyError: if '10 Digit Card Number' is missing (a message is printed
        first, as before).
    """
    id_column = '10 Digit Card Number'
    if id_column not in data.columns:
        print('There is no ',id_column,' column')
        # Previously the check only broke out of its loop and the function
        # then failed further down; stop here instead.
        raise KeyError(id_column)

    # Attach the dummies row-by-row (positionally) instead of merging them
    # back on the card number: a merge multiplies rows whenever a card
    # number occurs more than once.
    base = data.reset_index(drop=True)
    dummies = pd.get_dummies(base[columns])
    return pd.concat([base.drop(columns, axis=1), dummies], axis=1)



In [351]:
if __name__ == '__main__':
    # End-to-end preparation of the registration data: load, add company
    # counts, one-hot encode responses, engineer date/location features,
    # then write the cleaned frame and the list of model features.

    # type of data to be processed
    data_type = 'train'

    # The Adestra e-mail step is currently disabled:
    #   files = os.listdir(r'.\data\adestra')
    #   adestra = append_all_files(files)

    # load data
    data = load_data(data_type)

    company_reg_counts_data = company_reg_counts(data, data_type)

    data = data.merge(company_reg_counts_data,  on='10 Digit Card Number', how = 'left')
    data['company_name'] = data[['Company Name']].apply(lambda x: r''.join(x.astype(str)), axis=1).str.lower().str.replace(r' ', r'')

    print('data length is '+ str(len(data)))
    # Load the codes data and one-hot-code all responses
    codes = pd.read_excel(r'.\data\codes.xlsx')
    print(codes)
    responses = dummify_responses(data, codes)
    data = data.merge(responses, on='10 Digit Card Number', how = 'left').drop('merged', axis=1)
    data = data.fillna(0)

    # add with_website, days_to_go, weeks_to_go feature (with_email is disabled)
    data = with_website(data)
    ##data = with_email(data)
    data = data.drop(['Email', 'Website'], axis=1)
    difference = days_to_go_reg(data)
    data['days_to_go'] = difference.dt.days
    data['weeks_to_go'] = round(data['days_to_go']/7)
    data = data.drop(['Date Created', 'show_date'], axis=1)

    # Load regions data and cleanup the country
    region = pd.read_excel(r'.\data\region.xlsx')
    data, no_region = cleanup_country(data, region)

    # create the State binning feature
    data = create_state_group(data)

    # create mean encoded features for State, country, the two region levels and days_to_go
    # NOTE(review): the recorded run of this cell stopped here with
    # "KeyError: 'Column not found: attended'" - the frame had no 'attended'
    # column at this point. Confirm where the target column is expected to
    # come from before relying on this step.
    data = mean_encode(data, columns = ['State', 'country', 'region_1', 'region_2', 'days_to_go'])

    columns = ['State', 'region_1', 'region_2']
    data = dummify_columns(data, columns)
    data = data.fillna(0)

    # add distance feature using latitude and longitude
    # Reference point the distances are measured from (longitude, latitude).
    uae_lon = 53.847818
    uae_lat = 23.424076
    # Built in one pass and assigned positionally; this replaces a loop that
    # grew a Series with Series.append (quadratic, and removed in pandas 2.0)
    # and that required the index to be exactly 0..n-1.
    data['distance'] = [
        haversine(uae_lon, uae_lat, lon, lat)
        for lon, lat in zip(data['longitude'], data['latitude'])
    ]

    # below are situation-based wrangling of data -
    # removed and filtered some features that are not used anymore

    data = data.drop(['country', 'latitude', 'longitude', 'with_website', 'show', 'Country', 'company_name'], axis=1)
    data = data.drop('Company Name', axis=1)
    # keep only registrations made at least one day before the show
    data = data[data['days_to_go']>=1].reset_index(drop=True)

    data = data.rename(columns={'attended': 'target', '10 Digit Card Number':'card_number'})

    # saving the data as cleanData; the file name follows data_type so that a
    # run for another data set does not overwrite the training output
    data.to_csv('.\\data\\output\\' + data_type + '_clean_data.csv', index=False)

    # identify the features - important to have the same feature for the test Data
    columns = pd.DataFrame({'cols' : data.columns})
    # skip the first column - presumably the record identifier; verify the column order
    columns = columns[1:]
    columns = columns[columns['cols']!='target']
    columns.to_csv(r'.\data\output\columns_used_for_model.csv', index=False)
    print('\nNumber of Features -- '+ str(len(columns)))

    print('\nDone...')

data length is 4860
       show                                     question         code code_2  \
0    mese18                                     CATEGORY    mese18VIS    VIS   
1    mese18                                     CATEGORY    mese18EXH    EXH   
2    mese18                                     CATEGORY    mese18SPK    SPK   
3    mese18                                     CATEGORY    mese18MED    MED   
4    mese18                                     CATEGORY    mese18VIP    VIP   
5    mese18                                     CATEGORY    mese18SPO    SPO   
6    mese18                                     CATEGORY    mese18ORG    ORG   
7    mese18                         Nature of Business\t     mese1801     01   
8    mese18                         Nature of Business\t     mese1802     02   
9    mese18                         Nature of Business\t     mese1803     03   
10   mese18                         Nature of Business\t     mese1804     04   
11   mese18         




 Region Summary

ME GCC            4568
Europe              93
Australia-Asia      66
Africa              62
Americas            44
ME Non-GCC          26
Name: region_2, dtype: int64

Number of No Region -- 1

 State Summary

dubai                  3766
international_state     440
abu_dhabi               404
sharjah                 165
ras_al_khaimah           40
fujairah                 24
ajman                    21
Name: State, dtype: int64


KeyError: 'Column not found: attended'