# Akeed Restaurant Recommendation Challenge

## Importing libraries:

In [None]:
import time
import datetime
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import psycopg2
import geopandas as gpd
from shapely import wkt
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from shapely.geometry import Point
from copy import deepcopy, copy
import multiprocessing as mp
from pathos.multiprocessing import ProcessingPool as Pool

## Define functions

In [None]:
def currentSecondsTime():
    """Return the current Unix time, truncated to whole seconds.

    Returns:
        int: Seconds since the epoch, taken from ``time.time()``.
    """
    now = time.time()
    return int(now)


def timeTaken(startTime, endTime):
    """Return the elapsed time between two timestamps as text.

    Both arguments are taken as seconds. The result is the string form of
    a ``datetime.timedelta``, for example ``"0:01:05"``.
    """
    elapsed = datetime.timedelta(seconds=endTime - startTime)
    return str(elapsed)

def showPyMessage(message, messageType="Message"):
    """Show a progress message to the user during processing.

    The message is emitted twice: once with a timestamp prefix on the
    process-level standard output (file descriptor 1), and once as-is via
    ``print``.

    Args:
        message: Text to show; non-string values are converted with ``str``
            for the timestamped line.
        messageType: One of ``"Message"``, ``"Warning"`` or ``"Error"``.
            Any other value is ignored and nothing is shown.
    """
    if messageType not in ("Message", "Warning", "Error"):
        return

    stamped = "{} - {}\n".format(time.ctime(), message)
    try:
        # Write straight to fd 1 instead of building an `echo` shell command,
        # so the message text can never be interpreted by a shell.
        os.write(1, stamped.encode("utf-8", errors="replace"))
    except OSError:
        # Best effort only: if fd 1 is unusable, the print below still
        # reports the message.
        pass
    print(message)

## Inspecting the base layers:

### Reference Layers

In [None]:
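#Postgis Layer
#NOTE(review): the connection line below hard-codes database credentials; load them from the environment or a secrets store before re-enabling this cell.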
#con = psycopg2.connect(host="localhost", database="vulcan", port="6666", user="tebogo", password="tebogo")
#countries_sql = "SELECT * FROM public.countries WHERE cntry_name = 'Oman'"
#countries_gdf = gpd.GeoDataFrame.from_postgis(countries_sql, con, geom_col='geom')

### Base Layer 1

In [None]:
# Training locations: load the CSV, report its row count, preview the top rows.
training_locations = pd.read_csv("./train_locations.csv")
print("\n This table has " + str(training_locations.shape[0]) + " rows.")
training_locations.head()
# Optional per-column null count:
#training_locations.isnull().sum()

### Base Layer 2

In [None]:
# Training customers: load the CSV, report its row count, preview the top rows.
training_customers = pd.read_csv("./train_customers.csv")
print("\n This table has " + str(training_customers.shape[0]) + " rows.")
training_customers.head()
# Optional per-column null count:
#training_customers.isnull().sum()

### Base Layer 3

In [None]:
# Orders: load the CSV, report its row count, preview the top rows.
orders = pd.read_csv("./orders.csv")
print("\n This table has " + str(orders.shape[0]) + " rows.")
orders.head()
# Optional per-column null count:
#orders.isnull().sum()

### Base Layer 4

In [None]:
# Vendors: load the CSV, report its row count, preview the top rows.
vendors = pd.read_csv("./vendors.csv")
print("\n This table has " + str(vendors.shape[0]) + " rows.")
vendors.head()
# Optional per-column null count:
#vendors.isnull().sum()

## Pre-process tables where needed

In [None]:
# Record when preprocessing starts so its duration can be reported once the
# preprocessing cells have run.
startTime = currentSecondsTime()

### Add valuable calculated fields

In [None]:
# Add a geometry field to the training locations table.
# Each row becomes a shapely Point in (x=longitude, y=latitude) order.
geometry = [
    Point(lon, lat)
    for lon, lat in zip(training_locations.longitude, training_locations.latitude)
]
# CRS is assumed to be WGS 84. It is given as an authority string because the
# {'init': 'epsg:4326'} dict form is deprecated by pyproj/geopandas.
crs = "EPSG:4326"
training_locations_gdf = gpd.GeoDataFrame(training_locations, crs=crs, geometry=geometry)

In [None]:
# Add a geometry field to the vendors table.
# Each row becomes a shapely Point in (x=longitude, y=latitude) order.
geometry = [
    Point(lon, lat)
    for lon, lat in zip(vendors.longitude, vendors.latitude)
]
# CRS is assumed to be WGS 84. It is given as an authority string because the
# {'init': 'epsg:4326'} dict form is deprecated by pyproj/geopandas.
crs = "EPSG:4326"
vendors_gdf = gpd.GeoDataFrame(vendors, crs=crs, geometry=geometry)
# Call head() so the cell previews the first rows; the bare attribute only
# displayed the bound method.
vendors_gdf.head()

### Compose focus training table

In [None]:
# Deduce reference columns in orders table
training_table = deepcopy(orders)


def func(training_table):
    """Split the composite order key into its three reference columns.

    The "CID X LOC_NUM X VENDOR" column holds values of the form
    "<customer> X <location> X <vendor>". The pieces are written to the
    "customer_id", "location_number" and "vendor_id" columns as strings. A
    key with fewer than three pieces leaves the missing ones as NaN.

    Args:
        training_table: DataFrame containing a "CID X LOC_NUM X VENDOR"
            column. It is modified in place.

    Returns:
        The same DataFrame, with the three columns added or overwritten.
    """
    key_parts = training_table["CID X LOC_NUM X VENDOR"].str.split(" X ")
    training_table["customer_id"] = key_parts.str[0]
    training_table["location_number"] = key_parts.str[1]
    training_table["vendor_id"] = key_parts.str[2]
    return training_table


# The split is vectorised and runs in this process. Running `func` in a worker
# pool did not work: each worker modified only its own copy of a chunk and
# returned nothing, so the new columns never reached `training_table`.
training_table = func(training_table)

training_table.columns

In [None]:
# Rename the customer key to customer_id, then join locations onto customers.
# The right join keeps every customer row, including those with no location.
training_customers = training_customers.rename(columns={'akeed_customer_id': 'customer_id'})
compiled_training_customers = pd.merge(
    training_locations_gdf,
    training_customers,
    how='right',
    on='customer_id',
)
compiled_training_customers.head()

In [None]:
# Attach customer and location attributes to each order row. The left join
# keeps orders whose customer_id has no match.
# NOTE(review): the join key is customer_id only, so one order may match
# several location rows of the same customer — confirm this is intended.
training_table = pd.merge(
    training_table,
    compiled_training_customers,
    how='left',
    on='customer_id',
)
training_table.head()

In [None]:
# Merge vendors into the training table.
# Both key columns are cast to str first so the join compares like with like.
vendors = vendors.rename(columns={'id': 'vendor_id'}).astype({"vendor_id": str})
training_table["vendor_id"] = training_table["vendor_id"].astype(str)
training_table = pd.merge(
    training_table,
    vendors,
    how='left',
    on='vendor_id',
)
training_table.head()

### Deal with null values

In [None]:
#training_table = training_table.fillna(value="unknown")
#training_table.head()

### Deal with mixed data-type fields in training table

In [None]:
# Find mixed data type columns.
# NOTE: this check is disabled by wrapping it in a string literal. If run, it
# would print every column in which some value's Python type differs from the
# type of that column's first value.
'''for col in training_table.columns:
    weird = (training_table[[col]].applymap(type) != training_table[[col]].iloc[0].apply(type)).any(axis=1)
    if len(training_table[weird]) > 0:
        print(col)'''

### Encode nominal fields in training table

In [None]:
# Location type column: one-hot encode it and append the indicator columns to
# the training table, matching rows on the index.
dummy = pd.get_dummies(training_table["location_type"])
encoded_training_table = training_table.merge(
    dummy,
    left_index=True,
    right_index=True,
)
encoded_training_table.head()

In [None]:
# Gender column: upper-case and strip the labels before encoding so variants
# of the same label share a single indicator column.
encoded_training_table["gender"] = encoded_training_table["gender"].str.upper().str.strip()
dummy = pd.get_dummies(encoded_training_table["gender"])
encoded_training_table = encoded_training_table.merge(
    dummy,
    left_index=True,
    right_index=True,
)
encoded_training_table.head()

### Feature-scale field values in the training table, if needed

In [None]:
#sc = StandardScaler()
#scaled_X = sc.fit_transform(X)

### Separate the vendor_id column of the training table as the y variable then encode it

In [None]:
# Separating training table into x and y (y being vendor_id)

In [None]:
# Encoding the y set

In [None]:
# Preprocessing is finished: capture the end time and report the duration.
endTime = currentSecondsTime()
showPyMessage(
    " -- Preprocessing done. Took {}".format(
        timeTaken(startTime=startTime, endTime=endTime)
    )
)

## Exploring Relationships using plots

### Exploration 1

In [None]:
# Spatial relations
# NOTE(review): a map is only drawn if encoded_training_table is a
# GeoDataFrame; a plain pandas DataFrame would chart its numeric columns
# instead — confirm its type after the merges above.

ax = encoded_training_table.plot(figsize=(18, 12), color='grey', legend=True)
#countries_gdf.plot(ax=ax, edgecolor = 'grey', facecolor = 'none')
#ax.set(xlim=(15, 35), ylim=(-37.5, -20))
ax.set_axis_off()  # hide axis ticks and frame
plt.title(label='Customers & Vendors Locations', fontweight='bold')
plt.tight_layout()

### Exploration 2

In [None]:
# Linear Relations



## Prediction Modelling