In [1]:
import pandas as pd
import numpy  as np
import time   # To calculate the execution time
import bisect # For the WeightedRandomGenerator
import random # For the generation of random numbers
from numpy.random import choice # For generation of ordered list based on weight
from collections import defaultdict  # For dictionary that memorize the cumulative probability of import/expor
import copy # For function deepcopy
import math # For function floor that convert float numbers in integers

# The product names were extracted from a food dataset and saved separately, because the
# original dataset was huge (1GB).
product_list = pd.read_csv('categories.csv', delimiter = ',', encoding = 'utf-8')

# Drop a few rows that contain strange strings
product_list = product_list.drop(index = [123, 157, 440, 442])

# Keep only the product name on a fresh consecutive index (the drop above leaves gaps in it),
# then attach a random popularity score to every product
final_product_list               = product_list['Name'].to_frame().reset_index(drop = True)
final_product_list['Popularity'] = np.random.random_sample(len(final_product_list))

In [2]:
# Start every product with its own empty list of derivatives
final_product_list["DerivativeFrom"] = [[] for _ in range(len(final_product_list))]

In [3]:
# minDerivative / maxDerivative bound how many products are drawn directly for each product
def derivativeGeneration(products, end, start = 0, minDerivative = 0, maxDerivative = 2):
    """Draw a random selection of products and add what the selected products derive from.

    Parameters:
        products: list of product names; position k is matched with row k of
            final_product_list.
        end: exclusive upper bound of the positions inspected for inherited derivatives.
        start: first position inspected (default 0).
        minDerivative, maxDerivative: inclusive bounds of the number of products drawn.

    Returns:
        A list of product names: the random draw, extended with the 'DerivativeFrom'
        entries of every drawn product located in positions start..end-1.
    """
    howMany      = random.randint(minDerivative, maxDerivative)
    rndSelection = random.sample(products, howMany)

    # First collect what must be inherited, looking only at the initial draw ...
    # NOTE(review): a single propagation pass is performed, although the previous comment
    # spoke of two consecutive loops - confirm which one is intended.
    inherited = []
    for position in range(start, end):
        if products[position] in rndSelection:
            inherited.append(final_product_list.loc[position, 'DerivativeFrom'])

    # ... then merge it in, removing duplicates after every merge
    for parents in inherited:
        rndSelection.extend(parents)
        rndSelection = list(set(rndSelection))
    return rndSelection

In [4]:
# Generate all the derivatives, one product at a time in index order.
# The list of names never changes inside the loop, so it is built once instead of on every iteration.
productNames = list(product_list["Name"])
derivatives  = np.empty((len(final_product_list), 0)).tolist()
for i in range(0, len(final_product_list)):
    derivatives[i] = derivativeGeneration(productNames, i)
    # Written back on every iteration on purpose: derivativeGeneration reads the
    # 'DerivativeFrom' column of the rows before i to inherit their derivatives.
    final_product_list['DerivativeFrom'] = derivatives

In [5]:
cities = pd.read_csv('world-cities.csv', delimiter = ',', encoding = 'utf-8')

# Set the number of cities to 100
nCities = 100
cities  = cities.sample(n = nCities)

# Use a manually created list of Northern Italy cities.
# NOTE(review): this replaces the frame sampled just above, so the sampled world cities are
# never used afterwards - confirm whether the CSV sampling is still needed.
cities = pd.DataFrame({'name': ['Milano', 'Bergamo', "Brescia", "Verona", "Padova", "Treviso", "Venezia",
                                "Trento", "Bolzano", "Bologna", "Parma", "Mantova", "Rovigo", "Modena",
                                "Ravenna"]})

# Index reset due to problems with non-consecutive indexes
cities = cities.reset_index(drop = True)

# One-column frame of city names; the per-city columns are added to it later
city_list = cities['name'].to_frame(name = 'Name')

In [96]:
# Build, for every city, an export probability vector over all products: the product popularity
# multiplied by a per-city random coefficient drawn from [0, 1)
rand = [np.random.random_sample(len(final_product_list)) for i in range(0,len(city_list))]
popularity = final_product_list.Popularity
name = final_product_list.Name

# Raw export priority per city, then normalised so that each city's vector sums to 1
priorityExp = [popularity * rand[i] for i in range(0,len(city_list))]
probabilityExport = [p / sum(p) for p in priorityExp]

# Draw all the product names without replacement, weighted by the export probability, to obtain one
# ordering per city; each product then gets a coefficient that decreases linearly with its position
# (1 for the first product, down to 1 / len(l) for the last one)
productsOrderExp = [choice(name, len(final_product_list), p = prob, replace = False)
                 for prob in probabilityExport]
city_list["ImportanceExport"] = [[(l[i], (1 - i / len(l))) for i in range(0,len(l))] for l in productsOrderExp]

# Collect derivatives for each product: for every city and every (product, coefficient) pair keep the
# coefficient together with the 'DerivativeFrom' entries of the rows whose Name equals that product
possibleDerivatives = [[(importance[1],
  final_product_list.loc[final_product_list.Name == importance[0], 'DerivativeFrom']) for importance in city]
                       for city in city_list.ImportanceExport]

# For every product listed in those 'DerivativeFrom' entries emit a (value, product) pair, where value is
# the exporting product's coefficient times the squared Popularity of the listed product.
# NOTE(review): the .loc selection with a boolean mask yields a pandas Series, so value is a Series
# rather than a scalar.
coefficients =   [[(elements[0] * (final_product_list.loc[final_product_list.Name == el, 'Popularity']**2), el)
                 for elements in city
                 for elem in  elements[1]
                 for el in elem] 
                 for city in possibleDerivatives]
# Sum the values per listed product, separately for each city
probabilityIncrease = []
for city in coefficients:
    prob = defaultdict(int)
    for value, key in city:
        prob[key] += value
    probabilityIncrease.append(prob)
# Import priority per city and product: the summed increase (0 through the defaultdict when the product
# was never listed) plus the complement of the random coefficient used for the export priority
priorityImp = [[probabilityIncrease[i][final_product_list.Name[n]] + (1 - rand[i][n]) for n in range(0,len(final_product_list))] for i in range(0,len(city_list))]
# Convert every priority to a plain float.
# NOTE(review): presumably relies on each product name matching exactly one row, so that float()
# receives a single-element selection - verify that the names are unique.
priorityImpCleaned = []
for p in priorityImp:
    tmp = []
    for val in p:
        tmp.append(float(val))
    priorityImpCleaned.append(tmp)
# Normalise so that each city's import vector sums to 1
probabilityImport = [[val / sum(p) for val in p] for p in priorityImpCleaned]

# Same ordering and linear coefficient scheme as on the export side, driven by the import probabilities
productsOrderImp = [choice(name, len(final_product_list), p = prob, replace = False)
                 for prob in probabilityImport]
city_list["ImportanceImport"] = [[(l[i], (1 - i / len(l))) for i in range(0,len(l))] for l in productsOrderImp]

In [107]:
def standard_info(i, rid, tcd, trip, final_trip):
    """Write the identifying fields of leg i into final_trip (modified in place).

    Parameters:
        i: row label of the leg; also stored as its 'Order'.
        rid: value stored in 'RouteId'.
        tcd: value stored in 'TruckCode'.
        trip: frame whose 'IdCity' at rows i and i + 1 gives the start and end city.
        final_trip: frame receiving the values at row i.
    """
    # Fields that come straight from the arguments
    for column, value in (('RouteId', rid), ('Order', i), ('TruckCode', tcd)):
        final_trip.loc[i, column] = value
    # The leg goes from the city at row i to the city at the following row
    for column, offset in (('StartCity', 0), ('EndCity', 1)):
        final_trip.loc[i, column] = trip.loc[i + offset, 'IdCity']

def first_step_creation(rid, tcd, length, trip):
    """Create the result frame of a route and choose the items carried on its last leg.

    Row 0 receives the standard leg information; the 'Items' cell of row length - 2
    receives a random set of products. The candidates are the products present in both
    'ProductImportance' and 'PopularImport' of trip row length - 2, weighted by the
    product of the two coefficients.

    Parameters:
        rid: route id stored in the frame.
        tcd: truck code stored in the frame.
        length: number of cities in the trip.
        trip: frame with the columns 'IdCity', 'ProductImportance' and 'PopularImport'.

    Returns:
        The newly created frame with the columns
        'RouteId', 'Order', 'TruckCode', 'StartCity', 'EndCity', 'Items'.
    """
    final_trip   = pd.DataFrame(columns = ['RouteId', 'Order', 'TruckCode', 'StartCity', 'EndCity', 'Items'])
    standard_info(0, rid, tcd, trip, final_trip)

    # Index the import coefficients by product name once, so that matching the two lists is a
    # dictionary lookup instead of a nested scan; result order and multiplicity are unchanged.
    importWeights = {}
    for product, weight in trip.loc[length - 2, "PopularImport"]:
        importWeights.setdefault(product, []).append(weight)
    itemsToImport = [(product, importance * weight)
                     for product, importance in trip.loc[length - 2, "ProductImportance"]
                     for weight in importWeights.get(product, ())]

    tot        = sum(value for _, value in itemsToImport)
    itemList   = [product for product, _ in itemsToImport]
    itemValues = [value / tot for _, value in itemsToImport]

    # Between 5 and 14 items are drawn, but never more than the available candidates:
    # sampling without replacement fails when asked for more items than exist.
    numItems      = min(np.random.randint(5, 15), len(itemList))
    exportedItems = choice(itemList, numItems, p = itemValues, replace = False)
    final_trip.loc[length - 2, 'Items'] = set(exportedItems)
    return final_trip
    
def trip_creation(i, rid, tcd, length, trip, final_trip):
    """Fill one more leg of the route, working backwards from the end of the trip.

    Row i - 1 of final_trip receives the standard leg information. The 'Items' of row
    crev = length - i - 1 are built from the items of row crev + 1: products are removed at
    random according to the export coefficients of the city at trip row crev + 1, then a
    random set of new products is added.

    Parameters:
        i: step counter of the route.
        rid: route id stored in the frame.
        tcd: truck code stored in the frame.
        length: number of cities in the trip.
        trip: frame with the columns 'IdCity', 'ProductImportance' and 'PopularImport'.
        final_trip: frame being filled; modified in place.

    Returns:
        final_trip, the same frame that was passed in.
    """
    standard_info(i - 1, rid, tcd, trip, final_trip)
    crev = length - i - 1

    # Export coefficients of the city reached at the end of this leg
    cityPosition   = list(city_list.Name).index(trip.loc[crev + 1, 'IdCity'])
    exportPriority = city_list.loc[cityPosition, 'ImportanceExport']

    # Each product is removed with a probability equal to its export coefficient
    removeItems     = [expPriority[0] for expPriority in exportPriority if random.random() < expPriority[1]]
    possibleExports = set(final_trip.loc[crev + 1, 'Items']).difference(removeItems)

    # Number of new exported items (3 to 14), clamped below once the candidates are known
    numItems = np.random.randint(3, 15)

    # NOTE(review): the candidates are always read from trip row length - 2, whatever crev is -
    # confirm whether row crev was intended here.
    # Import coefficients are indexed by name once, so that matching the two lists is a dictionary
    # lookup instead of a nested scan; result order and multiplicity are unchanged.
    importWeights = {}
    for product, weight in trip.loc[length - 2, "PopularImport"]:
        importWeights.setdefault(product, []).append(weight)
    itemsToImport = [(product, importance * weight)
                     for product, importance in trip.loc[length - 2, "ProductImportance"]
                     for weight in importWeights.get(product, ())]

    tot        = sum(value for _, value in itemsToImport)
    itemList   = [product for product, _ in itemsToImport]
    itemValues = [value / tot for _, value in itemsToImport]

    # Sampling without replacement fails when asked for more items than exist
    numItems      = min(numItems, len(itemList))
    exportedItems = choice(itemList, numItems, p = itemValues, replace = False)
    final_trip.loc[crev, 'Items'] = possibleExports.union(exportedItems)
    return final_trip

In [113]:
# Trip lengths are drawn with np.random.randint(minLen, maxLen), so maxLen itself is excluded;
# truck codes are drawn from 0 to numTruck - 1
minLen, maxLen, numTruck = 3, 10, 50

def generate_trip(i):
    """Build one random route and return its legs as a DataFrame sorted by 'Order'.

    A trip visits `length` distinct cities of city_list, chosen at random. For every stop
    the trip frame stores the city name, the export coefficients accumulated over all the
    cities visited so far ('ProductImportance') and the import coefficients of the city
    itself ('PopularImport'). The legs are then filled by first_step_creation and
    trip_creation.

    Parameters:
        i: used as the route id.
    """
    length = np.random.randint(minLen, maxLen)
    trip   = pd.DataFrame(columns = ['IdCity', 'ProductImportance', 'PopularImport'])
    cumulativeExports = defaultdict(int)
    visited = []
    for stop in range(length):
        # Redraw until a city not yet on this trip comes up
        cityIdx = np.random.randint(0, len(city_list))
        while cityIdx in visited:
            cityIdx = np.random.randint(0, len(city_list))
        visited.append(cityIdx)

        trip.loc[stop, 'IdCity'] = city_list.loc[cityIdx, 'Name']
        for product, coefficient in city_list.loc[cityIdx, 'ImportanceExport']:
            cumulativeExports[product] += coefficient
        trip.loc[stop, 'ProductImportance'] = list(cumulativeExports.items())
        trip.loc[stop, 'PopularImport']     = city_list.loc[cityIdx, 'ImportanceImport']

    tcd        = np.random.randint(0, numTruck)
    final_trip = first_step_creation(i, tcd, length, trip)
    for step in range(2, length):
        final_trip = trip_creation(step, i, tcd, length, trip, final_trip)
    return final_trip.sort_values('Order')

In [114]:
# Generate n trips
def generate_trips(n):
    """Return a list with the results of generate_trip(0) ... generate_trip(n - 1)."""
    trips = []
    for routeId in range(n):
        trips.append(generate_trip(routeId))
    return trips

In [115]:
# Number of routes to generate
numberOfRoutes = 150
routes         = generate_trips(n = numberOfRoutes)

In [116]:
# Display the generated routes (the list returned by generate_trips)
routes

[  RouteId Order TruckCode StartCity  EndCity  \
 0       0     0        16   Bolzano   Rovigo   
 1       0     1        16    Rovigo   Verona   
 2       0     2        16    Verona    Parma   
 3       0     3        16     Parma  Ravenna   
 4       0     4        16   Ravenna  Venezia   
 
                                                Items  
 0  {Sardines-in-brine, Cauliflower-cheese, Mayona...  
 1  {Kombucha, Mango-drink, Mayonaise, Bouillonter...  
 2  {Ale, Burrito-kit, Mayonaise, Dindes, Senate-b...  
 3  {Miels, Dolminades, Ale, Yaourts-aux-fruits, C...  
 4  {Fuet-met-zwarte-peper, Kase, Beef-broth, Yaou...  ,
   RouteId Order TruckCode StartCity  EndCity  \
 0       1     0        30   Bolzano  Ravenna   
 1       1     1        30   Ravenna   Rovigo   
 2       1     2        30    Rovigo    Parma   
 3       1     3        30     Parma  Bergamo   
 4       1     4        30   Bergamo  Venezia   
 5       1     5        30   Venezia   Verona   
 
                      

In [117]:
# Stack the legs of all the routes into a single frame
all_trips = pd.concat(objs = routes)

In [118]:
# Save the trips for the mining part
all_trips.to_csv(path_or_buf = "route_trips.csv")