In [1]:
import pandas as pd
import numpy as np
import random

from tqdm.notebook import tqdm

In [2]:
from geopy.distance import geodesic

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import chardet

In [5]:
# Load the London postcode reference table from the working directory.
df = pd.read_csv("London postcodes.csv", sep=',')

(327525, 53)

In [6]:
# Show the (rows, columns) dimensions of the postcode table.
df.shape

(314746, 29)

In [7]:
# Preview the first rows of the postcode table.
df.head()

Unnamed: 0,Postcode,In Use?,Latitude,Longitude,Easting,Northing,GridRef,County,District,Ward,...,Population,Households,Built up area,Built up sub-division,Lower layer super output area,Rural/urban,Region,Altitude,London zone,LSOA Code
0,BR1 1AA,Yes,51.401546,0.015415,540291,168873,TQ402688,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 018B,Urban major conurbation,London,71,5.0,E01000675
1,BR1 1AB,Yes,51.406333,0.015208,540262,169405,TQ402694,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 008B,Urban major conurbation,London,71,4.0,E01000676
2,BR1 1AD,Yes,51.400057,0.016715,540386,168710,TQ403687,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 018B,Urban major conurbation,London,53,5.0,E01000675
3,BR1 1AE,Yes,51.404543,0.014195,540197,169204,TQ401692,Greater London,Bromley,Bromley Town,...,34.0,21.0,Greater London,Bromley,Bromley 018C,Urban major conurbation,London,71,4.0,E01000677
4,BR1 1AF,Yes,51.401392,0.014948,540259,168855,TQ402688,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 018B,Urban major conurbation,London,58,5.0,E01000675


In [8]:
# List the column labels available in the postcode table.
df.columns

Index(['Postcode', 'In Use?', 'Latitude', 'Longitude', 'Easting', 'Northing',
       'GridRef', 'County', 'District', 'Ward', 'DistrictCode', 'WardCode',
       'Country', 'CountyCode', 'Constituency', 'Introduced', 'Terminated',
       'Parish', 'NationalPark', 'Population', 'Households', 'Built up area',
       'Built up sub-division', 'Lower layer super output area', 'Rural/urban',
       'Region', 'Altitude', 'London zone', 'LSOA Code'],
      dtype='object')

In [9]:
# Schema of the synthetic rides table, one line per topic. The column names
# (including the 'longtitude' spelling) are kept verbatim because later cells
# and the exported CSV refer to them.
RIDE_COLUMNS = [
    'driver_id', 'client_id',
    'start', 'start_latitude', 'start_longtitude',
    'finish', 'finish_latitude', 'finish_longtitude',
    'distance', 'road_time', 'start_time', 'finish_time', 'cost',
    'driver_rate', 'category_driver_feedback', 'text_driver_feedback',
    'client_rate', 'category_client_feedback', 'text_client_feedback',
]
rides = pd.DataFrame(columns=RIDE_COLUMNS)

# Number of synthetic rides to generate.
NUM_RIDES = 500000

# Seed both random sources used by later cells (NumPy's global generator and
# the stdlib `random` module) so that Restart & Run All reproduces the same
# dataset. Change SEED to obtain a different draw.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

Driver and client IDs

In [10]:
# Assign every ride a driver id in [0, 2500) and a client id in [0, 4500),
# both drawn uniformly at random.
driver_ids = np.random.randint(0, 2500, NUM_RIDES)
client_ids = np.random.randint(0, 4500, NUM_RIDES)
rides['driver_id'] = driver_ids
rides['client_id'] = client_ids

Start and finish points

In [11]:
# Draw a random pickup postcode (with its coordinates) for every ride. The
# sample's index is reset so its rows align with the rides index on assignment.
start_points = df[['Postcode', 'Latitude', 'Longitude']].sample(n=NUM_RIDES, replace=True)
start_points = start_points.reset_index(drop=True)
rides[['start', 'start_latitude', 'start_longtitude']] = start_points

In [12]:
# Draw a random drop-off postcode (with its coordinates) for every ride. The
# sample's index is reset so its rows align with the rides index on assignment.
finish_points = df[['Postcode', 'Latitude', 'Longitude']].sample(n=NUM_RIDES, replace=True)
finish_points = finish_points.reset_index(drop=True)
rides[['finish', 'finish_latitude', 'finish_longtitude']] = finish_points

Start time

In [13]:
def random_dates(start, end, n=10):
    """Draw ``n`` random timestamps from ``[start, end)`` at one-second resolution.

    Parameters
    ----------
    start, end : pandas.Timestamp
        Bounds of the sampling window; ``end`` itself is never returned.
    n : int, default 10
        Number of timestamps to generate.

    Returns
    -------
    pandas.DatetimeIndex
        ``n`` timestamps, unsorted and possibly repeated.

    Raises
    ------
    ValueError
        Raised by ``numpy.random.randint`` when the window spans less than
        one whole second (``start`` >= ``end`` after truncation).
    """
    # ``Timestamp.value`` is nanoseconds since the Unix epoch; reduce to whole seconds.
    start_u = start.value // 10**9
    end_u = end.value // 10**9
    # Request int64 explicitly: where NumPy's default integer is 32-bit, epoch
    # seconds beyond 2038 would be rejected as out of bounds.
    seconds = np.random.randint(start_u, end_u, n, dtype=np.int64)
    return pd.to_datetime(seconds, unit='s')

# Ride start moments are drawn from the window 2010-01-01 .. 2020-01-01.
start, end = pd.to_datetime('2010-01-01'), pd.to_datetime('2020-01-01')
rides['start_time'] = random_dates(start=start, end=end, n=NUM_RIDES)

Distance between start and finish points

In [14]:
# Geodesic distance in kilometres between pickup and drop-off, computed row by
# row with geopy and then rounded to two decimals.
start_coords = zip(rides['start_latitude'], rides['start_longtitude'])
finish_coords = zip(rides['finish_latitude'], rides['finish_longtitude'])
ride_distances = [
    geodesic(origin, destination).km
    for origin, destination in tqdm(zip(start_coords, finish_coords), total=NUM_RIDES)
]
rides['distance'] = ride_distances
rides['distance'] = rides['distance'].round(2)

  0%|          | 0/500000 [00:00<?, ?it/s]

Calculate road time

In [15]:
# Trip duration in whole minutes: a non-negative random overhead, |N(0, 10)|,
# plus the distance scaled by a per-ride factor drawn from |N(1, 0.25)|.
overhead_minutes = np.abs(np.random.normal(scale=10, size=NUM_RIDES))
pace_factor = np.abs(np.random.normal(loc=1, scale=0.25, size=NUM_RIDES))
whole_minutes = (overhead_minutes + rides['distance'] * pace_factor).astype('int')
rides['road_time'] = pd.to_timedelta(whole_minutes, unit='m')

Calculate finish time

In [16]:
# A ride ends when its duration has elapsed after the start time.
rides['finish_time'] = rides['start_time'].add(rides['road_time'])

Calculate cost of the ride

In [17]:
def count_cost(start_time, distance):
    """Price a ride from its start time and distance.

    The base fare is ``2 + 0.5 * distance``. Rides starting in the hours
    8-9 or 18-19 are multiplied by 1.5; rides starting at hour 22 or later,
    or at hour 6 or earlier, are multiplied by 1.3.

    Parameters
    ----------
    start_time : object with an ``hour`` attribute (e.g. a timestamp)
    distance : number

    Returns
    -------
    The unrounded fare.
    """
    hour = start_time.hour
    fare = 2 + 0.5 * distance

    rush_hour = 8 <= hour <= 9 or 18 <= hour <= 19
    night_hour = hour >= 22 or hour <= 6

    if rush_hour:
        fare = fare * 1.5
    if night_hour:
        fare = fare * 1.3
    return fare
    
# Price every ride from its start time and distance, then round to two decimals.
ride_costs = [
    count_cost(started_at, km)
    for started_at, km in tqdm(zip(rides['start_time'], rides['distance']), total=NUM_RIDES)
]
rides['cost'] = ride_costs
rides['cost'] = rides['cost'].round(2)

  0%|          | 0/500000 [00:00<?, ?it/s]

Driver ratings

In [18]:
# 30% of the rides receive a driver rating.
num_driver_rates = int(NUM_RIDES*0.3)

# Pick the rated rides WITHOUT replacement: drawing positions with randint
# repeats some of them, so fewer than 30% of rides would end up rated.
driver_rate_idx = np.random.choice(NUM_RIDES, size=num_driver_rates, replace=False)

# One one-hot row per rated ride; the position of the 1 (plus one) is the
# rating 1..5, drawn with probabilities 0.2 / 0.05 / 0.1 / 0.25 / 0.4.
driver_rate_distribution_arr = np.random.multinomial(1, [0.2, 0.05, 0.1, 0.25, 0.4], size=num_driver_rates)

# Write through .loc: the chained form rides['driver_rate'][idx] = ... assigns
# into a temporary object and does not update `rides` under pandas Copy-on-Write.
rides.loc[driver_rate_idx, 'driver_rate'] = np.where(driver_rate_distribution_arr == 1)[1] + 1

In [19]:
# Category labels used for driver feedback: the positive labels first, then
# their negative counterparts in the same order.
driver_feedback_categories_good = ['great service', 'nice car', 'wonderful companion', 'neat and tidy', 'expert navigation', 'recommend']
driver_feedback_categories_bad = ['awful service', 'bad car', 'unpleasant companion', 'dirty', 'non-expert navigation', 'not recommend']

driver_feedback_bad = [
  "Driver took a longer route, increasing the fare unnecessarily.",
  "Cab had a strange odor; it was extremely unpleasant during the entire ride.",
  "Rude and unfriendly driver, made the journey uncomfortable.",
  "The taxi was dirty, and the seats were stained. Hygiene is a serious concern.",
  "Driver was constantly on the phone, not paying attention to the road.",
  "Waited for the taxi for an extended period; the tardiness was frustrating.",
  "The vehicle had a strange noise, making the ride unsettling and loud.",
  "Driver didn't follow GPS directions and got lost multiple times.",
  "Taxi interior was outdated and in poor condition; it felt unsafe.",
  "The driver was driving recklessly, putting passengers at risk.",
  "Unpleasant experience with the driver's inappropriate conversation.",
  "Cab had a malfunctioning air conditioner; it was stifling inside.",
  "Driver refused to take a credit card, even though it's a listed payment option.",
  "Taxi was not properly cleaned; there were crumbs and trash everywhere.",
  "Unexplained additional charges on the fare; unclear billing practices.",
  "Driver was playing loud music without considering passenger preferences.",
  "The taxi smelled strongly of cigarette smoke, making the ride unbearable.",
  "Driver didn't assist with luggage, despite it being part of the service.",
  "Taxi had a flat tire, causing unnecessary delays and inconvenience.",
  "Driver had poor knowledge of the city and took a longer route.",
  "The vehicle's brakes were squeaking, creating a worrisome atmosphere.",
  "Driver was talking on the phone loudly in a language passengers couldn't understand.",
  "Cab was unclean, and the windows were so dirty it was hard to see outside.",
  "Driver didn't respect traffic rules, making the journey nerve-wracking.",
  "Taxi had a strong, unpleasant odor that lingered throughout the ride.",
  "Driver was speeding excessively, making passengers feel unsafe.",
  "Cab had a malfunctioning meter, leading to disputes over the fare.",
  "The driver was texting while driving, endangering everyone in the vehicle.",
  "Unexplained detours that extended the journey unnecessarily.",
  "Taxi had a broken seatbelt, posing a safety risk to passengers.",
  "Driver was visibly fatigued, compromising the safety of the ride.",
  "Taxi arrived late, causing me to miss an important appointment.",
  "Driver was talking on the phone the entire time, ignoring my requests for silence.",
  "Uncomfortable seating in the cab; felt like riding on a wooden bench.",
  "The taxi had a strange vibration, making the ride nauseating.",
  "Driver didn't use turn signals, making the journey feel unsafe.",
  "Cab had a strange rattling noise that persisted throughout the entire trip.",
  "Driver didn't provide a receipt upon request; unclear fare breakdown.",
  "Taxi was excessively hot; the air conditioning wasn't working properly.",
  "The driver overcharged me, and when questioned, became confrontational.",
  "Unprofessional attire of the driver; it didn't instill confidence.",
  "Taxi was in poor mechanical condition; it stalled multiple times.",
  "Driver took a longer route, claiming it was to avoid traffic, but it felt like a scam.",
  "Cab had a leak during the rain, causing discomfort for passengers.",
  "Driver was speeding and weaving through traffic recklessly.",
  "Taxi had an odd smell of cleaning chemicals, making the ride unpleasant.",
  "The driver was rude and argumentative, creating a hostile atmosphere.",
  "Cab was visibly dirty, and the dashboard was covered in grime.",
  "Driver didn't assist with luggage and seemed annoyed when asked.",
  "Taxi had a malfunctioning radio that emitted a loud, irritating noise.",
  "The driver was constantly checking their phone for messages while driving.",
  "Cab had a broken side mirror, posing a safety hazard.",
  "Driver didn't apologize for getting lost and blamed it on the GPS.",
  "Taxi had a malfunctioning meter, and the fare was significantly higher than expected.",
  "The driver didn't follow traffic rules, making the ride chaotic.",
  "Cab was missing a headrest, making it uncomfortable for passengers.",
  "Driver was driving aggressively, honking excessively at other vehicles.",
  "Taxi had a strange vibration that made it difficult to hold a conversation.",
  "The driver was talking loudly on a personal call, ignoring passenger discomfort.",
  "Cab had a lingering unpleasant smell from a previous passenger.",
  "Driver took a circuitous route, unnecessarily extending the journey."
]
driver_feedback_good = [
  "Arrived on time, excellent service!",
  "Polite and friendly driver, made the journey enjoyable.",
  "Clean and well-maintained taxi, a comfortable ride.",
  "Driver was helpful with luggage, great customer service.",
  "Prompt arrival, I appreciate the efficiency.",
  "Smooth ride, the driver navigated traffic well.",
  "Impressed with the cleanliness of the taxi interior.",
  "Safe and secure driving, felt comfortable throughout.",
  "Driver was knowledgeable about the best routes.",
  "Excellent communication from the driver.",
  "Efficient service, got me to my destination quickly.",
  "Comfortable seating and a smooth ride.",
  "Courteous driver, made the journey pleasant.",
  "Vehicle in good condition, no issues during the trip.",
  "Professionalism displayed by the driver was commendable.",
  "Fair pricing, great value for the service provided.",
  "The driver was patient in heavy traffic, appreciated that.",
  "Clean exterior, well-presented taxi.",
  "Quick response time, didn't have to wait long for the taxi.",
  "Driver took a scenic route, enjoyed the view.",
  "Well-maintained taxi, felt safe throughout the journey.",
  "Driver was attentive to my needs, excellent service.",
  "Smooth and hassle-free booking process.",
  "Impressed with the cleanliness and hygiene measures.",
  "The driver was polite and respectful.",
  "Great conversation with the driver, made the ride enjoyable.",
  "Quick and easy payment process.",
  "Safe driving in adverse weather conditions, appreciated the caution.",
  "Arrived ahead of schedule, efficient service.",
  "Clean and comfortable seating, a relaxing journey.",
  "Driver provided useful local tips, appreciated the guidance.",
  "Punctual service, reliable for early morning travel.",
  "Efficient drop-off at the exact destination.",
  "Driver was courteous and accommodating.",
  "The taxi was well-equipped with modern amenities.",
  "Driver was proactive in avoiding traffic, great navigation.",
  "Smooth ride, no sudden stops or jerks.",
  "Excellent service from booking to drop-off.",
  "Driver was friendly and made the ride enjoyable.",
  "Clean and well-ventilated interior, felt fresh.",
  "Fair and transparent pricing, no hidden fees.",
  "Driver was patient and understanding.",
  "Comfortable temperature inside the taxi, adjusted as per request.",
  "Prompt pickup, no delays in reaching the destination.",
  "Efficient route taken, avoided unnecessary detours.",
  "Driver provided a smooth and enjoyable ride.",
  "Clean exterior, well-maintained appearance.",
  "Safety measures followed, felt secure throughout the journey.",
  "Driver was attentive to traffic updates, ensuring a smooth ride.",
  "Timely communication from the driver.",
  "The taxi had ample space for luggage.",
  "Driver was knowledgeable about local attractions.",
  "Seamless and quick drop-off process.",
  "Clean and tidy taxi interior, well-organized.",
  "Driver maintained a comfortable driving speed.",
  "Efficient and friendly customer support.",
  "The driver was well-dressed and presented a professional image.",
  "Smooth pick-up process, no waiting time.",
  "Driver was courteous and respectful throughout the journey.",
  "Clean and well-maintained exterior of the taxi.",
  "Pleasant aroma inside the taxi, a nice touch.",
  "Driver followed the best routes for minimal traffic.",
  "On-time arrival, appreciated the punctuality.",
  "Comfortable seating arrangement, suitable for long rides.",
  "Driver displayed excellent driving skills.",
  "Quick and easy payment options available.",
  "Driver took extra care in ensuring a smooth ride.",
  "Clean and hygienic taxi interior, felt fresh and sanitized.",
  "Driver went the extra mile to assist with luggage.",
  "Efficient and reliable service, met expectations.",
  "Smooth and well-coordinated communication with the driver.",
  "Comfortable temperature maintained inside the taxi.",
  "Driver was polite and greeted me warmly.",
  "Clean and well-maintained exterior, a professional appearance.",
  "Efficient navigation through busy city streets.",
  "Driver was well-versed in providing information about the city.",
  "Prompt response to booking inquiries.",
  "The taxi was equipped with a reliable GPS system.",
  "Driver adhered to all traffic rules, felt safe.",
  "Courteous and friendly interactions with the driver.",
  "Clean and well-groomed appearance of the driver.",
  "Efficient and organized pick-up process.",
  "Driver provided a smooth and enjoyable ride experience.",
  "Well-lit interior, felt safe during night travel.",
  "Punctual arrival, no unnecessary waiting time.",
  "Driver ensured a quiet and peaceful atmosphere in the taxi.",
  "Clean and well-maintained interior, a pleasant journey.",
  "Driver was proactive in handling unexpected road closures.",
  "Efficient and timely drop-off at the desired location.",
  "Smooth ride, no abrupt stops or starts.",
  "Driver displayed excellent knowledge of alternative routes.",
  "Courteous and professional behavior from the driver.",
  "Clean and well-ventilated interior, a comfortable journey.",
  "Driver was patient and accommodating to specific requests.",
  "Prompt and reliable service, met expectations.",
  "Efficient handling of luggage, appreciated the assistance.",
  "Smooth and well-coordinated communication with customer support.",
  "Comfortable seating arrangement, suitable for multiple passengers.",
  "Driver provided helpful recommendations for local attractions.",
  "Clean and well-maintained exterior, a positive first impression.",
  "Efficient navigation through challenging traffic conditions.",
  "Driver displayed a high level of customer service.",
  "Prompt confirmation of booking and arrival details.",
  "The taxi was equipped with comfortable amenities.",
  "Driver adhered to a professional and safe driving style.",
  "Courteous and respectful interactions with the driver.",
  "Clean and well-organized interior, a pleasant travel experience.",
  "Efficient pick-up process, no delays in reaching the location.",
  "Smooth ride, no disturbances throughout the journey.",
  "Driver was attentive to passenger preferences, a personalized experience."
]

In [20]:
# Number of rides that get a positive (and, separately, a negative) feedback category.
num_category_feedback = int(NUM_RIDES*0.3*0.2)

# Positive categories go to rides whose driver rating is above 3. Rides are
# picked without replacement so no ride is selected twice; np.random.choice
# raises ValueError if fewer rides qualify than are requested.
good_driver_rate_rides = rides[rides.driver_rate > 3].index
category_driver_good_feedback_idx = np.random.choice(good_driver_rate_rides, size=num_category_feedback, replace=False)
# .loc writes into `rides` itself; the chained rides[col][idx] = ... form does
# not update the frame under pandas Copy-on-Write.
rides.loc[category_driver_good_feedback_idx, "category_driver_feedback"] = np.random.choice(driver_feedback_categories_good, size=num_category_feedback)

# Negative categories go to rides whose driver rating is below 4.
bad_driver_rate_rides = rides[rides.driver_rate < 4].index
category_driver_bad_feedback_idx = np.random.choice(bad_driver_rate_rides, size=num_category_feedback, replace=False)
rides.loc[category_driver_bad_feedback_idx, "category_driver_feedback"] = np.random.choice(driver_feedback_categories_bad, size=num_category_feedback)

In [21]:
# Each selected ride gets a list of 0..6 distinct sentences from the matching
# pool (positive sentences for the "good" rides, negative for the "bad" ones).
text_good_feedback_driver_length = np.random.randint(low=0, high=7, size=int(NUM_RIDES*0.3*0.2))
text_good_feedback_driver_sample = [random.sample(driver_feedback_good, i) for i in text_good_feedback_driver_length]

text_bad_feedback_driver_length = np.random.randint(low=0, high=7, size=int(NUM_RIDES*0.3*0.2))
text_bad_feedback_driver_sample = [random.sample(driver_feedback_bad, i) for i in text_bad_feedback_driver_length]

for feedback_idx, feedback_texts in (
    (category_driver_good_feedback_idx, text_good_feedback_driver_sample),
    (category_driver_bad_feedback_idx, text_bad_feedback_driver_sample),
):
    # Wrap the ragged list-of-lists in an index-aligned Series so every ride
    # receives one list object; a bare nested list would be read as 2-D data.
    texts = pd.Series(feedback_texts, index=feedback_idx)
    # The index array may contain repeated ride ids; keep the last entry for each.
    texts = texts[~texts.index.duplicated(keep='last')]
    # .loc writes into `rides` itself; the chained rides[col][idx] = ... form
    # does not update the frame under pandas Copy-on-Write.
    rides.loc[texts.index, 'text_driver_feedback'] = texts

Client ratings

In [22]:
# 50% of the rides receive a client rating.
num_client_rates = int(NUM_RIDES*0.5)

# Pick the rated rides WITHOUT replacement: drawing positions with randint
# repeats some of them, so fewer than 50% of rides would end up rated.
client_rate_idx = np.random.choice(NUM_RIDES, size=num_client_rates, replace=False)

# One one-hot row per rated ride; the position of the 1 (plus one) is the
# rating 1..5, drawn with probabilities 0.2 / 0.05 / 0.1 / 0.25 / 0.4.
client_rate_distribution_arr = np.random.multinomial(1, [0.2, 0.05, 0.1, 0.25, 0.4], size=num_client_rates)

# Write through .loc: the chained form rides['client_rate'][idx] = ... assigns
# into a temporary object and does not update `rides` under pandas Copy-on-Write.
rides.loc[client_rate_idx, 'client_rate'] = np.where(client_rate_distribution_arr == 1)[1] + 1

In [23]:
# Category labels used for client feedback: the positive labels first, then
# their negative counterparts in the same order.
client_feedback_categories_good = ['polite', 'pleasant', 'quiet', 'neat and tidy', 'recommend']
client_feedback_categories_bad = ['unpolite', 'unpleasant', 'loud', 'dirty','not recommend']

In [24]:
# Number of rides that get a positive (and, separately, a negative) feedback category.
num_category_feedback = int(NUM_RIDES*0.3*0.2)

# Positive categories go to rides whose client rating is above 3. Rides are
# picked without replacement so no ride is selected twice; np.random.choice
# raises ValueError if fewer rides qualify than are requested.
good_client_rate_rides = rides[rides.client_rate > 3].index
category_client_good_feedback_idx = np.random.choice(good_client_rate_rides, size=num_category_feedback, replace=False)
# .loc writes into `rides` itself; the chained rides[col][idx] = ... form does
# not update the frame under pandas Copy-on-Write.
rides.loc[category_client_good_feedback_idx, "category_client_feedback"] = np.random.choice(client_feedback_categories_good, size=num_category_feedback)

# Negative categories go to rides whose client rating is below 4.
bad_client_rate_rides = rides[rides.client_rate < 4].index
category_client_bad_feedback_idx = np.random.choice(bad_client_rate_rides, size=num_category_feedback, replace=False)
rides.loc[category_client_bad_feedback_idx, "category_client_feedback"] = np.random.choice(client_feedback_categories_bad, size=num_category_feedback)

In [25]:
# Each selected ride gets a list of 0..5 distinct labels: positive labels for
# the "good" rides, negative labels for the "bad" rides.
text_good_feedback_client_length = np.random.randint(low=0, high=6, size=int(NUM_RIDES*0.3*0.2))
text_good_feedback_client_sample = [random.sample(client_feedback_categories_good, i) for i in text_good_feedback_client_length]

text_bad_feedback_client_length = np.random.randint(low=0, high=6, size=int(NUM_RIDES*0.3*0.2))
# Fix: negative texts are sampled from the BAD label list (previously the good list).
text_bad_feedback_client_sample = [random.sample(client_feedback_categories_bad, i) for i in text_bad_feedback_client_length]

# Fix: the negative texts are written to the BAD rides. Previously they were
# written to the good index, overwriting the positive texts and leaving the
# negatively rated rides without any text.
for feedback_idx, feedback_texts in (
    (category_client_good_feedback_idx, text_good_feedback_client_sample),
    (category_client_bad_feedback_idx, text_bad_feedback_client_sample),
):
    # Wrap the ragged list-of-lists in an index-aligned Series so every ride
    # receives one list object; a bare nested list would be read as 2-D data.
    texts = pd.Series(feedback_texts, index=feedback_idx)
    # The index array may contain repeated ride ids; keep the last entry for each.
    texts = texts[~texts.index.duplicated(keep='last')]
    # .loc writes into `rides` itself; the chained rides[col][idx] = ... form
    # does not update the frame under pandas Copy-on-Write.
    rides.loc[texts.index, 'text_client_feedback'] = texts

In [26]:
# Inspect the first 15 generated rides.
rides.head(15)

Unnamed: 0,driver_id,client_id,start,start_latitude,start_longtitude,finish,finish_latitude,finish_longtitude,distance,road_time,start_time,finish_time,cost,driver_rate,category_driver_feedback,text_driver_feedback,client_rate,category_client_feedback,text_client_feedback
0,1156,2050,KT1 9GG,51.404779,-0.292849,SW1P 4ZT,51.481035,-0.135804,13.83,0 days 00:20:00,2015-07-07 15:36:30,2015-07-07 15:56:30,8.91,,,,,,
1,1964,2582,UB1 1SX,51.510379,-0.37596,NW3 5DD,51.550084,-0.173859,14.7,0 days 00:32:00,2012-12-13 19:47:32,2012-12-13 20:19:32,14.02,5.0,,,1.0,,
2,1810,1985,E15 4FF,51.538188,0.005036,BR2 9HU,51.396246,0.02546,15.86,0 days 00:24:00,2017-01-26 02:04:53,2017-01-26 02:28:53,12.91,1.0,unpleasant companion,[],,,
3,2020,1570,W6 9UE,51.486474,-0.224941,SW1X 9JE,51.49887,-0.158157,4.84,0 days 00:15:00,2019-10-03 10:35:32,2019-10-03 10:50:32,4.42,,,,,,
4,120,2309,W8 7JN,51.506061,-0.2032,SW3 2BD,51.49626,-0.169528,2.58,0 days 00:18:00,2014-12-22 22:06:31,2014-12-22 22:24:31,4.28,,,,,,
5,1399,3438,N8 8YQ,51.58647,-0.112639,TW1 1QH,51.45781,-0.321056,20.35,0 days 00:16:00,2014-12-13 13:21:14,2014-12-13 13:37:14,12.18,,,,,,
6,751,2069,SW13 0ZE,51.469285,-0.266096,BR6 0EA,51.376753,0.089794,26.81,0 days 00:25:00,2010-09-20 17:32:42,2010-09-20 17:57:42,15.4,,,,4.0,,
7,335,467,EC1Y 8UJ,51.524921,-0.087953,SE15 4NB,51.467888,-0.067504,6.5,0 days 00:43:00,2013-05-03 06:10:28,2013-05-03 06:53:28,6.82,,,,,,
8,2362,2754,W1S 2DT,51.512233,-0.142255,NW10 4EY,51.538102,-0.244217,7.64,0 days 00:13:00,2016-06-28 23:04:45,2016-06-28 23:17:45,7.57,,,,5.0,,
9,2446,266,EC2Y 5DJ,51.517977,-0.091675,SW11 3PA,51.474668,-0.167728,7.15,0 days 00:13:00,2013-06-30 21:47:30,2013-06-30 22:00:30,5.58,,,,5.0,,


In [27]:
# Export the generated rides to rides.csv in the working directory.
rides.to_csv("rides.csv")

docker-compose exec router sh -c "mongoimport --port 27017 -d taxi -c rides --type csv --file scripts/rides.csv --headerline"
docker-compose exec router sh -c "mongosh < /scripts/query-data.js"

In [29]:
# Check which Python type is stored in the text_driver_feedback cell of ride 240.
type(rides.text_driver_feedback[240])


list