In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from geopy.distance import geodesic
import random
import datetime
# from datetime import datetime, timedelta, strftime  # Импортируйте strftime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# adresses = pd.read_csv('london_postcodes-ons-postcodes-directory-feb22.csv', delimiter=',')
adresses = pd.read_csv('London_postcodes.csv', delimiter=',')

adresses.head()

Unnamed: 0,Postcode,In Use?,Latitude,Longitude,Easting,Northing,GridRef,County,District,Ward,...,Population,Households,Built up area,Built up sub-division,Lower layer super output area,Rural/urban,Region,Altitude,London zone,LSOA Code
0,BR1 1AA,Yes,51.401546,0.015415,540291,168873,TQ402688,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 018B,Urban major conurbation,London,71,5.0,E01000675
1,BR1 1AB,Yes,51.406333,0.015208,540262,169405,TQ402694,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 008B,Urban major conurbation,London,71,4.0,E01000676
2,BR1 1AD,Yes,51.400057,0.016715,540386,168710,TQ403687,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 018B,Urban major conurbation,London,53,5.0,E01000675
3,BR1 1AE,Yes,51.404543,0.014195,540197,169204,TQ401692,Greater London,Bromley,Bromley Town,...,34.0,21.0,Greater London,Bromley,Bromley 018C,Urban major conurbation,London,71,4.0,E01000677
4,BR1 1AF,Yes,51.401392,0.014948,540259,168855,TQ402688,Greater London,Bromley,Bromley Town,...,,,Greater London,Bromley,Bromley 018B,Urban major conurbation,London,58,5.0,E01000675


In [3]:
taxi_rides = pd.DataFrame(
    columns=['driver_id', 'trip_id', 'client_id', 'start_point', 'end_point', 'distance', 'start_time',
             'end_time', 'road_time', 'cost', 'driver_rate', 'client_rate', 'category_driver_feedback', 
             'text_driver_feedback', 'category_client_feedback', 'text_client_feedback'
             ]
    )

In [4]:
num_rows = 5_000_000
num_drivers = 5000
num_clients = 2000

start_date = datetime.datetime(2023, 1, 1)
end_date = datetime.datetime(2024, 1, 1)


driver_id_range = range(1, num_drivers + 1)
client_id_range = range(1, num_clients + 1)


random_addresses = adresses.sample(n=num_rows, random_state=42, replace=True)

# Заполняем столбцы start_point и end_point случайными парами широты и долготы
taxi_rides['start_point'] = list(zip(random_addresses['Latitude'], random_addresses['Longitude']))
taxi_rides['end_point'] = list(zip(random_addresses['Latitude'].sample(frac=1, random_state=42), random_addresses['Longitude'].sample(frac=1, random_state=42)))

taxi_rides['driver_id'] = [random.choice(driver_id_range) for i in range(taxi_rides.shape[0])]
taxi_rides['client_id'] = [random.choice(client_id_range) for i in range(taxi_rides.shape[0])]

# Рассчитываем расстояние
taxi_rides['distance'] = [geodesic(start, end).km for start, end in taxi_rides[['start_point', 'end_point']].values]
taxi_rides['distance'] = taxi_rides['distance'].round(2)

taxi_rides['start_time'] = [datetime.datetime.strftime(start_date + datetime.timedelta(seconds=random.randint(0, 31536000)), '%Y-%m-%d %H:%M:%S') for i in range(num_rows)]
taxi_rides['road_time'] = [datetime.timedelta(seconds=distance / 50 * 3600) for distance in taxi_rides['distance']]
taxi_rides['end_time'] = [datetime.datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') + road_time for start_time, road_time in zip(taxi_rides['start_time'], taxi_rides['road_time'])]

taxi_rides['start_time'] = pd.to_datetime(taxi_rides['start_time'], format='%Y-%m-%d %H:%M:%S')
taxi_rides['end_time'] = pd.to_datetime(taxi_rides['end_time'], format='%Y-%m-%d %H:%M:%S')

taxi_rides['trip_id'] = range(1, len(taxi_rides) + 1)


In [5]:
def count_cost(start_time, distance):
    cost = 5 + .5 * distance
    if (start_time.hour >= 8 and start_time.hour <= 11) or (start_time.hour >= 17 and start_time.hour <= 19):
        cost *= 1.5

    if (start_time.hour >= 23 and start_time.hour <= 6):
        cost *= 1.75
    return cost


taxi_rides['cost'] = [count_cost(time, dist) for time, dist in zip(taxi_rides['start_time'], taxi_rides['distance'])]
taxi_rides['cost'] = taxi_rides['cost'].round(2)

In [6]:
driver_rate_idx = np.random.randint(low=0, high=num_drivers, size=int(num_rows*0.85))
driver_rate_distr_arr = np.random.multinomial(1, [.2, .05, .1, .25, .4], size=int(num_rows*0.85))
taxi_rides['driver_rate'][driver_rate_idx] = np.where(driver_rate_distr_arr == 1)[1] + 1

client_rate_idx = np.random.randint(low=0, high=num_drivers, size=int(num_rows*0.75))
client_rate_distr_arr = np.random.multinomial(1, [.2, .05, .1, .25, .4], size=int(num_rows*0.75))
taxi_rides['client_rate'][driver_rate_idx] = np.where(driver_rate_distr_arr == 1)[1] + 1

In [7]:
driver_fb_cat_good = ['good service', 'nice car', 'expert navigation']
driver_fb_cat_bad = ['awful service', 'bad car', 'dirty', 'rude']

client_fb_cat_good = ['polite', 'pleasant', 'quiet']
client_fb_cat_bad = ['unpolite', 'unpleasant', 'loud', 'dirty']

category_driver_good_feedback_idx = np.random.choice(taxi_rides[taxi_rides.driver_rate > 3].index, size=int(num_rows*0.3*0.2))
taxi_rides["category_driver_feedback"][category_driver_good_feedback_idx] = np.random.choice(driver_fb_cat_good, size=int(num_rows*0.3*0.2))

category_driver_bad_feedback_idx = np.random.choice(taxi_rides[taxi_rides.driver_rate < 4].index, size=int(num_rows*0.3*0.2))
taxi_rides["category_driver_feedback"][category_driver_bad_feedback_idx] = np.random.choice(driver_fb_cat_bad, size=int(num_rows*0.3*0.2))


text_good_feedback_driver_length = np.random.randint(low=0, high=len(driver_fb_cat_good), size=int(num_rows*0.3*0.2))
text_good_feedback_driver_sample = [random.sample(driver_fb_cat_good, i) for i in text_good_feedback_driver_length]
taxi_rides['text_driver_feedback'][category_driver_good_feedback_idx] = text_good_feedback_driver_sample

text_bad_feedback_driver_length = np.random.randint(low=0, high=len(driver_fb_cat_bad), size=int(num_rows*0.3*0.2))
text_bad_feedback_driver_sample = [random.sample(driver_fb_cat_bad, i) for i in text_bad_feedback_driver_length]
taxi_rides['text_driver_feedback'][category_driver_bad_feedback_idx] = text_bad_feedback_driver_sample

# ------------------

category_client_good_feedback_idx = np.random.choice(taxi_rides[taxi_rides.client_rate > 3].index, size=int(num_rows*0.3*0.2))
taxi_rides["category_client_feedback"][category_client_good_feedback_idx] = np.random.choice(client_fb_cat_good, size=int(num_rows*0.3*0.2))

category_client_bad_feedback_idx = np.random.choice(taxi_rides[taxi_rides.client_rate < 4].index, size=int(num_rows*0.3*0.2))
taxi_rides["category_client_feedback"][category_client_bad_feedback_idx] = np.random.choice(client_fb_cat_bad, size=int(num_rows*0.3*0.2))


text_good_feedback_client_length = np.random.randint(low=0, high=len(client_fb_cat_good), size=int(num_rows*0.3*0.2))
text_good_feedback_client_sample = [random.sample(client_fb_cat_good, i) for i in text_good_feedback_client_length]
taxi_rides['text_client_feedback'][category_client_good_feedback_idx] = text_good_feedback_client_sample

text_bad_feedback_client_length = np.random.randint(low=0, high=len(client_fb_cat_bad), size=int(num_rows*0.3*0.2))
text_bad_feedback_client_sample = [random.sample(client_fb_cat_bad, i) for i in text_bad_feedback_client_length]
taxi_rides['text_client_feedback'][category_client_bad_feedback_idx] = text_bad_feedback_client_sample


In [8]:
taxi_rides

Unnamed: 0,driver_id,trip_id,client_id,start_point,end_point,distance,start_time,end_time,road_time,cost,driver_rate,client_rate,category_driver_feedback,text_driver_feedback,category_client_feedback,text_client_feedback
0,220,1,1441,"(51.62878, -0.172348)","(51.570856, -0.009083)",13.02,2023-09-03 15:31:15,2023-09-03 15:46:52.440,0 days 00:15:37.440000,11.51,4,4,good service,[],quiet,"[pleasant, quiet]"
1,3074,2,106,"(51.551131, -0.173485)","(51.51886, -0.141654)",4.22,2023-04-15 14:21:03,2023-04-15 14:26:06.840,0 days 00:05:03.840000,7.11,5,5,expert navigation,"[expert navigation, good service]",quiet,[quiet]
2,2869,3,929,"(51.580247, -0.1188)","(51.336611, -0.119961)",27.11,2023-08-07 23:29:41,2023-08-08 00:02:12.920,0 days 00:32:31.920000,18.56,2,2,awful service,"[dirty, awful service, bad car]",unpleasant,"[loud, dirty, unpolite]"
3,2082,4,853,"(51.546822, -0.442101)","(51.427511, -0.386121)",13.83,2023-12-23 18:52:09,2023-12-23 19:08:44.760,0 days 00:16:35.760000,17.87,3,3,rude,"[bad car, awful service]",dirty,"[loud, unpleasant, unpolite]"
4,125,5,303,"(51.566182, -0.137344)","(51.511651, -0.072714)",7.54,2023-05-29 02:27:54,2023-05-29 02:36:56.880,0 days 00:09:02.880000,8.77,5,5,nice car,[],polite,[polite]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,2638,4999996,1009,"(51.588022, -0.221766)","(51.497988, -0.215706)",10.03,2023-02-24 01:56:52,2023-02-24 02:08:54.160,0 days 00:12:02.160000,10.02,,,,,,
4999996,4561,4999997,409,"(51.512717, -0.146083)","(51.425729, -0.218363)",10.90,2023-06-27 11:01:52,2023-06-27 11:14:56.800,0 days 00:13:04.800000,15.68,,,,,,
4999997,4673,4999998,1184,"(51.568286, -0.007608)","(51.527917, -0.144024)",10.47,2023-05-15 23:43:13,2023-05-15 23:55:46.840,0 days 00:12:33.840000,10.24,,,,,,
4999998,2275,4999999,1098,"(51.519188, -0.15271)","(51.587977, -0.221811)",9.03,2023-06-08 08:21:26,2023-06-08 08:32:16.160,0 days 00:10:50.160000,14.27,,,,,,


In [9]:
taxi_rides.to_csv('taxi.csv',  index=False)