In [53]:
import pandas as pd
import numpy as np


In [54]:
from sklearn.cluster import KMeans
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor


In [55]:
# Read only the first 1,000,000 rows of the training CSV, then display the frame.
train = pd.read_csv(filepath_or_buffer='train.csv', nrows=1_000_000)
train

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.841610,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.761270,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.987130,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
...,...,...,...,...,...,...,...,...
999995,2014-09-13 21:44:38.0000001,7.0,2014-09-13 21:44:38 UTC,-73.976676,40.785630,-73.959196,40.775114,1
999996,2010-09-20 14:50:37.0000002,7.3,2010-09-20 14:50:37 UTC,-73.992103,40.671385,-73.992103,40.671385,2
999997,2013-04-26 14:03:00.000000118,10.5,2013-04-26 14:03:00 UTC,-73.978423,40.751135,-73.993397,40.762197,5
999998,2011-07-08 00:29:00.00000099,6.9,2011-07-08 00:29:00 UTC,-73.980317,40.759482,-73.976832,40.743122,1


In [56]:
# Convert the pickup timestamp strings (e.g. "2009-06-15 17:26:21 UTC") to datetimes
# so the `.dt` accessor can be used on the column.
train['pickup_datetime'] = train['pickup_datetime'].pipe(pd.to_datetime)

In [57]:
# Derive calendar features from the parsed pickup timestamp.
# Keys are the new column names; values are the matching `.dt` accessor attributes
# (day_of_week is 0 for Monday through 6 for Sunday).
for column, dt_attr in {
    'year': 'year',
    'month': 'month',
    'day': 'day',
    'day_of_week': 'dayofweek',
    'hour': 'hour',
}.items():
    train[column] = getattr(train['pickup_datetime'].dt, dt_attr)

In [58]:
# Remove the 'key' column and the raw timestamp; the calendar features derived
# from the timestamp stay in the frame.
train = train.drop(columns=['key', 'pickup_datetime'])
train

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,day_of_week,hour
0,4.5,-73.844311,40.721319,-73.841610,40.712278,1,2009,6,15,0,17
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,1,16
2,5.7,-73.982738,40.761270,-73.991242,40.750562,2,2011,8,18,3,0
3,7.7,-73.987130,40.733143,-73.991567,40.758092,1,2012,4,21,5,4
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,1,7
...,...,...,...,...,...,...,...,...,...,...,...
999995,7.0,-73.976676,40.785630,-73.959196,40.775114,1,2014,9,13,5,21
999996,7.3,-73.992103,40.671385,-73.992103,40.671385,2,2010,9,20,0,14
999997,10.5,-73.978423,40.751135,-73.993397,40.762197,5,2013,4,26,4,14
999998,6.9,-73.980317,40.759482,-73.976832,40.743122,1,2011,7,8,4,0


In [59]:
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points via the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius_km : float, default 6371.0
        Radius of the sphere. The default is the Earth's radius in
        kilometres, so the result is in kilometres.

    Returns
    -------
    float, numpy.ndarray or pandas.Series
        Distance(s) in the unit of ``radius_km``. The computation is
        element-wise: pandas Series inputs yield a Series, plain lists or
        tuples yield an ndarray. NaN coordinates produce NaN distances.
    """
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    # np.subtract instead of the `-` operator so plain Python sequences
    # (lists, tuples) are accepted as well as arrays and Series.
    delta_phi = np.radians(np.subtract(lat2, lat1))
    delta_lambda = np.radians(np.subtract(lon2, lon1))
    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) NaN; cap it at 1.
    a = np.minimum(a, 1.0)
    return radius_km * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))


In [60]:
# Coordinate columns, listed in the argument order haversine_distance expects:
# (lat1, lon1, lat2, lon2) = pickup point, then dropoff point.
coord_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

# Collapse the four coordinates into a single trip-distance feature,
# then drop the raw coordinates from the frame.
train['distance'] = haversine_distance(*(train[col] for col in coord_cols))
train = train.drop(columns=coord_cols)
train

Unnamed: 0,fare_amount,passenger_count,year,month,day,day_of_week,hour,distance
0,4.5,1,2009,6,15,0,17,1.030764
1,16.9,1,2010,1,5,1,16,8.450134
2,5.7,2,2011,8,18,3,0,1.389525
3,7.7,1,2012,4,21,5,4,2.799270
4,5.3,1,2010,3,9,1,7,1.999157
...,...,...,...,...,...,...,...,...
999995,7.0,1,2014,9,13,5,21,1.879763
999996,7.3,2,2010,9,20,0,14,0.000000
999997,10.5,5,2013,4,26,4,14,1.761741
999998,6.9,1,2011,7,8,4,0,1.842683


In [61]:
# Separate the features from the target (fare_amount) ...
x = train.drop(columns='fare_amount')
y = train['fare_amount'].to_numpy()

# ... and hold out 10% of the rows for validation, with a fixed seed for the shuffle.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

In [62]:
# Gradient-boosting regressor with all-default hyperparameters
# (no arguments are passed, so nothing is tuned here).
model = CatBoostRegressor()

In [63]:
# Fit on the training split; the validation split is passed as eval_set so the
# log reports the metric on held-out data ("test") next to the training value ("learn").
model.fit(
    x_train,
    y_train,
    eval_set=(x_val, y_val),
)

Learning rate set to 0.148153
0:	learn: 8.8902465	test: 8.8440728	best: 8.8440728 (0)	total: 51ms	remaining: 50.9s
1:	learn: 8.1204448	test: 8.0763391	best: 8.0763391 (1)	total: 99.8ms	remaining: 49.8s
2:	learn: 7.5185910	test: 7.4776196	best: 7.4776196 (2)	total: 149ms	remaining: 49.5s
3:	learn: 7.0342501	test: 6.9965719	best: 6.9965719 (3)	total: 202ms	remaining: 50.2s
4:	learn: 6.6426916	test: 6.6081482	best: 6.6081482 (4)	total: 252ms	remaining: 50.1s
5:	learn: 6.3381541	test: 6.3061546	best: 6.3061546 (5)	total: 299ms	remaining: 49.6s
6:	learn: 6.1014075	test: 6.0713666	best: 6.0713666 (6)	total: 344ms	remaining: 48.9s
7:	learn: 5.9159243	test: 5.8897505	best: 5.8897505 (7)	total: 394ms	remaining: 48.8s
8:	learn: 5.7732642	test: 5.7491866	best: 5.7491866 (8)	total: 441ms	remaining: 48.5s
9:	learn: 5.6650244	test: 5.6439001	best: 5.6439001 (9)	total: 486ms	remaining: 48.1s
10:	learn: 5.5749127	test: 5.5595252	best: 5.5595252 (10)	total: 532ms	remaining: 47.9s
11:	learn: 5.5056810	t

<catboost.core.CatBoostRegressor at 0x2978048aa80>

In [64]:
# Load the full test set (no fare_amount column) and display it.
test = pd.read_csv(filepath_or_buffer='test.csv')
test

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.973320,40.763805,-73.981430,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.751260,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.981160,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51 UTC,-73.968124,40.796997,-73.955643,40.780388,6
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51 UTC,-73.945511,40.803600,-73.960213,40.776371,6
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15 UTC,-73.991600,40.726608,-73.789742,40.647011,6
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19 UTC,-73.985573,40.735432,-73.939178,40.801731,6


In [65]:
# Apply the same feature engineering to the test set as was applied to train:
# parse the timestamp, expand it into calendar features, add the trip distance.
test['pickup_datetime'] = test['pickup_datetime'].pipe(pd.to_datetime)

# New column name -> `.dt` accessor attribute (day_of_week: Monday=0 ... Sunday=6).
for column, dt_attr in {
    'year': 'year',
    'month': 'month',
    'day': 'day',
    'day_of_week': 'dayofweek',
    'hour': 'hour',
}.items():
    test[column] = getattr(test['pickup_datetime'].dt, dt_attr)

# Argument order is (lat1, lon1, lat2, lon2): pickup point first, then dropoff.
test['distance'] = haversine_distance(
    test['pickup_latitude'],
    test['pickup_longitude'],
    test['dropoff_latitude'],
    test['dropoff_longitude'],
)

In [66]:
# Keep the 'key' column aside: it is removed from the feature frame before
# prediction but is needed again as the identifier column of the submission.
test_key = test['key']

In [67]:
# Columns that are not model inputs: the raw timestamp, the raw coordinates
# (replaced by 'distance') and the 'key' identifier.
non_feature_cols = [
    'pickup_datetime',
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'key',
]
test = test.drop(columns=non_feature_cols)
test

Unnamed: 0,passenger_count,year,month,day,day_of_week,hour,distance
0,1,2015,1,27,1,13,2.323260
1,1,2015,1,27,1,13,2.425353
2,1,2011,10,8,5,11,0.618628
3,1,2012,12,1,5,21,1.961033
4,1,2012,12,1,5,21,5.387301
...,...,...,...,...,...,...,...
9909,6,2015,5,10,6,12,2.124874
9910,6,2015,1,12,0,17,3.270969
9911,6,2015,4,19,6,20,19.183941
9912,6,2015,1,31,5,1,8.343486


In [68]:
# Predict a fare for every test row.
predictions = model.predict(test)

# Pair each prediction with the key of its row, in the original row order.
submission = pd.DataFrame(dict(key=test_key, fare_amount=predictions))

# Save the results (without the DataFrame index) and confirm on stdout.
submission.to_csv('taxi_fare_submission.csv', index=False)
print("Файл taxi_fare_submission.csv успешно создан.")

Файл taxi_fare_submission.csv успешно создан.
