In [138]:
# Импорт библиотек
import pandas as pd
import numpy as np
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [139]:
# Загрузка данных 
data = pd.read_csv('taxi_trip_pricing.csv')

In [140]:
data.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


In [141]:
# Заполнение пропусков
for col in data.columns:
    if data[col].isnull().any():
        if data[col].dtype in ['int64', 'float64']:
          median_value = data[col].median() # Замена на медиану для численных признаков
          data[col] = data[col].fillna(median_value)
        else:
            mode_value = data[col].mode()[0]  # Замена на моду для категориальных признаков
            data[col] = data[col].fillna(mode_value)

In [142]:
data.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,3.52,0.62,0.43,40.57,50.0745
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,Clear,3.48,0.51,0.15,116.81,36.4698
4,25.83,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


In [129]:
# Определение признаков
categorical_features = ['Time_of_Day', 'Day_of_Week', 'Traffic_Conditions', 'Weather']
numerical_features = ['Trip_Distance_km', 'Passenger_Count', 'Base_Fare', 'Per_Km_Rate', 'Per_Minute_Rate', 'Trip_Duration_Minutes']

In [143]:
# Разделение на обучающую и тестовую выборки
X = data.drop('Trip_Price', axis=1)
y = data['Trip_Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [144]:
# Выбор категориальных колонок из обучающей выборки
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Создание и применение OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])
feature_names = encoder.get_feature_names_out(categorical_cols)


# Преобразование в DataFrame и объединение
X_train_encoded = pd.DataFrame(X_train_encoded, columns = feature_names)
X_test_encoded = pd.DataFrame(X_test_encoded, columns = feature_names)

X_train_num = X_train[numerical_features].reset_index(drop=True)
X_test_num = X_test[numerical_features].reset_index(drop = True)

X_train = pd.concat([X_train_num, X_train_encoded], axis = 1)
X_test = pd.concat([X_test_num, X_test_encoded], axis = 1)
X_train.head()

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Day_of_Week_Weekend,Traffic_Conditions_Low,Traffic_Conditions_Medium,Weather_Rain,Weather_Snow
0,3.28,2.0,2.88,1.76,0.2,78.04,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,47.29,1.0,2.71,1.75,0.14,87.75,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,30.92,4.0,2.47,1.31,0.23,15.73,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,2.41,2.0,3.47,0.87,0.19,31.26,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,45.83,3.0,2.5,0.57,0.48,61.86,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [145]:
# Обучение модели
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

In [146]:
# Оценка модели
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"R2 Score: {r2}")

RMSE: 7.436024100852411
R2 Score: 0.9334378258555325


In [147]:
dump(model, 'taxi_fare_model.pkl')
print("Модель сохранена как taxi_fare_model.pkl")

Модель сохранена как taxi_fare_model.pkl
