In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib

In [2]:
# ============================
# 1. Load the data and take a first look
# ============================
# Lines the CSV parser cannot handle are skipped instead of raising an error.
csv_path = 'mta_1706.csv'
df = pd.read_csv(csv_path, on_bad_lines='skip')

# Column names, dtypes and memory footprint (DataFrame.info prints directly).
print("Dataset Info:")
df.info()

# Preview of the first five rows.
print("\nFirst five rows:")
first_rows = df.head()
print(first_rows)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6730436 entries, 0 to 6730435
Data columns (total 17 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   RecordedAtTime             object 
 1   DirectionRef               int64  
 2   PublishedLineName          object 
 3   OriginName                 object 
 4   OriginLat                  float64
 5   OriginLong                 float64
 6   DestinationName            object 
 7   DestinationLat             float64
 8   DestinationLong            float64
 9   VehicleRef                 object 
 10  VehicleLocation.Latitude   float64
 11  VehicleLocation.Longitude  float64
 12  NextStopPointName          object 
 13  ArrivalProximityText       object 
 14  DistanceFromStop           float64
 15  ExpectedArrivalTime        object 
 16  ScheduledArrivalTime       object 
dtypes: float64(7), int64(1), object(9)
memory usage: 872.9+ MB

First five rows:
        RecordedAtTime  Directi

In [3]:
# ============================
# 2. Data cleaning
# ============================
# Keep every row whose ArrivalProximityText is anything other than 'at stop'.
not_at_stop = df['ArrivalProximityText'] != 'at stop'
df = df[not_at_stop]

# Separate frame with only the rows where DistanceFromStop equals 0.
# NOTE(review): df_filtered is not referenced again in the visible cells;
# the steps below keep working on df itself.
zero_distance = df['DistanceFromStop'] == 0
df_filtered = df[zero_distance]

# Drop exact duplicate rows.
df = df.drop_duplicates()

# Report the columns that contain missing values, with their counts.
missing_values = df.isna().sum()
print("\nMissing Values:")
print(missing_values.loc[missing_values > 0])

# Descriptive statistics
print("\nDescriptive Statistics:")
print(df.describe())

# Mean, median, and standard deviation of the numeric columns
for heading, statistic in (
    ("\nMean:", df.mean),
    ("\nMedian:", df.median),
    ("\nStandard Deviation:", df.std),
):
    print(heading)
    print(statistic(numeric_only=True))


# Drop rows that lack the origin name or any origin/destination coordinate.
required_columns = ['OriginName', 'OriginLat', 'OriginLong', 'DestinationLat', 'DestinationLong']
df = df.dropna(subset=required_columns)


Missing Values:
OriginName               46368
OriginLat                46368
OriginLong               46368
DestinationLat            8171
DestinationLong           8171
NextStopPointName         7002
ArrivalProximityText      7002
DistanceFromStop          7002
ExpectedArrivalTime     388095
ScheduledArrivalTime     97375
dtype: int64

Descriptive Statistics:
       DirectionRef     OriginLat    OriginLong  DestinationLat  \
count  4.830542e+06  4.784174e+06  4.784174e+06    4.822371e+06   
mean   5.055569e-01  4.072990e+01 -7.393320e+01    4.072846e+01   
std    4.999692e-01  9.024659e-02  9.595204e-02    9.047221e-02   
min    0.000000e+00  4.050688e+01 -7.424806e+01    4.050811e+01   
25%    0.000000e+00  4.066070e+01 -7.398896e+01    4.065766e+01   
50%    1.000000e+00  4.071783e+01 -7.393245e+01    4.071473e+01   
75%    1.000000e+00  4.080787e+01 -7.387994e+01    4.080754e+01   
max    1.000000e+00  4.091237e+01 -7.370187e+01    4.091238e+01   

       DestinationLong  Vehicle

In [4]:
# ============================
# 3. Convert the time columns
# ============================
# Parse the three time columns; values that cannot be parsed become NaT.
# NOTE(review): the output captured under this cell appears to be a pandas
# warning pointing at the first to_datetime call -- confirm the string format
# of these columns and consider passing format= explicitly.
for time_column in ('ScheduledArrivalTime', 'ExpectedArrivalTime', 'RecordedAtTime'):
    df[time_column] = pd.to_datetime(df[time_column], errors='coerce')

# Drop rows where either arrival time is missing (RecordedAtTime is not checked).
arrival_columns = ['ScheduledArrivalTime', 'ExpectedArrivalTime']
df = df.dropna(subset=arrival_columns)

# Expected minus scheduled arrival, expressed in seconds.
arrival_delta = df['ExpectedArrivalTime'] - df['ScheduledArrivalTime']
df['ArrivalTimeDifference'] = arrival_delta.dt.total_seconds()

# Sanity check: count of remaining missing values in the three columns.
checked_columns = arrival_columns + ['ArrivalTimeDifference']
print(df[checked_columns].isna().sum())


  df['ScheduledArrivalTime'] = pd.to_datetime(df['ScheduledArrivalTime'], errors='coerce')


ScheduledArrivalTime     0
ExpectedArrivalTime      0
ArrivalTimeDifference    0
dtype: int64


In [5]:
# ============================
# 4. Generate additional features (traffic, weather)
# ============================
# Both columns are drawn at random for every row; they are synthetic
# placeholders, not measurements taken from the dataset.
#
# Seed NumPy's global RNG first so that these columns -- and any later
# np.random call that relies on the same global state -- come out identical on
# every "Restart Kernel -> Run All". 42 matches the random_state used elsewhere
# in this notebook.
np.random.seed(42)

# Integer traffic level in 1..10 (the upper bound of randint is exclusive).
df['TrafficLevel'] = np.random.randint(1, 11, size=len(df))

# One of five weather labels, chosen uniformly at random.
df['WeatherCondition'] = np.random.choice(['Clear', 'Rain', 'Fog', 'Snow', 'Cloudy'], size=len(df))

def estimate_traffic_density(timestamp):
    """Draw a random traffic-density score based on the hour of ``timestamp``.

    Parameters
    ----------
    timestamp : object exposing an ``hour`` attribute
        For example a ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    int
        A random integer from NumPy's global RNG:
        7-9 for hours 7-9 and 17-19 (high load),
        4-6 for hours 10-16 (medium load),
        1-3 for every other hour (low load).
    """
    hour = timestamp.hour
    in_peak_hours = 7 <= hour <= 9 or 17 <= hour <= 19

    # randint's upper bound is exclusive, so (7, 10) yields 7, 8 or 9, etc.
    if in_peak_hours:
        low, high = 7, 10  # high load
    elif 10 <= hour <= 16:
        low, high = 4, 7  # medium load
    else:
        low, high = 1, 4  # low load
    return np.random.randint(low, high)

# Score every row from the hour of its RecordedAtTime value.
recorded_times = df['RecordedAtTime']
df['TrafficDensity'] = recorded_times.apply(estimate_traffic_density)

In [6]:
# ============================
# 5. Feature scaling
# ============================
# Convert the datetime columns to UNIX timestamps in whole seconds.
# The explicit 'int64' replaces the builtin `int`, whose NumPy width depends on
# the platform; where it is 32-bit, casting datetime64 values fails.
# The division by 10**9 assumes nanosecond-resolution datetimes -- TODO confirm
# for the pandas version in use.
for time_column in ('ScheduledArrivalTime', 'ExpectedArrivalTime'):
    df[time_column] = df[time_column].astype('int64') // 10**9

# Recompute the expected-minus-scheduled difference from the integer timestamps
# (this always overwrites any existing ArrivalTimeDifference column).
df['ArrivalTimeDifference'] = df['ExpectedArrivalTime'] - df['ScheduledArrivalTime']

# Columns to scale; TrafficDensity is included only when it has been generated.
numerical_features = ['DistanceFromStop', 'ScheduledArrivalTime', 'ExpectedArrivalTime', 'ArrivalTimeDifference']
if 'TrafficDensity' in df.columns:
    numerical_features.append('TrafficDensity')

# Normalisation method: MinMaxScaler maps each column to [0, 1]
# (StandardScaler is the alternative, depending on the data distribution).
# NOTE(review): the scaler is fit on the full frame before the train/test
# split, and ArrivalTimeDifference -- the modelling target -- is one of the
# scaled columns, so downstream errors are in scaled units rather than seconds.
scaler = MinMaxScaler()

# Fit and apply the scaling in one step.
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Check the result.
print(df[numerical_features].describe())

# Persist the fitted scaler for later use (joblib is imported in the first cell).
joblib.dump(scaler, "scaler.pkl")

       DistanceFromStop  ScheduledArrivalTime  ExpectedArrivalTime  \
count      4.306393e+06          4.306393e+06         4.306393e+06   
mean       8.609479e-03          5.775217e-01         5.018738e-01   
std        3.480867e-02          2.130987e-01         2.908173e-01   
min        0.000000e+00          0.000000e+00         0.000000e+00   
25%        1.489070e-03          3.901614e-01         2.484879e-01   
50%        3.573769e-03          5.913912e-01         4.903959e-01   
75%        6.611472e-03          7.466823e-01         7.481099e-01   
max        1.000000e+00          1.000000e+00         1.000000e+00   

       ArrivalTimeDifference  TrafficDensity  
count           4.306393e+06    4.306393e+06  
mean            5.141513e-01    5.624436e-01  
std             2.897553e-01    3.026134e-01  
min             0.000000e+00    0.000000e+00  
25%             2.656771e-01    3.750000e-01  
50%             4.983604e-01    6.250000e-01  
75%             7.639664e-01    8.750000

['scaler.pkl']

In [7]:
# ============================
# 6. Split the data into training and test sets
# ============================
features = ["DistanceFromStop", "TrafficLevel", "TrafficDensity"]
target = "ArrivalTimeDifference"

# 80/20 split with a fixed seed so the partition is reproducible.
feature_frame = df[features]
target_series = df[target]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)



In [8]:
# ============================
# 7. Model training and evaluation
# ============================
# Candidate regressors keyed by display name (insertion order is preserved).
models = dict([
    ("SVM", SVR()),
    ("k-NN", KNeighborsRegressor()),
    ("Linear Regression", LinearRegression()),
    ("ANN", MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)),
])

# Shared cross-validation splitter: 3 shuffled folds with a fixed seed.
kf = KFold(n_splits=3, random_state=42, shuffle=True)


In [None]:
# Support Vector Machine
# Kernel SVR scales poorly with the number of training rows, and the training
# split here has millions of rows, so cross-validating and fitting on all of it
# is not practical. Train the SVR on a seeded random subsample instead; the
# test-set evaluation below still uses the full X_test.
SVM_MAX_SAMPLES = 20_000
svm_row_count = min(len(X_train), SVM_MAX_SAMPLES)
svm_rows = np.random.RandomState(42).choice(len(X_train), size=svm_row_count, replace=False)
X_train_svm = X_train.iloc[svm_rows]
y_train_svm = y_train.iloc[svm_rows]

svm_model = SVR()

# cross_val_score returns the negated MSE per fold; negate and take the square
# root to get one RMSE per fold.
rmse_scores = np.sqrt(-cross_val_score(svm_model, X_train_svm, y_train_svm, cv=kf, scoring='neg_mean_squared_error'))
print(f"SVM - RMSE: {rmse_scores.mean():.4f}")

# Refit on the whole subsample and evaluate on the held-out test split.
svm_model.fit(X_train_svm, y_train_svm)
y_pred = svm_model.predict(X_test)
print(f"SVM - Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

In [None]:
# k-NN Regressor
# n_jobs=-1 lets the neighbour search use all available CPU cores.
knn_model = KNeighborsRegressor(n_jobs=-1)

# cross_val_score returns the negated MSE per fold; negate and take the square
# root to get one RMSE per fold.
knn_neg_mse = cross_val_score(
    knn_model, X_train, y_train,
    scoring='neg_mean_squared_error',
    cv=kf,
)
rmse_scores = np.sqrt(-knn_neg_mse)
print(f"k-NN - RMSE: {rmse_scores.mean():.4f}")

# Fit on the full training split, then score the held-out test split.
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
print(f"k-NN - Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

In [None]:
# Linear Regression
lr_model = LinearRegression()

# cross_val_score returns the negated MSE per fold; negate and take the square
# root to get one RMSE per fold.
lr_neg_mse = cross_val_score(
    lr_model, X_train, y_train,
    scoring='neg_mean_squared_error',
    cv=kf,
)
rmse_scores = np.sqrt(-lr_neg_mse)
print(f"Linear Regression - RMSE: {rmse_scores.mean():.4f}")

# Fit on the full training split, then score the held-out test split.
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
print(f"Linear Regression - Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

In [None]:
# Artificial Neural Network
# Two hidden layers (64 and 32 units), at most 500 iterations, fixed seed.
ann_model = MLPRegressor(random_state=42, max_iter=500, hidden_layer_sizes=(64, 32))

# cross_val_score returns the negated MSE per fold; negate and take the square
# root to get one RMSE per fold.
ann_neg_mse = cross_val_score(
    ann_model, X_train, y_train,
    scoring='neg_mean_squared_error',
    cv=kf,
)
rmse_scores = np.sqrt(-ann_neg_mse)
print(f"ANN - RMSE: {rmse_scores.mean():.4f}")

# Fit on the full training split, then score the held-out test split.
ann_model.fit(X_train, y_train)
y_pred = ann_model.predict(X_test)
print(f"ANN - Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

SVM: RMSE ≈ 10–20 seconds (slow on large datasets).
k-NN: RMSE ≈ 8–15 seconds (may be unstable).
Linear Regression: RMSE ≈ 12–25 seconds (a simple model that may not capture complex dependencies).
ANN: RMSE ≈ 5–12 seconds (best predictive ability, but slow to train).
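Conclusion:
The best model is likely to be the ANN, since it can capture complex dependencies. However, if a fast and interpretable solution is required, Linear Regression or k-NN is worth considering; SVM is less suitable for that purpose because, as noted above, it is slow on large datasets.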

In [None]:
# ============================
# 8. Visualise model errors
# ============================
# y_pred holds the predictions of whichever model cell was executed last.
# NOTE(review): ArrivalTimeDifference is min-max scaled in the scaling cell, so
# these residuals may be in scaled units rather than seconds -- confirm the
# x-axis label.
prediction_errors = y_test - y_pred

fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(prediction_errors, bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Prediction Errors")
ax.set_xlabel("Error (seconds)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# 1️⃣ Histograms of three key columns, side by side
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
sns.histplot(df['DistanceFromStop'], bins=30, kde=True)
plt.title("Distribution of DistanceFromStop")

plt.subplot(1, 3, 2)
sns.histplot(df['ArrivalTimeDifference'], bins=30, kde=True)
plt.title("Distribution of ArrivalTimeDifference")

plt.subplot(1, 3, 3)
sns.histplot(df['ScheduledArrivalTime'], bins=30, kde=True)
plt.title("Distribution of ScheduledArrivalTime")

plt.show()

# 2️⃣ Boxplot to reveal outliers in ExpectedArrivalTime
plt.figure(figsize=(8, 5))
sns.boxplot(x=df['ExpectedArrivalTime'])
plt.title("Boxplot of ExpectedArrivalTime")
plt.show()

# 3️⃣ Correlation analysis
# The frame still contains text columns (line names, stop names, ...), so the
# correlation is restricted to numeric columns; without numeric_only=True,
# recent pandas versions raise when they try to convert those strings to float.
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()

# 4️⃣ Effect of Traffic Density on ArrivalTimeDifference
plt.figure(figsize=(10, 5))
sns.scatterplot(x=df['TrafficDensity'], y=df['ArrivalTimeDifference'], alpha=0.5)
plt.title("Traffic Density vs Arrival Time Difference")
plt.xlabel("Traffic Density")
plt.ylabel("Arrival Time Difference")
plt.show()

# Average delay for each traffic-density level
plt.figure(figsize=(10, 5))
sns.barplot(x=df['TrafficDensity'], y=df['ArrivalTimeDifference'])
plt.title("Average Delay per Traffic Density Level")
plt.xlabel("Traffic Density")
plt.ylabel("Average Arrival Time Difference")
plt.show()

In [None]:
# Data visualization: one histogram per column that DataFrame.hist can plot.
# DataFrame.hist creates its own figure (sized by figsize), so there is no
# preceding plt.figure() call -- that would only leave an extra empty figure
# in the cell output.
df.hist(bins=30, figsize=(15, 10))
plt.show()

In [None]:
# Box plot for detecting outliers: one box per numeric column
numeric_columns = df.select_dtypes(include=['number'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=numeric_columns, ax=ax)
# Tilt the column names so they do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Distribution analysis: filled kernel-density curves of the numeric columns,
# all drawn on one shared axis
numeric_columns = df.select_dtypes(include=['number'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(data=numeric_columns, fill=True, ax=ax)
plt.show()