In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load raw dataset (semicolon-delimited CSV).
# Malformed rows are still skipped so loading stays best-effort, but "warn"
# makes pandas report them instead of discarding them silently, so any data
# loss at load time is visible to whoever runs the notebook.
df = pd.read_csv("dataset.csv", delimiter=";", on_bad_lines="warn")

# Step 1: Drop columns with excessive missing values
# A column is kept only when its fraction of missing entries is strictly
# below the threshold.
threshold_missing = 0.98
missing_ratio = df.isnull().mean()
df = df.loc[:, missing_ratio < threshold_missing]



In [16]:
# Step 2: Keep only rows in which every essential field is present
essential_fields = ["Date", "Departure station", "Arrival station"]
df = df.dropna(subset=essential_fields)

In [17]:
# Step 3: Parse Date as datetime, discard rows whose date could not be parsed
# (errors="coerce" turns those into NaT), then derive Year and Month columns.
parsed_dates = pd.to_datetime(df["Date"], errors="coerce")
df = (
    df.assign(Date=parsed_dates)
    .dropna(subset=["Date"])
    .assign(
        Year=lambda frame: frame["Date"].dt.year,
        Month=lambda frame: frame["Date"].dt.month,
    )
)

In [18]:
# Step 4: Replace missing values in every numeric column with 0
# `numeric_cols` is reused later by the correlation heatmap cell.
numeric_cols = df.select_dtypes(include="number").columns
numeric_filled = df[numeric_cols].fillna(0)
df[numeric_cols] = numeric_filled

In [19]:
# Normalise departure-station names, then save the cleaned dataset.
# Names are upper-cased first so the character class only needs A-Z; every
# character that is neither an upper-case letter nor whitespace is removed and
# the result is trimmed.
# Gotcha: the pattern is a raw string, so whitespace must be spelled `\s`.
# Writing `\\s` there would mean "a literal backslash or the letter s" and
# would strip the spaces out of multi-word station names.
df['Departure station'] = (
    df['Departure station']
    .str.upper()
    .str.replace(r"[^A-Z\s]", "", regex=True)
    .str.strip()
)
df.to_csv("cleaned_dataset.csv", index=False)
print("Cleaned dataset saved as cleaned_dataset.csv")

Cleaned dataset saved as cleaned_dataset.csv


In [20]:
# Plot 1: histogram (50 bins, KDE overlay) of the average arrival delay column
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Average delay of all trains at arrival'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Average Arrival Delays')
ax.set_xlabel('Average Delay (minutes)')
ax.set_ylabel('Frequency')
fig.tight_layout()
fig.savefig("arrival_delay_distribution.png")
# Close the figure so it is written to disk only and not kept open.
plt.close(fig)

In [21]:
# Plot 2: departure-delay box plots, restricted to the ten departure stations
# that occur most often in the data
station_counts = df['Departure station'].value_counts()
top_stations = station_counts.head(10).index
in_top_stations = df['Departure station'].isin(top_stations)
station_delays = df[in_top_stations]

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=station_delays,
    x='Departure station',
    y='Average delay of all trains at departure',
    ax=ax,
)
# Tilt the station labels on the x axis.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title('Departure Delay by Station')
fig.tight_layout()
fig.savefig("departure_delay_by_station.png")
plt.close(fig)

In [23]:
# Plot 3: heatmap of the pairwise correlations between the columns listed in
# `numeric_cols` (computed in the numeric-fill step)
correlations = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlations, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Delay Factors')
fig.tight_layout()
fig.savefig("correlation_heatmap.png")
plt.close(fig)

print("\nEDA plots saved: arrival_delay_distribution.png, departure_delay_by_station.png, correlation_heatmap.png")



EDA plots saved: arrival_delay_distribution.png, departure_delay_by_station.png, correlation_heatmap.png
