In [None]:
import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from plotly.offline import init_notebook_mode
# Make "notebook_connected" the default Plotly renderer.
pio.renderers.default = "notebook_connected"
# Initialise Plotly's notebook mode.
# NOTE(review): connected=False here next to the "notebook_connected" renderer above
# looks inconsistent — confirm which plotly.js loading mode is actually intended.
init_notebook_mode(connected=False)
# Put cufflinks into offline mode.
cf.go_offline()

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

import folium

In [None]:
# Load the station table used by the network analyses below and preview it.
METRO_NETWORK_CSV = "Kolkata Metro Network.csv"
metro_data = pd.read_csv(METRO_NETWORK_CSV)
metro_data

In [None]:
# Line analysis: station count and average inter-station spacing for each line.

# Number of stations on each line (indexed by line name).
stations_per_line = metro_data['Line'].value_counts()

# Total length of each line, taken as the largest 'Distance from Start (km)' on it.
total_distance_per_line = metro_data.groupby('Line')['Distance from Start (km)'].max()

# N stations span N - 1 gaps. A single-station line has no gap, so mask the
# zero divisor and get NaN instead of inf.
gaps_per_line = (stations_per_line - 1).where(stations_per_line > 1)
avg_distance_per_line = total_distance_per_line / gaps_per_line

# Build the table from the two Series so pandas aligns them on the line name.
# Passing `.index` / `.values` arrays next to a Series pairs rows by position,
# and value_counts() order differs from the aligned Series order, which
# attaches averages to the wrong line.
line_analysis = (
    pd.DataFrame({
        'Number of Stations': stations_per_line,
        'Average Distance Between Stations (km)': avg_distance_per_line,
    })
    .rename_axis('Line')
    .reset_index()
    # busiest lines first, then a clean 0..n-1 index
    .sort_values(by='Number of Stations', ascending=False)
    .reset_index(drop=True)
)
line_analysis

In [None]:
# Two side-by-side horizontal bar charts drawn from `line_analysis`.
panel_titles = ('Number of Stations Per Metro Line',
                'Average Distance Between Stations Per Metro Line')
fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles, horizontal_spacing=0.1)

# One entry per panel, left to right:
# (value column, legend name, bar colour, x-axis title, y-axis title)
panel_specs = [
    ('Number of Stations', 'Number of Stations', 'deepskyblue',
     "Number of Stations", "Metro Line"),
    ('Average Distance Between Stations (km)', 'Average Distance (km)', 'darkorange',
     "Average Distance Between Stations (km)", ""),
]

for panel_col, (value_col, legend_name, bar_color, x_title, y_title) in enumerate(panel_specs, start=1):
    bars = go.Bar(y=line_analysis['Line'], x=line_analysis[value_col],
                  orientation='h', name=legend_name, marker_color=bar_color)
    fig.add_trace(bars, row=1, col=panel_col)
    fig.update_xaxes(title_text=x_title, row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)

# overall figure size, title and theme
fig.update_layout(height=300, width=1000, title_text="Metro Line Analysis", template="plotly_white")

fig.show()

In [None]:
# Geospatial analysis: put every station on an interactive map, coloured by line.

# Marker colour for each metro line
line_colors = {
    'Orange Line': 'orange',
    'Purple Line': 'purple',
    'Green Line': 'green',
    'Blue Line': 'blue'
}

# Base map centred on Kolkata
kolkata_map_with_line_tooltip = folium.Map(location=[22.5726, 88.3639], zoom_start=12)

# Only stations that have both coordinates can be placed on the map
located_stations = metro_data.dropna(subset=['Latitude', 'Longitude'])

for _, station in located_stations.iterrows():
    line = station['Line']
    station_marker = folium.Marker(
        location=[station['Latitude'], station['Longitude']],
        popup=f"{station['Station Name']}",
        tooltip=f"{station['Station Name']}, {line}",
        # lines missing from the colour table are drawn in black
        icon=folium.Icon(color=line_colors.get(line, 'black'))
    )
    station_marker.add_to(kolkata_map_with_line_tooltip)

# Render the finished map as the cell output
kolkata_map_with_line_tooltip

In [None]:
# Temporal analysis: how many stations opened in each calendar year.

# Parse 'Opening Date' as day-first dates; entries that cannot be parsed become NaT.
metro_data['Opening Date'] = pd.to_datetime(metro_data['Opening Date'], dayfirst=True, errors='coerce')

# Opening year for every station (NaN where the date could not be parsed).
# Rows without a date are deliberately kept in metro_data: reassigning it to a
# dropna() result would remove those stations from every later cell, leave gaps
# in the index, and can make this column assignment raise SettingWithCopyWarning.
metro_data['Opening Year'] = metro_data['Opening Date'].dt.year

# Count openings per year, ignoring stations without a usable date.
opening_years = metro_data['Opening Year'].dropna().astype(int)
stations_per_year = opening_years.value_counts().sort_index()

# Two-column table (Year, Number of Stations) for plotting
stations_per_year_df = stations_per_year.rename_axis('Year').reset_index(name='Number of Stations')

# Bar chart of openings per year
fig = px.bar(
    stations_per_year_df,
    x='Year',
    y='Number of Stations',
    title="Number of Metro Stations Opened Each Year in Kolkata",
    labels={'Year': 'Year', 'Number of Stations': 'Number of Stations Opened'}
)
fig.update_layout(
    xaxis_tickangle=-45,
    xaxis=dict(tickmode='linear'),  # one tick per year
    yaxis=dict(title='Number of Stations Opened'),
    xaxis_title="Year"
)

# Display the plot
fig.show()

In [None]:
# Line analysis: number of stations and average distance between stations per line.

# Number of stations on each line (indexed by line name).
stations_per_line = metro_data['Line'].value_counts()

# Total length of each line, taken as the largest 'Distance from Start (km)' on it.
total_distance_per_line = metro_data.groupby('Line')['Distance from Start (km)'].max()

# N stations span N - 1 gaps; mask single-station lines so they yield NaN, not inf.
gaps_per_line = (stations_per_line - 1).where(stations_per_line > 1)
avg_distance_per_line = total_distance_per_line / gaps_per_line

# Combine the two Series so pandas aligns them on the line name. Mixing
# `.index` / `.values` arrays with a Series would pair rows by position and,
# because the two orders differ, attach averages to the wrong line.
line_analysis = (
    pd.DataFrame({
        'Number of Stations': stations_per_line,
        'Average Distance Between Stations (km)': avg_distance_per_line,
    })
    .rename_axis('Line')
    .reset_index()
    # busiest lines first, then a clean 0..n-1 index
    .sort_values(by='Number of Stations', ascending=False)
    .reset_index(drop=True)
)
print(line_analysis)

In [None]:
# Side-by-side comparison of the metro lines: station count (left) and
# average station spacing (right).

# Build the two bar traces first ...
stations_bar = go.Bar(
    y=line_analysis['Line'],
    x=line_analysis['Number of Stations'],
    orientation='h',
    name='Number of Stations',
    marker_color='crimson',
)
spacing_bar = go.Bar(
    y=line_analysis['Line'],
    x=line_analysis['Average Distance Between Stations (km)'],
    orientation='h',
    name='Average Distance (km)',
    marker_color='navy',
)

# ... then place them on a 1 x 2 grid
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Number of Stations Per Metro Line',
                    'Average Distance Between Stations Per Metro Line'),
    horizontal_spacing=0.2,
)
fig.add_trace(stations_bar, row=1, col=1)
fig.add_trace(spacing_bar, row=1, col=2)

# Axis titles: the right-hand panel shares the line names, so its y title is blank
fig.update_xaxes(title_text="Number of Stations", row=1, col=1)
fig.update_yaxes(title_text="Metro Line", row=1, col=1)
fig.update_xaxes(title_text="Average Distance Between Stations (km)", row=1, col=2)
fig.update_yaxes(title_text="", row=1, col=2)

# Figure-level size, title and theme
fig.update_layout(height=300, width=1000, title_text="Metro Line Analysis", template="plotly_white")

fig.show()

In [None]:
# Station layout analysis: how many stations use each layout type.

layout_counts = metro_data['Station Layout'].value_counts()

# One bar per layout. The colour is driven by the layout name, which is
# categorical, so the palette has to be supplied through
# `color_discrete_sequence`; `color_continuous_scale` only applies to numeric
# colour values and was silently ignored here.
fig = px.bar(x=layout_counts.index, y=layout_counts.values,
             labels={'x': 'Station Layout', 'y': 'Number of Stations'},
             title='Distribution of Kolkata Metro Station Layouts',
             color=layout_counts.index,
             color_discrete_sequence=px.colors.qualitative.Pastel)

# Axis titles and theme. (`coloraxis_showscale` was dropped: a categorical
# colour produces no colour axis, so the setting had nothing to hide.)
fig.update_layout(xaxis_title="Station Layout",
                  yaxis_title="Number of Stations",
                  template="plotly_white")

fig.show()

In [None]:
import pandas as pd

# Toy line names used to demonstrate lower-casing and de-duplication.
# The frame gets its own name so the station table held in `metro_data` is not
# replaced by a one-column example; cells that read other `metro_data` columns
# (e.g. 'Station Name') would otherwise fail.
example_metro_data = pd.DataFrame(
    {'Line': ['BlueLine', 'GreenLine', 'PurpleLine', 'BlueLine', 'OrangeLine']}
)

# Normalise the line names to lowercase
example_metro_data['Line'] = example_metro_data['Line'].str.lower()

# Distinct line names, in order of first appearance
metro_lines = example_metro_data['Line'].unique()
print(metro_lines)


In [None]:
# Wrap the `metro_lines` array in a Series so that value_counts() can tally it.
metro_lines_series = pd.Series(metro_lines)

# Tally each name and lay the result out as a two-column table.
# NOTE(review): when `metro_lines` is the output of .unique(), every name occurs
# once, so every count here is 1 — confirm that is the intended input.
name_tally = metro_lines_series.value_counts()
metro_line_counts = name_tally.reset_index(name='Metro_Line_Count')
metro_line_counts = metro_line_counts.set_axis(['Metro Line', 'Metro_Line_Count'], axis=1)

# Show the table
print(metro_line_counts)

In [None]:
import pandas as pd
import plotly.express as px

# Hard-coded sample of line names (with repeats) used to demonstrate the chart
metro_lines = ['Blue Line', 'Green Line', 'Purple Line', 'Orange Line', 'Blue Line', 'Purple Line']

# Tally the names into a (Metro Line, Metro_Line_Count) table
metro_line_counts = (
    pd.Series(metro_lines, name='Metro Line')
    .value_counts()
    .reset_index(name='Metro_Line_Count')
)
metro_line_counts.columns = ['Metro Line', 'Metro_Line_Count']

# One bar per line: names on x, counts on y, each line in its own colour
chart_options = dict(
    x='Metro Line',
    y='Metro_Line_Count',
    title='Count Of Number Of Metro Station On Each Line',
    color='Metro Line',
)
fig = px.bar(metro_line_counts, **chart_options)

# Render the chart
fig.show()

In [None]:
import matplotlib.pyplot as plt

# Colour assigned to each metro line
line_colors = {
    'Orange Line': 'orange',
    'Purple Line': 'purple',
    'Green Line': 'green',
    'Blue Line': 'blue'
}

# Number of rows per line in the 'Line' column
line_counts = metro_data['Line'].value_counts()

# Colour for each slice, in the same order as the slices. The mapping was
# previously defined but never handed to plt.pie, so the slices used
# Matplotlib's default colours. If any line name is missing from the mapping,
# fall back to the default colour cycle rather than guessing a colour.
slice_colors = [line_colors.get(line_name) for line_name in line_counts.index]
if None in slice_colors:
    slice_colors = None

# Pie chart of the share of each line
plt.figure(figsize=(8, 8))
plt.pie(
    line_counts,
    labels=line_counts.index,
    colors=slice_colors,
    autopct='%1.1f%%',  # percentage label with one decimal place
    startangle=4
)

# Equal aspect ratio keeps the pie circular
plt.axis('equal')

plt.title('Metro Line Distribution')

plt.show()

In [None]:
# Bar chart of how many rows each line has in metro_data
stations_by_line = metro_data['Line'].value_counts()
bar_ax = stations_by_line.plot(kind='bar')
bar_ax.set_xlabel('Line')
bar_ax.set_ylabel('Number of Stations')
bar_ax.set_title('Number of Stations per Metro Line')
# tilt the line names so they do not overlap
plt.setp(bar_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

In [None]:
#Passenger Flow Prediction (Time Series Analysis)

In [None]:
import pandas as pd

# Passenger-flow table read from CSV
PASSENGER_FLOW_CSV = "Kolkata_metro_passanger_flow.csv"
passenger_data = pd.read_csv(PASSENGER_FLOW_CSV)

# Preview the first rows
passenger_preview = passenger_data.head()
print(passenger_preview)


In [None]:
# Total passenger count per station, largest first
passengers_by_station = passenger_data.groupby("Station Name")["Passenger Count"].sum()
busiest_stations = passengers_by_station.sort_values(ascending=False)

print("Top 5 Busiest Metro Stations:")
top_five = busiest_stations.head(5)
print(top_five)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar height = sum of 'Passenger Count' over all rows of a line; no error bars
line_fig, line_ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=passenger_data,
    x="Line",
    y="Passenger Count",
    estimator=sum,
    errorbar=None,
    ax=line_ax,
)
line_ax.set_title("Total Passenger Count per Metro Line")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Spread of passenger counts within each 'Time of Day' category
tod_fig, tod_ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=passenger_data, x="Time of Day", y="Passenger Count", ax=tod_ax)
tod_ax.set_title("Metro Passenger Count at Different Times of Day")
plt.show()

In [None]:
import pandas as pd
import folium

# (latitude, longitude) for each station name.
# NOTE(review): these coordinates are hard-coded here rather than taken from a
# data file — verify them before relying on marker positions.
station_locations = {
    "Kavi Subhash": (22.471, 88.3773), "Shahid Khudiram": (22.4673, 88.3685), "Kavi Nazrul": (22.4617, 88.3619),
    "Gitanjali": (22.4515, 88.3525), "Masterda Surya Sen": (22.4392, 88.3412), "Netaji": (22.427, 88.33),
    "Mahanayak Uttam Kumar": (22.4148, 88.319), "Rabindra Sarobar": (22.4036, 88.308), "Kalighat": (22.3945, 88.301),
    "Jatin Das Park": (22.386, 88.295), "Netaji Bhavan": (22.377, 88.288), "Rabindra Sadan": (22.37, 88.283),
    "Maidan": (22.362, 88.278), "Park Street": (22.356, 88.274), "Esplanade": (22.572, 88.325),
    "Chandni Chowk": (22.344, 88.266), "Central": (22.338, 88.262), "Mahatma Gandhi Road": (22.332, 88.258),
    "Girish Park": (22.326, 88.254), "Shobhabazar Sutanuti": (22.32, 88.25), "Shyambazar": (22.314, 88.246),
    "Belgachia": (22.308, 88.242), "Dum Dum": (22.3, 88.238), "Noapara": (22.294, 88.232),
    "Dakshineswar": (22.29, 88.22), "Baranagar": (22.284, 88.215), "Howrah Maidan": (22.59, 88.34),
    "Howrah": (22.586, 88.337), "Mahakaran": (22.58, 88.33), "Sealdah": (22.5664, 88.3639),
    "Sector V": (22.573, 88.4273), "Karunamoyee": (22.5792, 88.4206), "Central Park": (22.5821, 88.413),
    "City Centre": (22.585, 88.4051), "Bidhannagar": (22.5895, 88.3928), "Phoolbagan": (22.5941, 88.3798),
    "Joka": (22.4467, 88.3156), "Thakurpukur": (22.4544, 88.311), "Sakherbazar": (22.4623, 88.3058),
    "Behala Chowrasta": (22.4692, 88.301), "Behala Bazar": (22.4756, 88.296), "Majerhat": (22.5165, 88.2958),
    "New Garia": (22.4617, 88.3619), "Kalikapur": (22.4585, 88.352), "Hemanta Mukherjee": (22.456, 88.3465)
}

# Marker radius range in pixels: the busiest station gets MAX_MARKER_RADIUS and
# the others are scaled in proportion, so the map does not depend on the
# absolute size of the counts.
MIN_MARKER_RADIUS = 3
MAX_MARKER_RADIUS = 25

# Re-read the CSV so the merge below starts from the raw columns every time the
# cell is run (merging twice would create suffixed duplicate columns).
passenger_data = pd.read_csv("Kolkata_metro_passanger_flow.csv")

# Attach Latitude / Longitude to every passenger row by station name
station_df = (
    pd.DataFrame.from_dict(station_locations, orient="index", columns=["Latitude", "Longitude"])
    .reset_index()
    .rename(columns={"index": "Station Name"})
)
passenger_data = passenger_data.merge(station_df, on="Station Name", how="left")

# Say which stations could not be located instead of dropping them silently
unlocated = passenger_data.loc[passenger_data["Latitude"].isna(), "Station Name"].dropna().unique()
if len(unlocated):
    print(f"No coordinates for {len(unlocated)} station(s); left off the map: {sorted(map(str, unlocated))}")

# Keep only rows that can be placed on the map
passenger_data = passenger_data.dropna(subset=["Latitude", "Longitude"])

# One marker per station: add up all of a station's rows first. Drawing a
# marker per CSV row would stack several circles on the same point and show a
# per-row count in the popup.
station_totals = (
    passenger_data
    .groupby(["Station Name", "Latitude", "Longitude"], as_index=False)["Passenger Count"]
    .sum()
)
busiest_total = station_totals["Passenger Count"].max()

# Base map centred on Kolkata
metro_map = folium.Map(location=[22.5726, 88.3639], zoom_start=12)

for station_name, lat, lon, total in station_totals.itertuples(index=False, name=None):
    # Guard against an all-zero table, where proportional scaling is undefined
    share = total / busiest_total if busiest_total > 0 else 0
    folium.CircleMarker(
        location=[lat, lon],
        radius=MIN_MARKER_RADIUS + (MAX_MARKER_RADIUS - MIN_MARKER_RADIUS) * share,
        popup=f"{station_name}: {total} passengers",
        color="blue", fill=True, fill_color="blue"
    ).add_to(metro_map)

# Show map
metro_map

In [None]:
# Create a synthetic daily passenger dataset and save it to CSV.
import pandas as pd
import numpy as np

# Fixed seed so the generated file, and every plot and forecast built from it,
# is the same on each run.
SYNTHETIC_SEED = 42
rng = np.random.default_rng(SYNTHETIC_SEED)

# One year of daily dates
date_range = pd.date_range(start="2023-01-01", periods=365, freq='D')

# Stations to generate data for
stations = ["Dum Dum", "Esplanade", "Sealdah", "Kavi Subhash", "Howrah Maidan", "Salt Lake Sector V"]

# Every (date, station) pair, dates outermost — the same row order a nested
# "for date / for station" loop would produce.
date_station_pairs = pd.MultiIndex.from_product([date_range, stations], names=["Date", "Station"])
df = date_station_pairs.to_frame(index=False)

# Uniform random daily count per row: 40000 inclusive to 120000 exclusive
df["Passenger Count"] = rng.integers(40000, 120000, size=len(df))

df.to_csv("metro_passengers.csv", index=False)

print("Synthetic dataset created!")

In [None]:
# Load the generated passenger file and plot the network-wide daily total.
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV back, parsing the Date column as datetimes
df = pd.read_csv("metro_passengers.csv", parse_dates=["Date"])

# Sum all stations for each day
daily_totals = df.groupby("Date", as_index=False)["Passenger Count"].sum()

# Line chart of the daily totals
trend_fig, trend_ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=daily_totals, x="Date", y="Passenger Count", ax=trend_ax)
trend_ax.set_title("Kolkata Metro Daily Passenger Trends (Synthetic Data)")
trend_ax.set_xlabel("Date")
trend_ax.set_ylabel("Total Passengers")
plt.show()


In [None]:
# Ridership forecast with ARIMA (AutoRegressive Integrated Moving Average),
# a time series forecasting model.
from statsmodels.tsa.arima.model import ARIMA

# Network-wide total per day
df_daily = df.groupby("Date")["Passenger Count"].sum().reset_index()

# Fit ARIMA with order (5, 1, 0) to the daily totals
model = ARIMA(df_daily["Passenger Count"], order=(5,1,0))
model_fit = model.fit()

# Forecast the next 30 days
future_steps = 30
forecast = model_fit.forecast(steps=future_steps)

# The first forecast step is the day AFTER the last observation, so the
# forecast dates must start one day past the final historical date. Starting
# on the last historical date would draw the whole forecast a day early.
forecast_start = df_daily["Date"].max() + pd.Timedelta(days=1)
forecast_dates = pd.date_range(start=forecast_start, periods=future_steps, freq='D')

# Plot history and forecast together
plt.figure(figsize=(12, 5))
plt.plot(df_daily["Date"], df_daily["Passenger Count"], label="Historical Data")
plt.plot(forecast_dates, forecast, label="Predicted Data", linestyle="dashed")
plt.legend()
plt.title("Kolkata Metro Passenger Forecast (Next 30 Days)")
plt.xlabel("Date")
plt.ylabel("Passenger Count")
plt.show()

In [None]:
# Network graph analysis: model the metro as a graph and find a shortest route.
import networkx as nx

# Read the station table fresh so this cell needs all of its columns
# ('Station Name', 'Line', distance) and a complete set of rows, independent of
# any filtering or reassignment of `metro_data` in earlier cells.
network_stations = pd.read_csv("Kolkata Metro Network.csv")

DISTANCE_COL = "Distance from Start (km)"

G = nx.Graph()

# Connect stations line by line. Linking consecutive rows of the whole table
# would also join the last station of one line to the first station of the
# next line, which is not a real track connection.
for line_name, line_stations in network_stations.groupby("Line"):
    # NOTE(review): assumes each line is a single unbranched route and that the
    # distance column increases along it — confirm against the data.
    ordered = line_stations.sort_values(DISTANCE_COL)
    names = ordered["Station Name"].tolist()
    offsets = ordered[DISTANCE_COL].tolist()

    # A station name that appears on several lines is a single node (this is
    # what lets a path change lines); its `line` attribute keeps the last line seen.
    for station_name in names:
        G.add_node(station_name, line=line_name)

    # Edge weight = track distance between neighbouring stations
    for i in range(len(names) - 1):
        G.add_edge(names[i], names[i + 1], distance=offsets[i + 1] - offsets[i])

# Shortest route by distance. `weight` must name an attribute that exists on
# the edges; an attribute name that no edge carries makes every edge count as
# 1, i.e. the search minimises the number of hops instead.
nx.shortest_path(G, source="Dum Dum", target="Kavi Subhash", weight="distance")

In [None]:
# Peak-hour congestion: heatmap of example station loads at two times of day.
import seaborn as sns
import matplotlib.pyplot as plt

# Illustrative counts for three stations at 8 AM and 6 PM
data = pd.DataFrame({
    'Station': ['Dum Dum', 'Esplanade', 'Garia'],
    '8 AM': [2000, 5000, 1500],
    '6 PM': [2500, 6000, 1800],
})

# Stations become the heatmap rows, the time columns become its columns
congestion_by_station = data.set_index('Station')
sns.heatmap(congestion_by_station, cmap="Reds", annot=True)
plt.show()


In [None]:
# Metro revenue prediction from ridership.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Example dataset: ridership vs revenue
data = pd.DataFrame({
    'Ridership': [1000, 5000, 10000, 15000, 20000, 25000, 30000, 35000],
    'Revenue': [20000, 100000, 200000, 300000, 400000, 500000, 600000, 700000]
})

# Feature (X) and target (y)
X = data[['Ridership']]
y = data['Revenue']

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear model. The ridership values predicted below lie beyond the
# largest training value, and a random forest cannot extrapolate: its output is
# bounded by the revenues seen in training, so all three predictions would be
# nearly identical.
model = LinearRegression()
model.fit(X_train, y_train)

# Score on the held-out rows, which were previously split off but never used
print(f"R^2 on held-out rows: {model.score(X_test, y_test):.3f}")

# Ridership levels to predict for. A DataFrame with the training column name
# avoids scikit-learn's feature-name warning, so no blanket
# warnings.filterwarnings("ignore") is needed.
future_ridership = pd.DataFrame({'Ridership': [40000, 45000, 50000]})
predicted_revenue = model.predict(future_ridership)

# Plot the known points and the predictions
plt.figure(figsize=(10, 5))
plt.scatter(X, y, color='blue', label="Actual Revenue (Training Data)")
plt.plot(future_ridership['Ridership'], predicted_revenue, color='red', linestyle="dashed", marker="o", label="Predicted Revenue")
plt.xlabel("Ridership (No. of Passengers)")
plt.ylabel("Revenue (₹)")
plt.title("Kolkata Metro Revenue Prediction 🚇")
plt.legend()
plt.grid(True)
plt.show()

# Print the predictions
for ridership, revenue in zip(future_ridership['Ridership'], predicted_revenue):
    print(f"Predicted Revenue for {ridership} passengers: ₹{round(revenue, 2)}")


In [None]:
# Metro delay prediction from weather and maintenance flags.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Example dataset: features (Weather, Maintenance) -> target (Delayed).
# NOTE(review): rows 0 and 2 share the features (1, 0) but have different
# labels, and so do rows 1 and 3 with (0, 1), so the features carry no
# information about the target here. Replace with real data before
# interpreting the prediction.
data = pd.DataFrame({
    'Weather': [1, 0, 1, 0],  # 1 = Bad weather, 0 = Good weather
    'Maintenance': [0, 1, 0, 1],  # 1 = Under maintenance, 0 = No maintenance
    'Delayed': [1, 1, 0, 0]  # 1 = Delayed, 0 = Not Delayed
})

X = data[['Weather', 'Maintenance']]
y = data['Delayed']

# Fixed random state so the fitted forest is the same on every run
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Scenario to classify: bad weather, no maintenance. Passing a DataFrame with
# the training column names (rather than a bare nested list) keeps the feature
# order explicit and avoids scikit-learn's feature-name warning.
scenario = pd.DataFrame({'Weather': [1], 'Maintenance': [0]})
prediction = model.predict(scenario)

# Output result
print(f"Predicted Delay: {'Yes (1)' if prediction[0] == 1 else 'No (0)'}")