In [1]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
import networkx as nx
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load data
df_stations = pd.read_csv('mrt.csv')
df_od_am = pd.read_csv('od_pm_peak.csv')
# df_od_am = pd.read_csv('od_pm_peak.csv')

# Bersihkan ID
df_stations['STN_NO'] = df_stations['STN_NO'].str.strip()
df_od_am.columns = df_od_am.columns.str.strip()
df_od_am['ORIGIN_PT_CODE'] = df_od_am['ORIGIN_PT_CODE'].str.strip()

# Transform OD matrix ke long format
df_od_long = df_od_am.melt(id_vars='ORIGIN_PT_CODE', var_name='destination', value_name='volume')
df_od_long.rename(columns={'ORIGIN_PT_CODE': 'origin'}, inplace=True)
df_od_long = df_od_long[df_od_long['volume'] > 0]

FileNotFoundError: [Errno 2] No such file or directory: 'mrt.csv'

In [3]:
G = nx.Graph()

# Buat kolom berisi daftar sub-ID
df_stations['sub_ids'] = df_stations['STN_NO'].apply(lambda x: [s.strip() for s in x.split('/')])

# Tambahkan node per sub-ID
for _, row in df_stations.iterrows():
    for sid in row['sub_ids']:
        G.add_node(sid, lat=row['Latitude'], lon=row['Longitude'], line=row['COLOR'])

# Tambahkan edge antar sub-ID jika interchange
for _, row in df_stations.iterrows():
    if len(row['sub_ids']) > 1:
        subs = row['sub_ids']
        for i in range(len(subs)):
            for j in range(i + 1, len(subs)):
                G.add_edge(subs[i], subs[j], weight=0.01)  # Virtual interchange connection

In [4]:
# Gabungkan koordinat
station_coords = df_stations.set_index('STN_NO')[['Latitude', 'Longitude']].to_dict('index')

def calc_distance(row):
    o = station_coords.get(row['origin'])
    d = station_coords.get(row['destination'])
    if o and d:
        return geodesic((o['Latitude'], o['Longitude']), (d['Latitude'], d['Longitude'])).km
    else:
        return np.nan

df_od_long['distance_km'] = df_od_long.apply(calc_distance, axis=1)
df_od_long = df_od_long.dropna()

In [5]:
# Threshold: volume tinggi dan jarak jauh
threshold_vol = df_od_long['volume'].quantile(0.90)
threshold_dist = 8  # km

df_candidates = df_od_long[
    (df_od_long['volume'] > threshold_vol) &
    (df_od_long['distance_km'] > threshold_dist)
].copy()

In [17]:
G = nx.Graph()

# # Tambahkan node
for _, row in df_stations.iterrows():
    G.add_node(row['STN_NO'], lat=row['Latitude'], lon=row['Longitude'], line=row['COLOR'])

# Tambahkan edge jika ada field adjacency (jika tidak, ini bisa dikustom manual)
# Contoh: tambahkan edge antar stasiun berdasarkan urutan nama jalur
# df_stations['sub_ids'] = df_stations['STN_NO'].apply(lambda x: [s.strip() for s in x.split('/')])
lines = df_stations['COLOR'].unique()

# G = nx.Graph()

# Tambahkan semua sub-ID sebagai node
for _, row in df_stations.iterrows():
    for sid in row['sub_ids']:
        G.add_node(sid, lat=row['Latitude'], lon=row['Longitude'], line=row['COLOR'])

# Tambahkan edge antar sub-id jika satu stasiun memiliki lebih dari satu ID (interchange)
for _, row in df_stations.iterrows():
    sub_ids = row['sub_ids']
    if len(sub_ids) > 1:
        for i in range(len(sub_ids)):
            for j in range(i+1, len(sub_ids)):
                G.add_edge(sub_ids[i], sub_ids[j], weight=0.01)  # edge virtual interchange

for line in df_stations['COLOR'].unique():
    subset = df_stations[df_stations['COLOR'] == line].sort_values(by='STN_NAME')
    station_lists = subset['sub_ids'].tolist()
    for i in range(len(station_lists)-1):
        for a in station_lists[i]:
            for b in station_lists[i+1]:
                G.add_edge(a, b, weight=1.0)

centrality = nx.betweenness_centrality(G, weight='weight')
closeness = nx.closeness_centrality(G, distance='weight')
nx.set_node_attributes(G, centrality, 'betweenness')
nx.set_node_attributes(G, closeness, 'closeness')

df_centrality = pd.DataFrame({
    'station_id': list(centrality.keys()),
    'betweenness': list(centrality.values()),
    'closeness': [closeness[k] for k in centrality.keys()]
})

# Top 5
top_betweenness = df_centrality.sort_values(by='betweenness', ascending=False).head(5)
top_closeness = df_centrality.sort_values(by='closeness', ascending=False).head(5)

print("Top 5 Betweenness Centrality:")
print(top_betweenness)

print("\nTop 5 Closeness Centrality:")
print(top_closeness)

Top 5 Betweenness Centrality:
   station_id  betweenness  closeness
91        EW6     0.017277   0.021984
80       EW26     0.017277   0.021984
65       EW11     0.017195   0.021857
64       EW10     0.017195   0.021857
63        EW1     0.016454   0.021129

Top 5 Closeness Centrality:
    station_id  betweenness  closeness
99        NE13     0.003702   0.023674
207        NE7     0.001645   0.023670
208       DT12     0.001645   0.023670
216       NS27     0.004319   0.023270
217        CE2     0.004319   0.023270


In [28]:
import pandas as pd
import networkx as nx

# Memuat data dari file CSV
df = pd.read_csv('od_pm_peak.csv')

# Menampilkan 5 baris pertama dan informasi kolom untuk memahami struktur data
print("5 baris pertama dari dataframe:")
print(df.head())

print("\nInformasi dataframe:")
print(df.info())

# Membuat Directed Graph (graf berarah)
G = nx.DiGraph()

# Mengumpulkan nama-nama node asal (origin) dari kolom 'ORIGIN_PT_CODE'
origin_nodes = df['ORIGIN_PT_CODE'].tolist()

# Mengumpulkan nama-nama node tujuan (destination) dari semua kolom lainnya
destination_columns = df.columns[1:]

# Menambahkan semua node ke graf
G.add_nodes_from(origin_nodes)
G.add_nodes_from(destination_columns)

# Menambahkan edge (tepi/sisi) ke graf dengan bobot
# Setiap sel (origin, destination) dengan nilai > 0 dianggap sebagai edge
for index, row in df.iterrows():
    origin = row['ORIGIN_PT_CODE']
    for destination_col in destination_columns:
        weight = row[destination_col]
        if weight > 0:
            G.add_edge(origin, destination_col, weight=weight)

# Menghitung Betweenness Centrality
# Bobot edge digunakan dalam perhitungan. Edge dengan bobot lebih tinggi akan memiliki dampak lebih besar.
betweenness_centrality = nx.betweenness_centrality(G, weight='weight')

# Menghitung Closeness Centrality
# Closeness centrality dihitung pada komponen yang sangat terhubung (strongly connected component).
# Jika graf tidak sangat terhubung, maka akan dihitung pada komponen sangat terhubung terbesar.
if not nx.is_strongly_connected(G):
    print("\nGraf tidak sangat terhubung. Menghitung closeness centrality pada komponen sangat terhubung terbesar.")
    scc = list(nx.strongly_connected_components(G))
    if scc:
        largest_scc = G.subgraph(max(scc, key=len))
        closeness_centrality = nx.closeness_centrality(largest_scc, distance='weight')
    else:
        closeness_centrality = {} # Tidak ada SCC yang ditemukan
else:
    closeness_centrality = nx.closeness_centrality(G, distance='weight')

# Mengubah hasil ke DataFrame untuk tampilan yang lebih baik
betweenness_df = pd.DataFrame(betweenness_centrality.items(), columns=['Node', 'Betweenness Centrality'])
closeness_df = pd.DataFrame(closeness_centrality.items(), columns=['Node', 'Closeness Centrality'])

# Mengurutkan hasil dari yang tertinggi ke terendah
betweenness_df = betweenness_df.sort_values(by='Betweenness Centrality', ascending=False)
closeness_df = closeness_df.sort_values(by='Closeness Centrality', ascending=False)

# Menampilkan 10 node teratas berdasarkan Betweenness Centrality
print("\n10 Node Teratas Berdasarkan Betweenness Centrality:")
print(betweenness_df.head(5))

# Menampilkan 10 node teratas berdasarkan Closeness Centrality
print("\n10 Node Teratas Berdasarkan Closeness Centrality:")
print(closeness_df.head(5))

5 baris pertama dari dataframe:
  ORIGIN_PT_CODE      BP10      BP11      BP12      BP13       BP2       BP3  \
0           BP10  0.000000  0.049985  0.046155  0.030837  0.026806  0.028016   
1           BP11  0.035133  0.000000  0.026646  0.024771  0.022205  0.023882   
2           BP12  0.044909  0.030683  0.000000  0.020177  0.027243  0.031427   
3           BP13  0.061707  0.054491  0.028116  0.000000  0.030356  0.032346   
4            BP2  0.032347  0.024635  0.020951  0.012202  0.000000  0.038103   

        BP4       BP5   BP6/DT1  ...      TE26      TE27      TE28      TE29  \
0  0.013907  0.038194  0.331150  ...  0.000000  0.000000  0.000000  0.000101   
1  0.022797  0.063061  0.375308  ...  0.000099  0.000000  0.000000  0.000000   
2  0.024826  0.044258  0.400186  ...  0.000000  0.000000  0.000000  0.000000   
3  0.032595  0.019905  0.276935  ...  0.000000  0.000000  0.000000  0.000000   
4  0.019339  0.039599  0.270749  ...  0.000230  0.000115  0.000115  0.000000   

      

In [8]:
inflow = df_od_long.groupby('destination')['volume'].sum()
outflow = df_od_long.groupby('origin')['volume'].sum()

df_stations['inflow'] = df_stations['STN_NO'].map(inflow).fillna(0)
df_stations['outflow'] = df_stations['STN_NO'].map(outflow).fillna(0)
df_stations['betweenness'] = df_stations['STN_NO'].map(centrality).fillna(0)
df_stations['closeness'] = df_stations['STN_NO'].map(closeness).fillna(0)

In [9]:
X = df_stations[['Latitude', 'Longitude', 'inflow', 'outflow', 'betweenness', 'closeness']]
X_scaled = StandardScaler().fit_transform(X)

kmeans = KMeans(n_clusters=5, random_state=42)
df_stations['cluster'] = kmeans.fit_predict(X_scaled)

In [10]:
cluster_map = df_stations.set_index('STN_NO')['cluster'].to_dict()
df_od_long['cluster_o'] = df_od_long['origin'].map(cluster_map)
df_od_long['cluster_d'] = df_od_long['destination'].map(cluster_map)

df_interzone = df_od_long[df_od_long['cluster_o'] != df_od_long['cluster_d']]
df_corridors = df_interzone.groupby(['cluster_o', 'cluster_d'])['volume'].sum().reset_index()

In [19]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# 1. Muat dataset
df = pd.read_csv('od_am_peak.csv')

# Simpan ORIGIN_PT_CODE secara terpisah sebelum menghapus untuk pengelompokan
origin_codes = df['ORIGIN_PT_CODE']

# Hapus kolom 'ORIGIN_PT_CODE' karena bukan fitur numerik untuk pengelompokan
X = df.drop('ORIGIN_PT_CODE', axis=1)

# Standarisasi data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2. Tentukan jumlah klaster optimal (K) menggunakan Metode Elbow
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Berdasarkan plot metode elbow dari eksekusi sebelumnya, K=4 dipilih sebagai optimal.
# 3. Terapkan pengelompokan K-Means dengan K=4
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

# Tambahkan label klaster ke DataFrame asli
df['Cluster'] = cluster_labels

# 4. Tampilkan data yang dikelompokkan
clustered_stations = df[['ORIGIN_PT_CODE', 'Cluster']]
print("\nStasiun yang Dikaster (5 baris pertama):")
print(clustered_stations.head())

# Simpan stasiun yang dikelompokkan ke file CSV
clustered_stations.to_csv('clustered_stations.csv', index=False)

# Analisis karakteristik setiap klaster untuk rekomendasi rute
# Kelompokkan berdasarkan klaster dan hitung rata-rata fitur numerik (titik tujuan)
# Kita perlu mengecualikan 'ORIGIN_PT_CODE' dari perhitungan rata-rata ini, karena itu adalah tipe objek.
# Kita akan menggunakan X asli (fitur numerik) dan menambahkan label klaster ke dalamnya.
X_clustered = X.copy()
X_clustered['Cluster'] = cluster_labels
cluster_centers = X_clustered.groupby('Cluster').mean()

num_top_destinations = 5
route_recommendations = {}

for cluster_id in sorted(df['Cluster'].unique()):
    # Dapatkan tujuan (kolom tidak termasuk 'Cluster' dari cluster_centers)
    cluster_data = cluster_centers.loc[cluster_id]
    top_destinations = cluster_data.nlargest(num_top_destinations)
    route_recommendations[f'Cluster {cluster_id}'] = top_destinations.index.tolist()

print("\nRekomendasi Rute per Klaster:")
for cluster, destinations in route_recommendations.items():
    print(f"{cluster}: {', '.join(destinations)}")


Stasiun yang Dikaster (5 baris pertama):
  ORIGIN_PT_CODE  Cluster
0           BP10        2
1           BP11        2
2           BP12        2
3           BP13        2
4            BP2        2

Rekomendasi Rute per Klaster:
Cluster 0: NE1/CC29, EW14/NS26, DT17, CC24, NE17/PTC
Cluster 1: DT17, DT18, CG1/DT35, NE4/DT19, EW2/DT32
Cluster 2: EW14/NS26, BP6/DT1, EW24/NS1, EW15, EW27
Cluster 3: TE19, NS21/DT11, EW14/NS26, DT17, TE14/NS22


In [25]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# 1. Muat dataset
df = pd.read_csv('od_pm_peak.csv')

# Simpan 'ORIGIN_PT_CODE' sebelum dihapus dari data yang akan di-cluster
origin_pt_codes = df['ORIGIN_PT_CODE']

# Drop kolom 'ORIGIN_PT_CODE' karena bukan fitur numerik untuk clustering
X = df.drop('ORIGIN_PT_CODE', axis=1)

# Standarisasi data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2. Terapkan K-Means clustering (menggunakan K=4 seperti yang ditentukan sebelumnya dari metode elbow)
# Karena sudah ada hasil plot elbow method sebelumnya, kita langsung pakai K=4
kmeans = KMeans(n_clusters=7, init='k-means++', random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

# Tambahkan label klaster ke DataFrame asli
df['Cluster'] = cluster_labels

# 3. Format data klaster agar sesuai dengan gambar
# Buat list untuk menyimpan data yang diformat
clustered_data_list = []

# Iterasi melalui setiap klaster
for cluster_id in sorted(df['Cluster'].unique()):
    # Dapatkan semua ORIGIN_PT_CODE untuk klaster saat ini
    stations_in_cluster = df[df['Cluster'] == cluster_id]['ORIGIN_PT_CODE'].tolist()
    # Gabungkan nama stasiun dengan koma
    stations_str = ','.join(stations_in_cluster)
    # Tambahkan ke list dengan format {Cluster ID: ..., Stations: ...}
    clustered_data_list.append({'Cluster ID': cluster_id, 'Stations': stations_str})

# Buat DataFrame dari list yang diformat
clustered_stations_df = pd.DataFrame(clustered_data_list)

# 4. Tampilkan DataFrame hasil pengelompokan
print("Data Stasiun yang Dikaster (K-Means):")
print(clustered_stations_df.to_string(index=False)) # Menggunakan to_string(index=False) untuk mencetak tanpa indeks DataFrame Pandas

# Anda juga bisa menyimpannya ke CSV jika diperlukan
# clustered_stations_df.to_csv('clustered_stations_kmeans_formatted.csv', index=False

Data Stasiun yang Dikaster (K-Means):
 Cluster ID                                                                                                                                                                                                                                                                                                                                            Stations
          0 CC10/DT26,CC11,CC12,CC14,CC16,CC2,CC20,CC21,CC23,CC24,CC25,CC26,CC27,CC28,CC3,CC5,CC6,CC7,CC8,CG2,EW1,EW10,EW11,EW12/DT14,EW14/NS26,EW15,EW16/NE3/TE17,EW21/CC22,EW3,EW4,EW5,EW6,EW7,EW8/CC9,EW9,NE1/CC29,NE4/DT19,NS10,NS11,NS12,NS13,NS14,NS15,NS16,NS17/CC15,NS18,NS19,NS20,NS23,NS24/NE6/CC1,NS25/EW13,NS27/CE2/TE20,NS28,NS8,NS9/TE2,TE14/NS22
          1                                                                                                                                                          NE10,NE11,NE12/CC13,NE13,NE14,NE15,NE16/STC,NE17/PTC,NE18,NE5,NE7/DT12,NE8,NE9,PE1,PE2,PE3,PE

In [57]:
import pandas as pd
from geopy.distance import geodesic

# Load data
df_station = pd.read_csv('mrt.csv')
df_od_matrix = pd.read_csv('od_am_peak.csv')

# Bersihkan data
df_station['STN_NO'] = df_station['STN_NO'].str.strip()
df_od_matrix.columns = df_od_matrix.columns.str.strip()

# Buat posisi stasiun
station_pos = {
    row['STN_NO']: (row['Latitude'], row['Longitude'])
    for _, row in df_station.iterrows()
}

# Transform OD matrix ke long format
df_od_long = df_od_matrix.melt(id_vars='ORIGIN_PT_CODE', var_name='DESTINATION_PT_CODE', value_name='volume')
df_od_long.columns = ['origin', 'destination', 'volume']
df_od_long = df_od_long[df_od_long['volume'].astype(float) > 0]

# Hitung jarak
def compute_distance(row):
    try:
        return geodesic(station_pos[row['origin']], station_pos[row['destination']]).km
    except:
        return None

df_od_long['distance_km'] = df_od_long.apply(compute_distance, axis=1)
df_od_long.dropna(subset=['distance_km'], inplace=True)

# Threshold kandidat ekspres
volume_threshold = df_od_long['volume'].astype(float).quantile(0.95)
distance_threshold = df_od_long['distance_km'].quantile(0.75)

# Filter
df_express = df_od_long[
    (df_od_long['volume'].astype(float) >= volume_threshold) &
    (df_od_long['distance_km'] >= distance_threshold)
]

# Tampilkan kandidat jalur ekspres
print(df_express.sort_values(by='volume', ascending=False).head(10))

         origin destination    volume  distance_km
12073       EW1        EW15  0.067250    15.688308
7106    BP6/DT1        DT17  0.066644    14.979302
11940      NS12   EW14/NS26  0.063671    17.744554
11941      NS13   EW14/NS26  0.054048    16.172449
11908      EW26   EW14/NS26  0.052564    15.970833
11942      NS14   EW14/NS26  0.051001    14.877752
23562      EW31         NS7  0.045901    17.057242
6677       NS28         CG2  0.045652    16.892632
1490   CE1/DT16     BP6/DT1  0.044259    15.268505
11684       CG2   EW12/DT14  0.044133    16.042205
