In [42]:
import pandas as pd
import numpy as np
import re

In [28]:
df = pd.read_csv('mtc_bus_list_with_route.csv')

In [29]:
df = df.dropna(how='all')

In [30]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')


In [31]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
df['bus_number'] = df['bus_number'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))

In [32]:
df['via'] = df['via'].fillna('')
df['via_stops'] = df['via'].apply(lambda x: x.split(',') if x else [])
df['num_stops'] = df['via_stops'].apply(len)

In [33]:
route_types = ['high_frequency_route', 'night_service_route', 'low_frequency_route']
for col in route_types:
    df[col] = df[col].map({'x': 1, '': 0}).fillna(0).astype(int)

In [34]:
def standardize_location(location):
    if isinstance(location, str):
        # Convert to lowercase and strip spaces
        return location.strip().lower()
    else:
        return 'Unknown'

In [35]:
if 'starting_point' in df.columns and 'ending_point' in df.columns:

    df['starting_point'] = df['starting_point'].fillna('Unknown')
    df['ending_point'] = df['ending_point'].fillna('Unknown')
    df['starting_point'] = df['starting_point'].apply(standardize_location)
    df['ending_point'] = df['ending_point'].apply(standardize_location)
    print(df[['starting_point', 'ending_point']].head())
else:
    print("Either 'starting_point' or 'ending_point' columns do not exist in the DataFrame.")

  starting_point   ending_point
0  thiruvottiyur  thiruvanmiyur
1  thiruvottiyur    poonamallee
2       tollgate  saidapet west
3  ekkaduthangal       broadway
4       broadway        t.nagar


In [36]:
df = df.fillna('Unknown')

In [37]:
df['route_id'] = df.index

In [38]:
df['num_stops'] = df['num_stops'].astype(int)

In [43]:
df['rainfall_index'] = np.random.randint(0, 101, size=len(df))

In [39]:
column_order = ['route_id', 'bus_number', 'starting_point', 'ending_point', 'via', 
                'via_stops', 'num_stops', 'high_frequency_route', 'night_service_route', 
                'low_frequency_route']
df = df[column_order]

In [44]:
df.to_csv('bus_processed_output.csv', index=False)

In [40]:
print(df.head())
print(df.info())

   route_id bus_number starting_point   ending_point  \
0         0          1  thiruvottiyur  thiruvanmiyur   
1         1        101  thiruvottiyur    poonamallee   
2         2        10A       tollgate  saidapet west   
3         3        10E  ekkaduthangal       broadway   
4         4         11       broadway        t.nagar   

                                                 via  \
0  Adyar, Mylapore, Royapettah, Parry's Corner, K...   
1  Tollgate, Central, KMC, Mathuravoyal, Karayanc...   
2  Kal mandapam, Parrys, Central R.S, Egmore R.S,...   
3  Central R.S, Egmore R.S, Maternity Hospital, D...   
4  Panagal park, Vani mahal, Thousand Lights, TVS...   

                                           via_stops  num_stops  \
0  [Adyar,  Mylapore,  Royapettah,  Parry's Corne...          8   
1  [Tollgate,  Central,  KMC,  Mathuravoyal,  Kar...          5   
2  [Kal mandapam,  Parrys,  Central R.S,  Egmore ...         12   
3  [Central R.S,  Egmore R.S,  Maternity Hospital...      