### PREDICTING RIDERSHIP ON THE NEW YORK CITY SUBWAY

### Loading the Dataset

In [1]:
import pandas as pd
import requests


def _download_csv(url, destination, timeout=60):
    """Download `url` and write the raw response body to `destination`.

    Parameters
    ----------
    url : str
        CSV endpoint to fetch.
    destination : str
        Local path the response bytes are written to (overwritten if present).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the kernel indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing here keeps an
        error page from being saved and later parsed as if it were CSV data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(destination, 'wb') as file:
        file.write(response.content)


# NOTE(review): the .info() output for the hourly data reports exactly 1000 rows,
# which looks like the API's default page size rather than the full dataset --
# confirm, and add a `$limit` parameter / paging if the full data is required.

#hourly ridership data
hourly_ridership_api_endpoint = 'https://data.ny.gov/resource/wujg-7c2s.csv'
_download_csv(hourly_ridership_api_endpoint, 'hourly_ridership_data.csv')

#customer journey focused 2015-2019
journey_specific_api_endpoint = 'https://data.ny.gov/resource/r7qk-6tcy.csv'
_download_csv(journey_specific_api_endpoint, 'journeyspecific_data_2015-19.csv')

#customer journey focused 2019-Present
journey_specific_api_endpoint = 'https://data.ny.gov/resource/4apg-4kt9.csv'
_download_csv(journey_specific_api_endpoint, 'journeyspecific_data_2019-Present.csv')


# Load the freshly downloaded files into DataFrames used by the rest of the notebook.
hourly_ridership_data = pd.read_csv('hourly_ridership_data.csv')
journey_specific_data_2015_19 = pd.read_csv('journeyspecific_data_2015-19.csv')
journey_specific_data_2019_Present = pd.read_csv('journeyspecific_data_2019-Present.csv')

In [2]:
# info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
hourly_ridership_data.info()
print("Checking Null Values ... \n ----- \n", hourly_ridership_data.isnull().sum())
print('Column Names : ', hourly_ridership_data.columns)
# The preview goes last: a notebook only renders a cell's final expression, so
# a bare .head() at the top of the cell was silently discarded.
hourly_ridership_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   transit_timestamp    1000 non-null   object 
 1   transit_mode         1000 non-null   object 
 2   station_complex_id   1000 non-null   int64  
 3   station_complex      1000 non-null   object 
 4   borough              1000 non-null   object 
 5   payment_method       1000 non-null   object 
 6   fare_class_category  1000 non-null   object 
 7   ridership            1000 non-null   float64
 8   transfers            1000 non-null   float64
 9   latitude             1000 non-null   float64
 10  longitude            1000 non-null   float64
 11  georeference         1000 non-null   object 
dtypes: float64(4), int64(1), object(7)
memory usage: 93.9+ KB
None
Checking Null Values ... 
 ----- 
 transit_timestamp      0
transit_mode           0
station_complex_id     0
station_complex 

In [3]:
# Summarise missing values and column names for the 2015-19 journey data.
print("Checking Null Values ...\n ---- \n", journey_specific_data_2015_19.isnull().sum())
print('Column Names : ', journey_specific_data_2015_19.columns)
# The preview goes last: a notebook only renders a cell's final expression, so
# a bare .head() at the top of the cell was silently discarded.
journey_specific_data_2015_19.head()

Checking Null Values ...
 ---- 
 month                       0
division                    0
line                        0
period                      0
num_passengers              0
additional_platform_time    0
additional_train_time       0
total_apt                   0
total_att                   0
over_five_mins              0
over_five_mins_perc         0
customer_journey_time       0
dtype: int64
Column Names :  Index(['month', 'division', 'line', 'period', 'num_passengers',
       'additional_platform_time', 'additional_train_time', 'total_apt',
       'total_att', 'over_five_mins', 'over_five_mins_perc',
       'customer_journey_time'],
      dtype='object')


In [4]:
# Summarise missing values and column names for the 2019-Present journey data.
print("Checking Null Values ...\n ---- \n", journey_specific_data_2019_Present.isnull().sum())
print('Column Names : ',journey_specific_data_2019_Present.columns)
# The preview goes last: a notebook only renders a cell's final expression, so
# a bare .head() at the top of the cell was silently discarded.
journey_specific_data_2019_Present.head()

Checking Null Values ...
 ---- 
 month                       0
division                    0
line                        0
period                      0
num_passengers              0
additional_platform_time    0
additional_train_time       0
total_apt                   0
total_att                   0
over_five_mins              0
over_five_mins_perc         0
customer_journey_time       0
dtype: int64
Column Names :  Index(['month', 'division', 'line', 'period', 'num_passengers',
       'additional_platform_time', 'additional_train_time', 'total_apt',
       'total_att', 'over_five_mins', 'over_five_mins_perc',
       'customer_journey_time'],
      dtype='object')


Convert `transit_timestamp` to datetime and extract the month for alignment with the journey-specific data

In [5]:
# Parse the raw timestamp strings once, store the parsed values back on the
# frame, and derive a monthly period column from the same parsed series.
parsed_timestamps = pd.to_datetime(hourly_ridership_data['transit_timestamp'])
hourly_ridership_data['transit_timestamp'] = parsed_timestamps
hourly_ridership_data['month'] = parsed_timestamps.dt.to_period('M')

Combine month and period in journey data to create a comparable temporal key

Journey data uses the period field to split the month into chunks

In [6]:
def period_to_date(period):
    """Map a journey-data `period` label to a two-digit day-of-month string.

    Parameters
    ----------
    period : str
        Period label. Matching is case-insensitive and substring-based.

    Returns
    -------
    str or None
        '01' if the label contains 'first', '15' if it contains 'second'
        ('first' takes precedence when both appear), otherwise None.
        Non-string input (e.g. NaN or None from a missing cell) also yields
        None instead of raising AttributeError on `.lower()`.
    """
    # Missing cells arrive as float NaN / None; they have no .lower().
    if not isinstance(period, str):
        return None

    label = period.lower()
    if 'first' in label:
        return '01'
    if 'second' in label:
        return '15'
    # NOTE(review): labels containing neither word fall through to None, which
    # leaves the derived day/date missing downstream -- confirm the dataset's
    # `period` values actually contain 'first'/'second'.
    return None

# Derive a day-of-month string from each row's period label, then join it to
# the month string and parse the result into a datetime column.
day_of_month = journey_specific_data_2019_Present['period'].apply(period_to_date)
journey_specific_data_2019_Present['day'] = day_of_month

date_text = (
    journey_specific_data_2019_Present['month']
    + '-'
    + journey_specific_data_2019_Present['day']
)
journey_specific_data_2019_Present['date'] = pd.to_datetime(date_text)

Aligning Station Complex and Line data using fuzzy matching 

In [7]:
from fuzzywuzzy import process

def fuzzy_match_station_complex(station, lines, threshold=80):
    """Return the entry of `lines` that best fuzzy-matches `station`.

    Parameters
    ----------
    station : str
        Name to look up.
    lines : collection of str
        Candidate names to match against.
    threshold : int, optional
        The best candidate is accepted only when its score is strictly
        greater than this value. Defaults to 80.

    Returns
    -------
    The best-matching candidate, or None when there are no candidates, the
    matcher finds nothing, or the best score does not exceed `threshold`.
    """
    # With nothing to match against there can be no result; skip the matcher.
    if lines is None or (hasattr(lines, '__len__') and len(lines) == 0):
        return None

    best = process.extractOne(station, lines)
    # extractOne yields None when it finds no candidate; unpacking it directly
    # would raise TypeError.
    if best is None:
        return None

    # Index instead of tuple-unpacking so results carrying extra trailing
    # fields (beyond match and score) are still handled.
    match, score = best[0], best[1]
    return match if score > threshold else None

# Compute the candidate line names once; the original rebuilt this array via
# .unique() on every row inside the lambda.
line_choices = journey_specific_data_2019_Present['line'].unique()
hourly_ridership_data['line'] = hourly_ridership_data['station_complex'].apply(
    lambda station: fuzzy_match_station_complex(station, line_choices)
)

ModuleNotFoundError: No module named 'fuzzywuzzy'