In [1]:
import pandas as pd

class NJCleaner:
    """Step-by-step cleaner for a train-stop CSV loaded into a pandas DataFrame.

    The working frame lives in ``self.data``. Every transformation step
    replaces ``self.data`` with a *new* frame and returns it, so frames
    returned by earlier steps are never modified behind the caller's back.
    Each step removes the columns it consumes, so a given step is meant to
    be run once per instance.
    """

    def __init__(self, csv_path: str):
        """Read the CSV at ``csv_path`` into ``self.data``."""
        self.data = pd.read_csv(csv_path)

    def order_by_scheduled_time(self) -> pd.DataFrame:
        """Sort the rows by the ``scheduled_time`` column (ascending).

        Returns:
            The sorted frame (also stored in ``self.data``).
        """
        self.data = self.data.sort_values(by=['scheduled_time'])
        return self.data

    def drop_columns_and_nan(self) -> pd.DataFrame:
        """Drop the ``from`` and ``to`` columns, then every row containing a NaN.

        Returns:
            The reduced frame (also stored in ``self.data``).
        """
        self.data = self.data.drop(columns=['from', 'to']).dropna()
        return self.data

    def convert_date_to_day(self) -> pd.DataFrame:
        """Replace the ``date`` column with a ``day`` column of weekday names.

        Returns:
            The converted frame (also stored in ``self.data``).
        """
        day_names = pd.to_datetime(self.data['date']).dt.day_name()
        self.data = self.data.assign(day=day_names).drop(columns=['date'])
        return self.data

    def convert_scheduled_time_to_part_of_the_day(self) -> pd.DataFrame:
        """Replace ``scheduled_time`` with a categorical ``part_of_the_day`` column.

        The hour of ``scheduled_time`` is binned as follows:
        0-3 -> 'early_monrning', 4-7 -> 'morning', 8-11 -> 'afternoon',
        12-15 -> 'evening', 16-19 -> 'night', 20-23 -> 'late_night'.

        Returns:
            The converted frame (also stored in ``self.data``).
        """
        # NOTE(review): 'early_monrning' looks like a typo for 'early_morning'.
        # It is kept verbatim so existing outputs and anything matching on the
        # label stay compatible -- confirm before renaming.
        # NOTE(review): this pairing labels hours 8-11 as 'afternoon' and
        # 12-15 as 'evening' -- confirm against the intended definition.
        part_labels = ['early_monrning', 'morning', 'afternoon',
                       'evening', 'night', 'late_night']
        hours = pd.DatetimeIndex(self.data['scheduled_time']).hour.astype('float')
        # pd.cut intervals are open on the left by default, which would leave
        # hour 0 outside every bin (NaN). include_lowest=True closes the first
        # interval so midnight-hour rows land in the first bucket.
        parts = pd.cut(hours, bins=[0, 3, 7, 11, 15, 19, 23],
                       labels=part_labels, include_lowest=True)
        self.data = (
            self.data
            .assign(part_of_the_day=parts)
            .drop(columns=['scheduled_time'])
        )
        return self.data

    def convert_delay(self) -> pd.DataFrame:
        """Add a categorical ``delay`` flag derived from ``delay_minutes``.

        Values in (-1, 5] map to 0 and values above 5 map to 1. Values at or
        below -1 fall outside both bins and become NaN.

        Returns:
            The frame with the added column (also stored in ``self.data``).
        """
        delay_minutes = self.data['delay_minutes'].astype('float')
        # An open-ended top bin instead of the observed maximum: bin edges must
        # be strictly increasing, so using the maximum raised ValueError
        # whenever no delay exceeded 5 minutes.
        delay_flag = pd.cut(delay_minutes, bins=[-1, 5, float('inf')],
                            labels=[0, 1])
        self.data = self.data.assign(delay=delay_flag)
        return self.data

    def drop_unnecessary_columns(self) -> pd.DataFrame:
        """Drop the ``train_id``, ``actual_time`` and ``delay_minutes`` columns.

        Returns:
            The reduced frame (also stored in ``self.data``).
        """
        self.data = self.data.drop(
            columns=['train_id', 'actual_time', 'delay_minutes']
        )
        return self.data

    def save_first_60k(self, path: str) -> pd.DataFrame:
        """Write the first 60,000 rows of ``self.data`` to ``path`` as CSV.

        ``self.data`` itself is left untouched. The index is written as the
        first CSV column (pandas default).

        Returns:
            The slice that was written, consistent with the other steps.
        """
        first_rows = self.data.head(60000)
        first_rows.to_csv(path)
        return first_rows

    def prep_df(self, path: str = 'data/NJ.csv'):
        """Run every cleaning step in order and save the result to ``path``.

        Order: sort by scheduled time, drop columns/NaN rows, derive ``day``,
        derive ``part_of_the_day``, derive ``delay``, drop leftover columns,
        save the first 60,000 rows.
        """
        self.order_by_scheduled_time()
        self.drop_columns_and_nan()
        self.convert_date_to_day()
        self.convert_scheduled_time_to_part_of_the_day()
        self.convert_delay()
        self.drop_unnecessary_columns()
        self.save_first_60k(path)

In [2]:
# Load 2018_03.csv and sort its rows by scheduled_time.
# (The earlier `ordered_df = nj_cleaner.data` binding was immediately
# overwritten by the sorted result, so it has been removed.)
nj_cleaner = NJCleaner("2018_03.csv")
ordered_df = nj_cleaner.order_by_scheduled_time()
ordered_df

Unnamed: 0,date,train_id,stop_sequence,from,from_id,to,to_id,scheduled_time,actual_time,delay_minutes,status,line,type
5283,2018-03-01,3806,1.0,Trenton,148,Trenton,148,2018-03-01 03:48:00,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit
1270,2018-03-01,0042,1.0,Port Jervis,123,Port Jervis,123,2018-03-01 03:50:00,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit
5284,2018-03-01,3806,2.0,Trenton,148,Hamilton,32905,2018-03-01 03:54:00,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit
208,2018-03-01,3202,1.0,Long Branch,74,Long Branch,74,2018-03-01 03:58:00,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit
5285,2018-03-01,3806,3.0,Hamilton,32905,Princeton Junction,125,2018-03-01 04:00:00,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit
...,...,...,...,...,...,...,...,...,...,...,...,...,...
256459,2018-03-31,A663,,Trenton,148,Philadelphia,1,,2018-03-31 11:20:08,,departed,KEYSTONE,Amtrak
256460,2018-03-31,A664,,Philadelphia,1,Philadelphia,1,,2018-03-31 11:26:07,,departed,Amtrak,Amtrak
256461,2018-03-31,A664,,Philadelphia,1,Trenton,148,,2018-03-31 12:07:04,,departed,Amtrak,Amtrak
256462,2018-03-31,A664,,Trenton,148,Newark Penn Station,107,,2018-03-31 12:47:04,,departed,Amtrak,Amtrak


In [3]:
# Drop the 'from' and 'to' columns, then discard every row that contains a NaN.
dropped_df = nj_cleaner.drop_columns_and_nan()
dropped_df

Unnamed: 0,date,train_id,stop_sequence,from_id,to_id,scheduled_time,actual_time,delay_minutes,status,line,type
5283,2018-03-01,3806,1.0,148,148,2018-03-01 03:48:00,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit
1270,2018-03-01,0042,1.0,123,123,2018-03-01 03:50:00,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit
5284,2018-03-01,3806,2.0,148,32905,2018-03-01 03:54:00,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit
208,2018-03-01,3202,1.0,74,74,2018-03-01 03:58:00,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit
5285,2018-03-01,3806,3.0,32905,125,2018-03-01 04:00:00,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit
...,...,...,...,...,...,...,...,...,...,...,...
253745,2018-03-31,4705,7.0,15,141,2018-04-01 03:04:00,2018-04-01 03:04:03,0.050000,departed,No Jersey Coast,NJ Transit
256387,2018-03-31,0709,13.0,117,49,2018-04-01 03:05:00,2018-04-01 03:04:00,0.000000,estimated,Gladstone Branch,NJ Transit
253746,2018-03-31,4705,8.0,141,79,2018-04-01 03:07:00,2018-04-01 03:08:07,1.116667,departed,No Jersey Coast,NJ Transit
253747,2018-03-31,4705,9.0,79,122,2018-04-01 03:13:00,2018-04-01 03:13:09,0.150000,departed,No Jersey Coast,NJ Transit


In [4]:
# Replace the 'date' column with a 'day' column holding the weekday name.
day_df = nj_cleaner.convert_date_to_day()
day_df

Unnamed: 0,train_id,stop_sequence,from_id,to_id,scheduled_time,actual_time,delay_minutes,status,line,type,day
5283,3806,1.0,148,148,2018-03-01 03:48:00,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit,Thursday
1270,0042,1.0,123,123,2018-03-01 03:50:00,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit,Thursday
5284,3806,2.0,148,32905,2018-03-01 03:54:00,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit,Thursday
208,3202,1.0,74,74,2018-03-01 03:58:00,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit,Thursday
5285,3806,3.0,32905,125,2018-03-01 04:00:00,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit,Thursday
...,...,...,...,...,...,...,...,...,...,...,...
253745,4705,7.0,15,141,2018-04-01 03:04:00,2018-04-01 03:04:03,0.050000,departed,No Jersey Coast,NJ Transit,Saturday
256387,0709,13.0,117,49,2018-04-01 03:05:00,2018-04-01 03:04:00,0.000000,estimated,Gladstone Branch,NJ Transit,Saturday
253746,4705,8.0,141,79,2018-04-01 03:07:00,2018-04-01 03:08:07,1.116667,departed,No Jersey Coast,NJ Transit,Saturday
253747,4705,9.0,79,122,2018-04-01 03:13:00,2018-04-01 03:13:09,0.150000,departed,No Jersey Coast,NJ Transit,Saturday


In [5]:
# Replace 'scheduled_time' with a categorical 'part_of_the_day' column
# derived from the scheduled hour.
part_of_the_day_df = nj_cleaner.convert_scheduled_time_to_part_of_the_day()
part_of_the_day_df

Unnamed: 0,train_id,stop_sequence,from_id,to_id,actual_time,delay_minutes,status,line,type,day,part_of_the_day
5283,3806,1.0,148,148,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit,Thursday,early_monrning
1270,0042,1.0,123,123,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit,Thursday,early_monrning
5284,3806,2.0,148,32905,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit,Thursday,early_monrning
208,3202,1.0,74,74,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit,Thursday,early_monrning
5285,3806,3.0,32905,125,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit,Thursday,morning
...,...,...,...,...,...,...,...,...,...,...,...
253745,4705,7.0,15,141,2018-04-01 03:04:03,0.050000,departed,No Jersey Coast,NJ Transit,Saturday,early_monrning
256387,0709,13.0,117,49,2018-04-01 03:04:00,0.000000,estimated,Gladstone Branch,NJ Transit,Saturday,early_monrning
253746,4705,8.0,141,79,2018-04-01 03:08:07,1.116667,departed,No Jersey Coast,NJ Transit,Saturday,early_monrning
253747,4705,9.0,79,122,2018-04-01 03:13:09,0.150000,departed,No Jersey Coast,NJ Transit,Saturday,early_monrning


In [6]:
# Add the 'delay' flag derived from 'delay_minutes' (1 when above 5 minutes).
delay_df = nj_cleaner.convert_delay()
delay_df

Unnamed: 0,train_id,stop_sequence,from_id,to_id,actual_time,delay_minutes,status,line,type,day,part_of_the_day,delay
5283,3806,1.0,148,148,2018-03-01 04:02:07,14.116667,departed,Northeast Corrdr,NJ Transit,Thursday,early_monrning,1
1270,0042,1.0,123,123,2018-03-01 03:50:04,0.066667,departed,Bergen Co. Line,NJ Transit,Thursday,early_monrning,0
5284,3806,2.0,148,32905,2018-03-01 04:02:07,8.116667,departed,Northeast Corrdr,NJ Transit,Thursday,early_monrning,1
208,3202,1.0,74,74,2018-03-01 03:58:01,0.016667,departed,No Jersey Coast,NJ Transit,Thursday,early_monrning,0
5285,3806,3.0,32905,125,2018-03-01 04:02:07,2.116667,departed,Northeast Corrdr,NJ Transit,Thursday,morning,0
...,...,...,...,...,...,...,...,...,...,...,...,...
253745,4705,7.0,15,141,2018-04-01 03:04:03,0.050000,departed,No Jersey Coast,NJ Transit,Saturday,early_monrning,0
256387,0709,13.0,117,49,2018-04-01 03:04:00,0.000000,estimated,Gladstone Branch,NJ Transit,Saturday,early_monrning,0
253746,4705,8.0,141,79,2018-04-01 03:08:07,1.116667,departed,No Jersey Coast,NJ Transit,Saturday,early_monrning,0
253747,4705,9.0,79,122,2018-04-01 03:13:09,0.150000,departed,No Jersey Coast,NJ Transit,Saturday,early_monrning,0


In [7]:
# Remove the 'train_id', 'actual_time' and 'delay_minutes' columns.
unnec_df = nj_cleaner.drop_unnecessary_columns()
unnec_df

Unnamed: 0,stop_sequence,from_id,to_id,status,line,type,day,part_of_the_day,delay
5283,1.0,148,148,departed,Northeast Corrdr,NJ Transit,Thursday,early_monrning,1
1270,1.0,123,123,departed,Bergen Co. Line,NJ Transit,Thursday,early_monrning,0
5284,2.0,148,32905,departed,Northeast Corrdr,NJ Transit,Thursday,early_monrning,1
208,1.0,74,74,departed,No Jersey Coast,NJ Transit,Thursday,early_monrning,0
5285,3.0,32905,125,departed,Northeast Corrdr,NJ Transit,Thursday,morning,0
...,...,...,...,...,...,...,...,...,...
253745,7.0,15,141,departed,No Jersey Coast,NJ Transit,Saturday,early_monrning,0
256387,13.0,117,49,estimated,Gladstone Branch,NJ Transit,Saturday,early_monrning,0
253746,8.0,141,79,departed,No Jersey Coast,NJ Transit,Saturday,early_monrning,0
253747,9.0,79,122,departed,No Jersey Coast,NJ Transit,Saturday,early_monrning,0


In [9]:
# Write the first 60,000 rows of the cleaned frame to data/NJ.csv.
head_df = nj_cleaner.save_first_60k('data/NJ.csv')

In [10]:
# Run the whole cleaning pipeline in one call on a fresh NJCleaner instance;
# prep_df saves to its default path 'data/NJ.csv'.
njcleaner = NJCleaner("2018_03.csv")
njcleaner.prep_df()