In [76]:
import pandas as pd
from datetime import datetime

class NJCleaner:
    """Step-by-step cleaner for a train-schedule CSV.

    The raw file is loaded into ``self.data``; every cleaning step replaces
    ``self.data`` with a *new* frame and returns it, so a frame handed out by
    an earlier step is never modified by a later one.

    Columns the steps rely on: ``scheduled_time``, ``from``, ``to``, ``date``,
    ``delay_minutes``, ``train_id`` and ``actual_time``.
    """

    # Right-closed hour buckets: 0-3, 4-7, 8-11, 12-15, 16-19, 20-23.
    _PART_OF_DAY_BINS = [-1, 3, 7, 11, 15, 19, 24]
    _PART_OF_DAY_LABELS = ['late_night', 'early_morning', 'morning',
                           'afternoon', 'evening', 'night']
    # Delays of at most this many minutes are labelled '0', longer ones '1'.
    _DELAY_THRESHOLD_MINUTES = 5
    # Number of leading rows written by save_first_60k.
    _MAX_SAVED_ROWS = 60000

    def __init__(self, csv_path: str):
        """Read the CSV at ``csv_path`` into ``self.data``."""
        self.data = pd.read_csv(csv_path)

    def order_by_scheduled_time(self) -> pd.DataFrame:
        """Sort the rows by ``scheduled_time`` (ascending) and return the frame."""
        self.data = self.data.sort_values(by=['scheduled_time'])
        return self.data

    def drop_columns_and_nan(self) -> pd.DataFrame:
        """Remove the ``from``/``to`` columns, then every row containing a NaN."""
        self.data = self.data.drop(columns=['from', 'to']).dropna()
        return self.data

    def convert_date_to_day(self) -> pd.DataFrame:
        """Replace ``date`` with a ``day`` column holding the weekday name."""
        weekday_names = pd.to_datetime(self.data['date']).dt.day_name()
        self.data = self.data.assign(day=weekday_names).drop(columns=['date'])
        return self.data

    def convert_scheduled_time_to_part_of_the_day(self) -> pd.DataFrame:
        """Replace ``scheduled_time`` with a categorical ``part_of_the_day``.

        The hour of each timestamp is bucketed with ``_PART_OF_DAY_BINS`` into
        the labels in ``_PART_OF_DAY_LABELS``.
        """
        hours = pd.DatetimeIndex(self.data['scheduled_time']).hour
        # pd.cut on an Index yields a plain Categorical, which is attached
        # positionally and therefore unaffected by the row order after sorting.
        part_of_day = pd.cut(hours,
                             bins=self._PART_OF_DAY_BINS,
                             labels=self._PART_OF_DAY_LABELS)
        self.data = (self.data
                     .assign(part_of_the_day=part_of_day)
                     .drop(columns=['scheduled_time']))
        return self.data

    def convert_delay(self) -> pd.DataFrame:
        """Add a categorical ``delay`` column derived from ``delay_minutes``.

        ``'0'`` marks delays of at most ``_DELAY_THRESHOLD_MINUTES`` minutes,
        ``'1'`` anything longer. Values of -1 or below fall outside the first
        bin and become NaN.
        """
        minutes = self.data['delay_minutes'].astype('float')
        # An open-ended top bin keeps the edges strictly increasing. Using the
        # observed maximum as the top edge made pd.cut raise whenever no delay
        # exceeded the threshold (or the frame was empty).
        edges = [-1, self._DELAY_THRESHOLD_MINUTES, float('inf')]
        self.data = self.data.assign(
            delay=pd.cut(minutes, bins=edges, labels=['0', '1']))
        return self.data

    def drop_unnecessary_columns(self) -> pd.DataFrame:
        """Drop ``train_id``, ``actual_time`` and ``delay_minutes``."""
        self.data = self.data.drop(
            columns=['train_id', 'actual_time', 'delay_minutes'])
        return self.data

    def save_first_60k(self, path: str) -> pd.DataFrame:
        """Write the first 60,000 rows to ``path`` as CSV, without the index.

        Missing parent directories of ``path`` are created first. The rows
        that were written are returned.
        """
        from pathlib import Path

        first_rows = self.data.head(self._MAX_SAVED_ROWS)
        # to_csv refuses to write into a directory that does not exist yet.
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        first_rows.to_csv(path, index=False)
        return first_rows

    def prep_df(self, path: str = 'data/NJ.csv'):
        """Run every cleaning step in order and save the result to ``path``.

        Order: sort by scheduled time, drop ``from``/``to`` and NaN rows,
        derive ``day``, derive ``part_of_the_day``, derive ``delay``, drop the
        leftover columns, then save the first 60,000 rows.
        """
        self.order_by_scheduled_time()
        self.drop_columns_and_nan()
        self.convert_date_to_day()
        self.convert_scheduled_time_to_part_of_the_day()
        self.convert_delay()
        self.drop_unnecessary_columns()
        self.save_first_60k(path)

In [None]:
# Load 2018_03.csv and sort it by 'scheduled_time'; the sorted frame is shown below.
nj_cleaner = NJCleaner("2018_03.csv")
ordered_df = nj_cleaner.order_by_scheduled_time()
ordered_df

In [None]:
# Remove the 'from'/'to' columns and every row that contains a NaN, then show the result.
dropped_df = nj_cleaner.drop_columns_and_nan()
dropped_df

In [None]:
# Replace the 'date' column with a 'day' column holding the weekday name.
day_df = nj_cleaner.convert_date_to_day()
day_df

In [None]:
# Bucket the hour of 'scheduled_time' into a 'part_of_the_day' label and drop 'scheduled_time'.
part_of_the_day_df = nj_cleaner.convert_scheduled_time_to_part_of_the_day()
part_of_the_day_df

In [None]:
# Add the 'delay' label: '0' for delays of at most 5 minutes, '1' for longer ones.
delay_df = nj_cleaner.convert_delay()
delay_df

In [None]:
# Drop 'train_id', 'actual_time' and 'delay_minutes' now that the derived columns exist.
unnec_df = nj_cleaner.drop_unnecessary_columns()
unnec_df

In [None]:
# Inspect the derived column; a bare last expression is displayed by the notebook itself.
unnec_df['part_of_the_day']

In [77]:
# Write the first 60,000 cleaned rows to data/NJ.csv, then take head_df from the
# cleaner's frame explicitly instead of from the call's return value, so it
# always holds the rows that were written.
nj_cleaner.save_first_60k('data/NJ.csv')
head_df = nj_cleaner.data.head(60000)

In [71]:
# Run the whole cleaning pipeline on a freshly loaded cleaner; with no argument
# prep_df() saves the result to its default path 'data/NJ.csv'.
njcleaner = NJCleaner("2018_03.csv")
njcleaner.prep_df()

In [62]:
# Look up the row whose index label is 58548; the list-style selector keeps the result a DataFrame.
unnec_df.loc[[58548]]

Unnamed: 0,stop_sequence,from_id,to_id,status,line,type,day,part_of_the_day
58548,14.0,145,81,cancelled,Gladstone Branch,NJ Transit,Thursday,morning


In [69]:
# Scratch check: extract the hour of each 'date' entry in test.csv via DatetimeIndex.
# (A bare `df` in the middle of the cell displayed nothing, so it was removed.)
df = pd.read_csv('test.csv')
nondf = pd.DatetimeIndex(df['date']).hour
nondf

Int64Index([1, 2, 3, 4, 0], dtype='int64', name='date')