In [1]:
import pandas as pd
import os, sys
import numpy as np

In [14]:
def clean_stops(df, verbose=False):
    """
    Remove, in place, the rows whose stop_id is not the most frequent one for their progrnum.

    For every progrnum from 1 up to the largest value present, the stop_ids recorded at
    that progrnum are counted. The most frequent stop_id is kept; every row at that
    progrnum carrying any other stop_id is treated as inconsistent. The index labels of
    all inconsistent rows are collected first and dropped in a single call at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'progrnum' and 'stop_id' columns. It is modified in place.
        Rows are dropped by index label, so a label shared by several rows removes
        all of them.
    verbose : bool, default False
        When True, prints the per-progrnum stop counts, the stops judged inconsistent
        and the total number of rows removed.

    Returns
    -------
    None
    """
    # An empty (or all-NaN) progrnum column simply means there is nothing to scan.
    valid_progrnums = df['progrnum'].dropna()
    max_progrnum = int(valid_progrnums.max()) if len(valid_progrnums) else 0

    indexes_to_be_removed = []
    for i in range(1, max_progrnum + 1):
        at_progrnum = df['progrnum'] == i
        series = df.loc[at_progrnum, 'stop_id'].value_counts()
        if verbose:
            print(series)
        # value_counts() sorts by descending frequency, so everything after the
        # first entry is a less common stop for this progrnum.
        stops_to_be_removed = series.index.tolist()[1:]
        if verbose:
            print("Checking progrnum: ",i," inconsistent stops are: ",stops_to_be_removed)
        if len(stops_to_be_removed) > 0:
            # Combine both conditions into one mask aligned with df instead of
            # chaining two boolean selections (which forces a reindex and warns).
            inconsistent = at_progrnum & df['stop_id'].isin(stops_to_be_removed)
            indexes_to_be_removed += df.index[inconsistent.values].tolist()
    if verbose:
        print("Removing ",len(indexes_to_be_removed), " rows")
    # The collected values are index labels, so they are passed to drop() directly;
    # using them as positions is only correct for a pristine 0..n-1 index.
    df.drop(indexes_to_be_removed, inplace=True)


In [3]:
from scipy import stats
def clean_outliers(df, verbose=False, threshold=3):
    """
    Remove, in place, the rows whose cum_duration is an outlier for their progrnum.

    For every progrnum from 1 up to the largest value present, the z-scores of the
    non-missing cum_duration values at that progrnum are computed. Rows whose absolute
    z-score exceeds `threshold` are collected by index label and dropped in a single
    call at the end.

    Missing cum_duration values are left out of the z-score computation so that one
    NaN does not turn every score in its group into NaN (which would disable the
    filter for that progrnum). Rows with a missing cum_duration are never removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'progrnum' and 'cum_duration' columns. It is modified in place.
    verbose : bool, default False
        When True, prints the total number of rows removed.
    threshold : float, default 3
        Number of standard deviations from the group mean beyond which a row is
        considered an outlier.

    Returns
    -------
    None
    """
    # An empty (or all-NaN) progrnum column simply means there is nothing to scan.
    valid_progrnums = df['progrnum'].dropna()
    max_progrnum = int(valid_progrnums.max()) if len(valid_progrnums) else 0

    indexes_to_be_removed = []
    for i in range(1, max_progrnum + 1):
        series = df.loc[df['progrnum'] == i, 'cum_duration'].dropna()
        # Fewer than two observations have no spread, so nothing can be an outlier.
        if len(series) < 2:
            continue
        # A constant group has zero standard deviation; its z-scores are NaN and
        # compare False against the threshold, so silence the numeric warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            is_outlier = np.abs(stats.zscore(series.values)) > threshold
        indexes_to_be_removed += series.index[is_outlier].tolist()
    if verbose:
        print("Removing ",len(indexes_to_be_removed), " rows")
    df.drop(indexes_to_be_removed, inplace=True)


In [9]:
def test_of_removing(route_id, verbose=False, outliers_verbose=False, data_dir='D:\\data\\csvs\\csvs\\'):
    """
    Load one route's CSV, run both cleaning steps on it and report how much was removed.

    The file '<route_id>_1.csv' is read from `data_dir`, the columns 'dt',
    'Unnamed: 0', 'tripid' and 'weather_main' are dropped, and then clean_stops and
    clean_outliers are applied in that order.

    Parameters
    ----------
    route_id : str or int
        Route identifier used to build the CSV file name.
    verbose : bool, default False
        When True, prints the route id, the number of rows removed and the
        percentage removed, framed by separator lines.
    outliers_verbose : bool, default False
        Passed to clean_outliers as its verbose flag.
    data_dir : str, default 'D:\\data\\csvs\\csvs\\'
        Directory that holds the per-route CSV files.

    Returns
    -------
    float
        Percentage of the loaded rows removed by the two cleaning steps
        (0.0 when the file contains no rows).
    """
    if verbose:
        print("="*20)
        print(route_id)
    df = pd.read_csv(os.path.join(data_dir, str(route_id) + '_1.csv'))
    df = df.drop(columns=['dt','Unnamed: 0','tripid','weather_main'])
    len_before = len(df)
    clean_stops(df)
    clean_outliers(df, outliers_verbose)
    rows_removed = len_before - len(df)
    # A file with no data rows would otherwise divide by zero.
    percent_removed = rows_removed*100/len_before if len_before else 0.0
    if verbose:
        print("Total rows removed: ", rows_removed)
        print("Percentage of rows removed: ", percent_removed,"%")
        print("="*20)
    return percent_removed

In [7]:
from tqdm import tqdm_notebook as tqdm
# Every route id to scan; each is expected to have a '<route>_1.csv' file.
route_ids = ['68', '25B', '45A', '25A', '14', '77A', '39', '16', '40D', '27B', '142', '83', '130', '15', '46A', '33', '7', '39A', '49', '1', '123', '41', '67X', '59', '9', '40', '239', '76', '84', '53', '185', '151', '13', '15B', '65B', '29A', '61', '140', '79A', '38A', '31', '33B', '69', '44', '42', '67', '184', '238', '145', '17A', '32', '27A', '17', '27X', '18', '122', '54A', '66', '150', '56A', '37', '27', '15A', '65', '11', '47', '79', '83A', '63', '4', '120', '41C', '70', '84A', '220', '39X', '32X', '68A', '84X', '38', '102', '270', '51X', '33X', '75', '26', '66A', '31A', '111', '14C', '114', '76A', '44B', '161', '7A', '43', '25', '104', '33A', '16C', '42D', '31B', '66X', '31D', '33D', '41B', '40B', '7D', '46E', '38D', '118', '51D', '15D', '41A', '25D', '66B', '38B', '236', '7B', '41X', '69X', '68X', '25X', '40E', '70D', '116', '77X', '16D', '33E', '41D']

# Routes that lose more than 3% of their rows to cleaning get a closer look later.
routes_to_check = []
for route in tqdm(route_ids):
    try:
        percent = test_of_removing(route)
    except Exception as err:
        # Best effort: report the failure (e.g. a missing CSV) and move on.
        print(err)
    else:
        if percent > 3:
            routes_to_check.append(route)
print(routes_to_check)

HBox(children=(IntProgress(value=0, max=130), HTML(value='')))

  del sys.path[0]
  # Remove the CWD from sys.path while we load stuff.


[Errno 2] File b'D:\\data\\csvs\\csvs\\51X_1.csv' does not exist: b'D:\\data\\csvs\\csvs\\51X_1.csv'
[Errno 2] File b'D:\\data\\csvs\\csvs\\46E_1.csv' does not exist: b'D:\\data\\csvs\\csvs\\46E_1.csv'
[Errno 2] File b'D:\\data\\csvs\\csvs\\118_1.csv' does not exist: b'D:\\data\\csvs\\csvs\\118_1.csv'
[Errno 2] File b'D:\\data\\csvs\\csvs\\41A_1.csv' does not exist: b'D:\\data\\csvs\\csvs\\41A_1.csv'
[Errno 2] File b'D:\\data\\csvs\\csvs\\68X_1.csv' does not exist: b'D:\\data\\csvs\\csvs\\68X_1.csv'
[Errno 2] File b'D:\\data\\csvs\\csvs\\77X_1.csv' does not exist: b'D:\\data\\csvs\\csvs\\77X_1.csv'
['68', '16', '40D', '142', '83', '15', '33', '41', '67X', '59', '84', '13', '33B', '17A', '17', '27X', '47', '4', '84A', '220', '84X', '38', '33X', '75', '114', '7A', '104', '42D', '66X', '33D', '236', '41X', '25X']


In [10]:
from tqdm import tqdm_notebook as tqdm
# Re-run the cleaning on every route flagged in routes_to_check, this time with
# both verbose flags on so the per-route removal counts and percentages are printed.
for route in tqdm(routes_to_check):
    percent = test_of_removing(route, verbose=True, outliers_verbose=True)

HBox(children=(IntProgress(value=0, max=33), HTML(value='')))

68


  del sys.path[0]


Removing  2897  rows
Total rows removed:  25287
Percentage of rows removed:  6.965616787778286 %
16
Removing  6401  rows
Total rows removed:  175666
Percentage of rows removed:  12.548073351505488 %
40D
Removing  3945  rows
Total rows removed:  50094
Percentage of rows removed:  8.775980715056551 %
142
Removing  421  rows
Total rows removed:  4743
Percentage of rows removed:  11.60281814178776 %
83
Removing  12075  rows
Total rows removed:  93738
Percentage of rows removed:  9.367561204237978 %
15
Removing  12981  rows
Total rows removed:  205032
Percentage of rows removed:  11.54197555397058 %
33
Removing  7710  rows
Total rows removed:  24129
Percentage of rows removed:  6.595488203891855 %
41
Removing  4657  rows
Total rows removed:  29659
Percentage of rows removed:  4.074850073160176 %
67X
Removing  731  rows
Total rows removed:  11600
Percentage of rows removed:  16.228088582980092 %
59
Removing  312  rows
Total rows removed:  7175
Percentage of rows removed:  7.799676055266276 %

In [15]:
# Step through the cleaning of route 75 with full output: load the CSV, discard
# the columns the cleaners do not use, then show the row count before cleaning.
df = pd.read_csv('D:\\data\\csvs\\csvs\\' + '75' + '_1.csv').drop(
    columns=['dt','Unnamed: 0','tripid','weather_main']
)
print(len(df))
# Both cleaners modify df in place and print what they remove.
clean_stops(df, verbose=True)
clean_outliers(df, verbose=True)

83498
4348.0    954
4646.0      4
2538.0      2
Name: stop_id, dtype: int64
Checking progrnum:  1  inconsistent stops are:  [4646.0, 2538.0]


  from ipykernel import kernelapp as app


4646.0    960
Name: stop_id, dtype: int64
Checking progrnum:  2  inconsistent stops are:  []
4647.0    965
Name: stop_id, dtype: int64
Checking progrnum:  3  inconsistent stops are:  []
4435.0    961
2538.0      4
Name: stop_id, dtype: int64
Checking progrnum:  4  inconsistent stops are:  [2538.0]
2605.0    539
2604.0    423
Name: stop_id, dtype: int64
Checking progrnum:  5  inconsistent stops are:  [2604.0]
2538.0    539
2605.0    423
Name: stop_id, dtype: int64
Checking progrnum:  6  inconsistent stops are:  [2605.0]
2539.0    544
2538.0    423
Name: stop_id, dtype: int64
Checking progrnum:  7  inconsistent stops are:  [2538.0]
2540.0    544
2539.0    424
Name: stop_id, dtype: int64
Checking progrnum:  8  inconsistent stops are:  [2539.0]
2542.0    544
2540.0    424
Name: stop_id, dtype: int64
Checking progrnum:  9  inconsistent stops are:  [2540.0]
6128.0    544
2542.0    424
Name: stop_id, dtype: int64
Checking progrnum:  10  inconsistent stops are:  [2542.0]
2544.0    544
6128.0  

Checking progrnum:  73  inconsistent stops are:  [2018.0]
2020.0    544
2019.0    424
Name: stop_id, dtype: int64
Checking progrnum:  74  inconsistent stops are:  [2019.0]
2021.0    544
2020.0    424
Name: stop_id, dtype: int64
Checking progrnum:  75  inconsistent stops are:  [2020.0]
2022.0    544
2021.0    424
Name: stop_id, dtype: int64
Checking progrnum:  76  inconsistent stops are:  [2021.0]
4565.0    544
2022.0    424
Name: stop_id, dtype: int64
Checking progrnum:  77  inconsistent stops are:  [2022.0]
4566.0    544
4565.0    424
Name: stop_id, dtype: int64
Checking progrnum:  78  inconsistent stops are:  [4565.0]
4567.0    544
4566.0    424
Name: stop_id, dtype: int64
Checking progrnum:  79  inconsistent stops are:  [4566.0]
2031.0    543
4567.0    424
Name: stop_id, dtype: int64
Checking progrnum:  80  inconsistent stops are:  [4567.0]
2032.0    542
2031.0    424
Name: stop_id, dtype: int64
Checking progrnum:  81  inconsistent stops are:  [2031.0]
2033.0    542
2032.0    424
Na