In [65]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from os.path import dirname, abspath
from datetime import datetime

d = dirname(dirname(abspath("preprocessing.ipynb")))
df = pd.read_csv(d + '/data/Full_Concat.csv')

df1 = df.drop(columns=['start station id', 'end station id', 'bikeid','Unnamed: 0'])
print(df1.shape)
df1.head()

(759807, 12)


Unnamed: 0,tripduration,starttime,stoptime,start station name,start station latitude,start station longitude,end station name,end station latitude,end station longitude,usertype,birth year,gender
0,142,2019-02-01 15:35:02.0820,2019-02-01 15:37:24.1360,Exchange Place,40.716247,-74.033459,Harborside,40.719252,-74.034234,Subscriber,1963.0,1
1,223,2019-02-01 17:00:46.8900,2019-02-01 17:04:30.5500,Exchange Place,40.716247,-74.033459,Grand St,40.715178,-74.037683,Subscriber,1992.0,2
2,106,2019-02-01 17:08:01.3260,2019-02-01 17:09:47.4400,Exchange Place,40.716247,-74.033459,Paulus Hook,40.714145,-74.033552,Subscriber,1960.0,1
3,370,2019-02-01 17:09:31.2100,2019-02-01 17:15:41.6550,Exchange Place,40.716247,-74.033459,Newark Ave,40.721525,-74.046305,Subscriber,1976.0,1
4,315,2019-02-01 17:19:53.2490,2019-02-01 17:25:09.1400,Exchange Place,40.716247,-74.033459,Manila & 1st,40.721651,-74.042884,Subscriber,1980.0,1


In [66]:
# new data frame with split value columns 
starts = df1['starttime'].str.split(" ", n = 1, expand = True) 
ends = df1['stoptime'].str.split(" ", n = 1, expand = True) 

  
# making separate last name column from new data frame 
df1['Start Time']= starts[1] 
df1['End Time']= ends[1] 
  
# Dropping old Name columns 
df1.drop(columns =['starttime','stoptime'], inplace = True) 

df1['Start Time'] = df1['Start Time'].map(lambda x: pd.Timestamp(x))
df1['End Time'] = df1['End Time'].map(lambda x: pd.Timestamp(x))

min_time = min(min(df1['Start Time']),min(df1['End Time']))

df1['Start Time'] = df1['Start Time'].map(lambda x: pd.Timedelta(x - min_time))
df1['End Time'] = df1['End Time'].map(lambda x: pd.Timedelta(x - min_time))

df1['Start Time'] = df1['Start Time'].dt.total_seconds()
df1['End Time'] = df1['End Time'].dt.total_seconds()

df1.head()
df1.shape


(759807, 12)

In [76]:
df1.to_csv(d + '/data/Semi_processed_data.csv',index=False)

In [67]:
cat_ftrs = ['start station name', 'end station name', 'gender']
cont_ftrs = ['tripduration','start station longitude', 'end station longitude','start station latitude',
             'end station latitude','Start Time','End Time','birth year']
label = ['usertype']

In [68]:
#Label Encoder
le = LabelEncoder()

df_usertype = pd.DataFrame(le.fit_transform(df1['usertype'].to_numpy().reshape(-1,1)), columns = label)


  y = column_or_1d(y, warn=True)


In [69]:
#One Hot Encoder

#start station name
enc = OneHotEncoder(sparse=False)

df_start_station = pd.DataFrame(enc.fit_transform(df1['start station name'].to_numpy().reshape(-1,1)))
df_start_station.columns = enc.get_feature_names()


#end station name
df_end_station = pd.DataFrame(enc.fit_transform(df1['end station name'].to_numpy().reshape(-1,1)))
df_end_station.columns = enc.get_feature_names()

#gender
df_gender = pd.DataFrame(enc.fit_transform(df1['gender'].to_numpy().reshape(-1,1)))
df_gender.columns = enc.get_feature_names()


In [70]:
#merge dfs
df_all = pd.concat([df_usertype, df_start_station, df_end_station,
                    df_gender],axis = 1)

df2 = df1

df2 = df2.drop(['usertype', 'start station name', 'end station name',
                'gender'], axis = 1)

df_mcar = pd.concat([df2, df_all],axis = 1)

df_mcar.shape
#df_mcar.head()

(759807, 254)

In [15]:
#MCAR TEST

# from the pymice package 
# https://github.com/RianneSchouten/pymice

import numpy as np
import pandas as pd
import math as ma
import scipy.stats as st

def checks_input_mcar_tests(data):
    """ Checks whether the input parameter of class McarTests is correct
            Parameters
            ----------
            data:
                The input of McarTests specified as 'data'
            Returns
            -------
            bool
                True if input is correct
            """

    if not isinstance(data, pd.DataFrame):
        print("Error: Data should be a Pandas DataFrame")
        return False

    if not any(data.dtypes.values == np.float):
        if not any(data.dtypes.values == np.int):
            print("Error: Dataset cannot contain other value types than floats and/or integers")
            return False

    if not data.isnull().values.any():
        print("Error: No NaN's in given data")
        return False

    return True


def mcar_test(data):
    """ Implementation of Little's MCAR test
    Parameters
    ----------
    data: Pandas DataFrame
        An incomplete dataset with samples as index and variables as columns
    Returns
    -------
    p_value: Float
        This value is the outcome of a chi-square statistical test, testing whether the null hypothesis
        'the missingness mechanism of the incomplete dataset is MCAR' can be rejected.
    """

    if not checks_input_mcar_tests(data):
        raise Exception("Input not correct")

    dataset = data.copy()
    vars = dataset.dtypes.index.values
    n_var = dataset.shape[1]

    # mean and covariance estimates
    # ideally, this is done with a maximum likelihood estimator
    gmean = dataset.mean()
    gcov = dataset.cov()

    # set up missing data patterns
    r = 1 * dataset.isnull()
    mdp = np.dot(r, list(map(lambda x: ma.pow(2, x), range(n_var))))
    sorted_mdp = sorted(np.unique(mdp))
    n_pat = len(sorted_mdp)
    correct_mdp = list(map(lambda x: sorted_mdp.index(x), mdp))
    dataset['mdp'] = pd.Series(correct_mdp, index=dataset.index)

    # calculate statistic and df
    pj = 0
    d2 = 0
    for i in range(n_pat):
        dataset_temp = dataset.loc[dataset['mdp'] == i, vars]
        select_vars = ~dataset_temp.isnull().any()
        pj += np.sum(select_vars)
        select_vars = vars[select_vars]
        means = dataset_temp[select_vars].mean() - gmean[select_vars]
        select_cov = gcov.loc[select_vars, select_vars]
        mj = len(dataset_temp)
        parta = np.dot(means.T, np.linalg.solve(select_cov, np.identity(select_cov.shape[1])))
        d2 += mj * (np.dot(parta, means))

    df = pj - n_var

    # perform test and save output
    p_value = 1 - st.chi2.cdf(d2, df)

    return p_value



In [19]:
checks_input_mcar_tests(df_mcar)


True

In [27]:
#mcar_test(df_mcar)

In [71]:
print(df1.shape)
print(df1.isnull().sum(axis=0)/df1.shape[0])
print(sum(df1.isnull().sum(axis=1)!=0)/df1.shape[0])

(759807, 12)
tripduration               0.000000
start station name         0.000000
start station latitude     0.000000
start station longitude    0.000000
end station name           0.000000
end station latitude       0.000000
end station longitude      0.000000
usertype                   0.000000
birth year                 0.011198
gender                     0.000000
Start Time                 0.000000
End Time                   0.000000
dtype: float64
0.011197580438190224


In [72]:
#drop missing rows
print(df1.shape)
# by default, rows/points are dropped
df2 = df1.dropna()
print(df2.shape)

(759807, 12)
(751299, 12)


In [73]:
#Standart Scaler

#trip durration
scaler = StandardScaler()
df_trip_durration = pd.DataFrame(scaler.fit_transform(df_mcar['tripduration'].to_numpy().reshape(-1,1)))
df_trip_durration.columns = ['tripduration']

#start station longitude
df_start_staationlon = pd.DataFrame(scaler.fit_transform(df_mcar['start station longitude'].to_numpy().reshape(-1,1)))
df_start_staationlon.columns = ['start station longitude']

#end station longitude
df_end_staationlon = pd.DataFrame(scaler.fit_transform(df_mcar['end station longitude'].to_numpy().reshape(-1,1)))
df_end_staationlon.columns = ['end station longitude']

#start station latitude
df_start_stationlat = pd.DataFrame(scaler.fit_transform(df_mcar['start station latitude'].to_numpy().reshape(-1,1)))
df_start_stationlat.columns = ['start station latitude']

#end station latitude
df_end_stationlat = pd.DataFrame(scaler.fit_transform(df_mcar['end station latitude'].to_numpy().reshape(-1,1)))
df_end_stationlat.columns = ['end station latitude']

#Start Time
df_start_time = pd.DataFrame(scaler.fit_transform(df_mcar['Start Time'].to_numpy().reshape(-1,1)))
df_start_time.columns = ['Start Time']

#End Time
df_end_time = pd.DataFrame(scaler.fit_transform(df_mcar['End Time'].to_numpy().reshape(-1,1)))
df_end_time.columns = ['End Time']

#birth year
df_birth_year = pd.DataFrame(scaler.fit_transform(df_mcar['birth year'].to_numpy().reshape(-1,1)))
df_birth_year.columns = ['birth year']


In [74]:
#merge dfs
df_all2 = pd.concat([df_birth_year, df_start_time, df_start_time, df_trip_durration,
                    df_end_stationlat, df_start_stationlat, df_end_staationlon, df_start_staationlon],axis = 1)

df3 = df_mcar

df3 = df3.drop(['tripduration','start station longitude',
                                        'end station longitude','start station latitude',
                                        'end station latitude','Start Time','End Time',
                                        'birth year'], axis = 1)

df_preprocessed = pd.concat([df3, df_all2],axis = 1)

df_preprocessed.shape
df_preprocessed.head()


Unnamed: 0,usertype,x0_5 Corners Library,x0_Astor Place,x0_Baldwin at Montgomery,x0_Bayside Park,x0_Bergen Ave,x0_Bethune Center,x0_Brunswick & 6th,x0_Brunswick St,x0_Christ Hospital,...,x0_1,x0_2,birth year,Start Time,Start Time.1,tripduration,end station latitude,start station latitude,end station longitude,start station longitude
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,-1.718326,0.280763,0.280763,-0.067222,0.029851,-0.886967,-0.027368,1.178391
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.105304,0.559398,0.559398,-0.057472,0.026735,-0.886967,-0.028819,1.178391
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,-2.010425,0.582926,0.582926,-0.071555,0.025945,-0.886967,-0.027081,1.178391
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,-0.452561,0.587794,0.587794,-0.039778,0.031589,-0.886967,-0.032445,1.178391
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,-0.063095,0.621483,0.621483,-0.046398,0.031685,-0.886967,-0.031007,1.178391


In [75]:
df_preprocessed.to_csv(d + '/data/Preprocessed_Citi_Bike_Data.csv',index=False)