In [1]:
# Import dependencies
import pandas as pd
import os
import numpy as np
from datetime import datetime

In [2]:
# Create a directory
directory = './data'
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
csv_files

['201907-citibike-tripdata_1.csv',
 '201907-citibike-tripdata_2.csv',
 '201907-citibike-tripdata_3.csv',
 '202007-citibike-tripdata_1.csv',
 '202007-citibike-tripdata_2.csv',
 '202007-citibike-tripdata_3.csv']

In [3]:
# Loop through each csv file and read into the dataframe
dataframes = []
for file in csv_files:
    df = pd.read_csv(os.path.join(directory, file))
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrames
bike_df = pd.concat(dataframes, ignore_index=True)
bike_df

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,897,2019-07-01 00:00:00.1320,2019-07-01 00:14:58.0040,493.0,W 45 St & 6 Ave,40.756800,-73.982912,454.0,E 51 St & 1 Ave,40.754557,-73.965930,18340,Subscriber,1966,1
1,267,2019-07-01 00:00:05.1780,2019-07-01 00:04:32.4500,3143.0,5 Ave & E 78 St,40.776321,-73.964274,3226.0,W 82 St & Central Park West,40.782750,-73.971370,21458,Customer,1996,1
2,2201,2019-07-01 00:00:05.2130,2019-07-01 00:36:46.7490,317.0,E 6 St & Avenue B,40.724537,-73.981854,3469.0,India St & West St,40.731814,-73.959950,39874,Subscriber,1986,1
3,1660,2019-07-01 00:00:08.6010,2019-07-01 00:27:48.8050,249.0,Harrison St & Hudson St,40.718710,-74.009001,369.0,Washington Pl & 6 Ave,40.732241,-74.000264,38865,Subscriber,1988,1
4,109,2019-07-01 00:00:12.1580,2019-07-01 00:02:01.5670,3552.0,W 113 St & Broadway,40.805973,-73.964928,3538.0,W 110 St & Amsterdam Ave,40.802692,-73.962950,30256,Subscriber,1997,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4286867,877,2020-07-31 23:59:48.9250,2020-08-01 00:14:26.2540,217.0,Old Fulton St,40.702772,-73.993836,398.0,Atlantic Ave & Furman St,40.691652,-73.999979,44073,Customer,1969,0
4286868,839,2020-07-31 23:59:51.7740,2020-08-01 00:13:50.9750,4044.0,8 Ave & W 38 St,40.754610,-73.991770,3163.0,Central Park West & W 68 St,40.773407,-73.977825,41797,Customer,2002,1
4286869,1042,2020-07-31 23:59:56.1460,2020-08-01 00:17:18.5090,3723.0,Cadman Plaza E & Johnson St,40.695317,-73.990157,350.0,Clinton St & Grand St,40.715595,-73.987030,24808,Subscriber,1974,1
4286870,699,2020-07-31 23:59:57.4310,2020-08-01 00:11:36.5420,161.0,LaGuardia Pl & W 3 St,40.729170,-73.998102,504.0,1 Ave & E 16 St,40.732219,-73.981656,41486,Subscriber,1991,1


In [4]:
bike_df.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id           float64
start station name          object
start station latitude     float64
start station longitude    float64
end station id             float64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
birth year                   int64
gender                       int64
dtype: object

In [5]:
#clean empty data
bike_df.dropna()

# Remove unknown Gender
bike_df = bike_df[bike_df.gender != 0]

# Remove trip durations that are less than 300 seconds, more than 3600 seconds, and with the same start and en stations
bike_df = bike_df.loc[((bike_df["tripduration"]>= 300) & (bike_df["tripduration"] <= 2400) & (bike_df["start station id"] != bike_df["end station id"])),:]

In [6]:
bike_df.count()

tripduration               2980325
starttime                  2980325
stoptime                   2980325
start station id           2980288
start station name         2980288
start station latitude     2980325
start station longitude    2980325
end station id             2980288
end station name           2980288
end station latitude       2980325
end station longitude      2980325
bikeid                     2980325
usertype                   2980325
birth year                 2980325
gender                     2980325
dtype: int64

In [7]:
output_file = './data/citi_bike.csv'
bike_df.to_csv(output_file, index=False)