## Imports

In [1]:
import pandas as pd

## Read base dataset

In [2]:
# Load the 2018 trips export into memory; the path is relative to the working directory.
trips_csv_path = 'Trips_2018.csv'
data = pd.read_csv(trips_csv_path)

### Basic information

In [3]:
# Print the frame summary (row count, per-column dtypes, memory usage) for the raw load.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17548339 entries, 0 to 17548338
Data columns (total 14 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Unnamed: 0               int64  
 1   tripduration             int64  
 2   starttime                object 
 3   stoptime                 object 
 4   start_station_id         float64
 5   start_station_latitude   float64
 6   start_station_longitude  float64
 7   end_station_id           float64
 8   end_station_latitude     float64
 9   end_station_longitude    float64
 10  bikeid                   int64  
 11  usertype                 object 
 12  birth_year               int64  
 13  gender                   int64  
dtypes: float64(6), int64(5), object(3)
memory usage: 1.8+ GB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0.1,Unnamed: 0,tripduration,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,bikeid,birth_year,gender
count,17548340.0,17548340.0,17545840.0,17548340.0,17548340.0,17545840.0,17548340.0,17548340.0,17548340.0,17548340.0,17548340.0
mean,8774169.0,988.7432,1589.282,40.73737,-73.9826,1580.83,40.73704,-73.98281,26561.41,1978.993,1.148613
std,5065769.0,18895.84,1439.432,0.03225813,0.01912753,1438.502,0.03206583,0.01919611,6222.916,11.92922,0.5438043
min,0.0,61.0,72.0,40.64654,-74.02535,72.0,40.64654,-74.08364,14529.0,1885.0,0.0
25%,4387084.0,358.0,380.0,40.71755,-73.99521,380.0,40.71755,-73.99595,20293.0,1969.0,1.0
50%,8774169.0,605.0,505.0,40.73818,-73.98565,505.0,40.73756,-73.98602,28270.0,1981.0,1.0
75%,13161250.0,1060.0,3249.0,40.75763,-73.97283,3249.0,40.75725,-73.97344,31852.0,1989.0,1.0
max,17548340.0,19510050.0,3721.0,45.50636,-73.56891,3721.0,45.50636,-73.56891,35831.0,2002.0,2.0


In [5]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0                    0
tripduration                  0
starttime                     0
stoptime                      0
start_station_id           2497
start_station_latitude        0
start_station_longitude       0
end_station_id             2497
end_station_latitude          0
end_station_longitude         0
bikeid                        0
usertype                      0
birth_year                    0
gender                        0
dtype: int64

### Convert data types

In [6]:
# Cast the timestamp columns to datetime64 and the id/label columns to
# category, one column at a time.
date_columns = ["starttime", "stoptime"]
categorical_columns = ["gender", "usertype", "start_station_id", "end_station_id"]

for column_name in date_columns:
    data[column_name] = pd.to_datetime(data[column_name])

for column_name in categorical_columns:
    data[column_name] = data[column_name].astype('category')

### Drop unnecessary column

In [7]:
# 'Unnamed: 0' appears to be a saved row counter (it runs from 0 to the row
# count in the describe() output), so it carries no information of its own.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
# Print the frame summary again to inspect column dtypes and memory usage at this point.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17548339 entries, 0 to 17548338
Data columns (total 13 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   tripduration             int64         
 1   starttime                datetime64[ns]
 2   stoptime                 datetime64[ns]
 3   start_station_id         category      
 4   start_station_latitude   float64       
 5   start_station_longitude  float64       
 6   end_station_id           category      
 7   end_station_latitude     float64       
 8   end_station_longitude    float64       
 9   bikeid                   int64         
 10  usertype                 category      
 11  birth_year               int64         
 12  gender                   category      
dtypes: category(4), datetime64[ns](2), float64(4), int64(3)
memory usage: 1.3 GB


In [9]:
# Order trips chronologically by start time and renumber the rows 0..N-1.
data = data.sort_values('starttime', ignore_index=True)

### Add date information

In [10]:
# Derive hour-of-day and calendar-date columns from the trip start and stop timestamps.
pickup_times = data["starttime"].dt
dropoff_times = data["stoptime"].dt

data["pickup_hour"] = pickup_times.hour
data["pickup_day"] = pickup_times.date
data["dropoff_hour"] = dropoff_times.hour
data["dropoff_day"] = dropoff_times.date

In [11]:
# Show the first five rows to eyeball the derived columns.
display(data.head(n=5))

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,pickup_hour,pickup_day,dropoff_hour,dropoff_day
0,196,2018-01-01 00:01:50.650,2018-01-01 00:05:07.438,315.0,40.703554,-74.006702,259.0,40.701221,-74.012342,18534,Subscriber,1997,1,0,2018-01-01,0,2018-01-01
1,207,2018-01-01 00:02:43.918,2018-01-01 00:06:11.383,3224.0,40.739974,-74.005139,470.0,40.743453,-74.00004,19651,Subscriber,1978,1,0,2018-01-01,0,2018-01-01
2,613,2018-01-01 00:03:15.116,2018-01-01 00:13:28.480,386.0,40.714948,-74.002345,2008.0,40.705693,-74.016777,21678,Subscriber,1982,1,0,2018-01-01,0,2018-01-01
3,375,2018-01-01 00:06:43.709,2018-01-01 00:12:59.450,466.0,40.743954,-73.991449,325.0,40.736245,-73.984738,29822,Subscriber,1982,1,0,2018-01-01,0,2018-01-01
4,402,2018-01-01 00:06:56.924,2018-01-01 00:13:39.792,438.0,40.727791,-73.985649,380.0,40.734011,-74.002939,30722,Subscriber,1989,1,0,2018-01-01,0,2018-01-01


In [12]:
# Write the processed trips to CSV without the positional index.
output_csv_path = 'processed_trips_2018.csv'
data.to_csv(output_csv_path, index=False)