In [37]:
import pandas as pd

In [38]:
# Load the two raw tables from CSV files located in ../Dataset (path is
# relative to the notebook's working directory).
df_cabride = pd.read_csv('../Dataset/cab_rides.csv')
df_weather = pd.read_csv('../Dataset/weather.csv')

In [39]:
def remove_missing_values(df):
    """Return ``df`` without the rows that hold at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the fully populated rows; the original
        index labels of the surviving rows are kept.
    """
    complete_rows = df.dropna(axis=0, how='any')
    return complete_rows

# Rebind the cab-ride table to its version without incomplete rows, then
# report how many rows/columns remain.
df_cabride = remove_missing_values(df_cabride)
print("The Cab rides dataset shape: ", df_cabride.shape)

The Cab rides dataset shape:  (637976, 10)


In [40]:
def filling_missing_values(df):
    """Return a copy of ``df`` in which every missing value is replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose gaps should be filled. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape as ``df`` and no missing values.
    """
    filled = df.fillna(0)
    return filled

# Rebind the weather table to its zero-filled version (no rows are removed),
# then report its shape.
df_weather = filling_missing_values(df_weather)
print("The Weather dataset shape: ", df_weather.shape)

The Weather dataset shape:  (6276, 8)


In [41]:
def convert_timestamp_to_datetime(df, unit):
    """Derive datetime helper columns from the epoch ``time_stamp`` column.

    The input frame is left untouched; a new frame is returned. (The previous
    implementation wrote the new columns straight into the caller's frame.)

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a numeric ``time_stamp`` column holding epoch values.
    unit : str
        Resolution of ``time_stamp`` as understood by ``pandas.to_datetime``
        (for example ``'s'`` or ``'ms'``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with four additional columns, in this order:
        ``datetime`` (parsed timestamp), ``date`` (``datetime.date``),
        ``time`` (``datetime.time``) and ``date_hour`` (string
        ``"<YYYY-MM-DD> <hour>"`` where the hour is NOT zero-padded).

    Raises
    ------
    KeyError
        If ``df`` has no ``time_stamp`` column.
    """
    moments = pd.to_datetime(df['time_stamp'], unit=unit)
    days = moments.dt.date
    # The hour is deliberately left unpadded ("2018-12-18 5"): the string is
    # used as a join key and must keep the same format on both tables.
    hour_key = days.astype(str) + ' ' + moments.dt.hour.astype(str)
    # assign() returns a new frame, so the caller's object is never mutated.
    return df.assign(datetime=moments, date=days, time=moments.dt.time, date_hour=hour_key)

# Add the datetime/date/time/date_hour columns to both tables. The two calls
# pass different units: 'ms' for the cab rides and 's' for the weather data.
df_cabride = convert_timestamp_to_datetime(df_cabride, 'ms')
df_weather = convert_timestamp_to_datetime(df_weather, 's')

# Show the column labels of both tables after the conversion.
print("The columns in Cab rides dataset: ", df_cabride.columns)
print("The columns in Weather dataset: ", df_weather.columns)

The columns in Cab rides dataset:  Index(['distance', 'cab_type', 'time_stamp', 'destination', 'source', 'price',
       'surge_multiplier', 'id', 'product_id', 'name', 'datetime', 'date',
       'time', 'date_hour'],
      dtype='object')
The columns in Weather dataset:  Index(['temp', 'location', 'clouds', 'pressure', 'rain', 'time_stamp',
       'humidity', 'wind', 'datetime', 'date', 'time', 'date_hour'],
      dtype='object')


In [43]:
def remain_1record_1hour(df):
    """Collapse weather readings to one averaged row per location and hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Weather table containing ``location``, ``date_hour`` and the
        measurement columns ``temp``, ``clouds``, ``pressure``, ``rain``,
        ``humidity`` and ``wind``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``(location, date_hour)`` pair holding the mean
        of each measurement column. Only the two key columns and the six
        measurement columns are present; any other input column is dropped.
    """
    group_keys = ['location', 'date_hour']
    measurements = ['temp', 'clouds', 'pressure', 'rain', 'humidity', 'wind']
    hourly = df.groupby(group_keys)[measurements].mean()
    # Turn the (location, date_hour) index back into ordinary columns.
    return hourly.reset_index()

# Rebind the weather table to its aggregated form (one averaged row per
# location and date_hour), then print its shape and the table itself.
df_weather = remain_1record_1hour(df_weather)
print("The Weather dataset shape: ", df_weather.shape)
print(df_weather)

The Weather dataset shape:  (3960, 8)
      location      date_hour    temp  clouds  pressure    rain  humidity  \
0     Back Bay  2018-11-26 10  40.695   0.995  1014.800  0.0000     0.940   
1     Back Bay  2018-11-26 11  40.185   0.955  1015.305  0.0000     0.940   
2     Back Bay  2018-11-26 12  40.315   0.920  1015.275  0.0000     0.935   
3     Back Bay  2018-11-26 13  41.950   0.910  1015.200  0.0031     0.910   
4     Back Bay  2018-11-26 14  42.765   0.905  1014.695  0.0044     0.910   
...        ...            ...     ...     ...       ...     ...       ...   
3955  West End   2018-12-18 5  34.720   0.810  1001.740  0.0371     0.650   
3956  West End   2018-12-18 6  33.340   1.000  1002.440  0.0000     0.620   
3957  West End   2018-12-18 7  29.930   1.000  1003.150  0.0000     0.590   
3958  West End   2018-12-18 8  28.320   0.970  1005.300  0.0000     0.560   
3959  West End   2018-12-18 9  27.340   0.800  1006.060  0.0000     0.540   

        wind  
0      1.895  
1      

In [44]:
# Lookup table from the ride product label (values of the `name` column) to a
# coarser service category.
categories = {
    'UberX': 'standard',
    'Lyft': 'standard',
    'UberPool': 'shared ride',
    'Shared': 'shared ride',
    'UberXL': 'large vehicle',
    'Lyft XL': 'large vehicle',
    'Black': 'luxury',
    'Lux Black': 'luxury',
    'Black SUV': 'luxury SUV',
    'Lux Black XL': 'luxury SUV',
    'Lux': 'other',
    'WAV': 'other',
    'Taxi': 'other'
}

# Add a `category` column by translating `name` through the lookup table.
# Series.map yields NaN for any name that is not a key of the dict.
df_cabride['category'] = df_cabride['name'].map(categories)
print(df_cabride)

        distance cab_type     time_stamp    destination            source  \
0           0.44     Lyft  1544952607890  North Station  Haymarket Square   
1           0.44     Lyft  1543284023677  North Station  Haymarket Square   
2           0.44     Lyft  1543366822198  North Station  Haymarket Square   
3           0.44     Lyft  1543553582749  North Station  Haymarket Square   
4           0.44     Lyft  1543463360223  North Station  Haymarket Square   
...          ...      ...            ...            ...               ...   
693065      1.00     Uber  1543708385534      North End          West End   
693066      1.00     Uber  1543708385534      North End          West End   
693067      1.00     Uber  1543708385534      North End          West End   
693069      1.00     Uber  1543708385534      North End          West End   
693070      1.00     Uber  1543708385534      North End          West End   

        price  surge_multiplier                                    id  \
0 

In [45]:
def create_merge_data(df_cabride, df_weather):
    """Attach weather measurements to each ride via an inner join.

    Rides are matched to weather rows on the pair
    ``(source, date_hour)`` == ``(location, date_hour)``.

    Parameters
    ----------
    df_cabride : pandas.DataFrame
        Ride table with ``source`` and ``date_hour`` columns.
    df_weather : pandas.DataFrame
        Weather table with ``location`` and ``date_hour`` columns.

    Returns
    -------
    pandas.DataFrame
        Joined table. Because the join is an inner join, rides without a
        matching weather row are not present in the result.
    """
    ride_keys = ['source', 'date_hour']
    weather_keys = ['location', 'date_hour']
    return df_cabride.merge(
        df_weather,
        how='inner',
        left_on=ride_keys,
        right_on=weather_keys,
    )

# Build the combined ride + weather table and print it.
df_merge = create_merge_data(df_cabride, df_weather)
print(df_merge)

        distance cab_type     time_stamp    destination            source  \
0           0.44     Lyft  1544952607890  North Station  Haymarket Square   
1           0.44     Lyft  1543284023677  North Station  Haymarket Square   
2           0.44     Lyft  1543553582749  North Station  Haymarket Square   
3           0.44     Lyft  1543463360223  North Station  Haymarket Square   
4           0.44     Lyft  1545071112138  North Station  Haymarket Square   
...          ...      ...            ...            ...               ...   
635237      1.00     Uber  1543708385534      North End          West End   
635238      1.00     Uber  1543708385534      North End          West End   
635239      1.00     Uber  1543708385534      North End          West End   
635240      1.00     Uber  1543708385534      North End          West End   
635241      1.00     Uber  1543708385534      North End          West End   

        price  surge_multiplier                                    id  \
0 