### Loading libraries, datasets

In [84]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

In [85]:
# Load the training trips; the public test set is read separately further down
raw_data = pd.read_csv("train.csv")

### Get Computed Time from POLYLINE

Our goal is to predict the travel-time of the taxi, which can be derived from the POLYLINE length.

Recall:

```
The travel time of the trip (the prediction target of this project) is defined as the (number of points-1) x 15 seconds. 
For example, a trip with 101 data points in POLYLINE has a length of (101-1) * 15 = 1500 seconds. Some trips have missing 
data points in POLYLINE, indicated by MISSING_DATA column, and it is part of the challenge how you utilize this knowledge.
```

We are not doing anything with the MISSING_DATA. It is up to you to find a way to use (or ignore) that information.

In [86]:
# For every trip, convert its POLYLINE string into a travel time in seconds
def polyline_to_trip_duration(polyline):
  """Return the trip duration in seconds encoded by a POLYLINE string.

  The number of points is taken as count("[") - 1 (one bracket opens the outer
  list), and consecutive points are 15 seconds apart, so the duration is
  (points - 1) * 15. Strings with fewer than two points yield 0.
  """
  intervals = polyline.count("[") - 2
  if intervals < 0:
    return 0
  return intervals * 15

# Add a "LEN" column holding each trip's duration in seconds:
# (polyline_length - 1) * 15, where polyline_length = count("[") - 1
raw_data["LEN"] = raw_data["POLYLINE"].apply(polyline_to_trip_duration)

In [87]:
from datetime import datetime
def parse_time(x, tz=None):
  """Split a row's Unix TIMESTAMP into (year, month, day, hour, weekday).

  Parameters
  ----------
  x : mapping or pandas Series
      Must expose a "TIMESTAMP" entry holding seconds since the Unix epoch.
  tz : datetime.tzinfo, optional
      Timezone used to interpret the timestamp. With the default (None),
      datetime.fromtimestamp converts to the *local timezone of the machine
      running the notebook*, so the derived features can differ between
      machines. Pass an explicit tzinfo (e.g. datetime.timezone.utc) to make
      the result reproducible.

  Returns
  -------
  tuple of int
      (year, month, day, hour, weekday) with weekday 0 = Monday.
  """
  # We are using python's builtin datetime library
  # https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp

  # Each x is essentially a 1 row, 1 column pandas Series
  dt = datetime.fromtimestamp(x["TIMESTAMP"], tz=tz)
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

# parse_time returns a (year, month, day, hour, weekday) tuple for each row, so it is
# applied row-wise (axis=1) and result_type="expand" spreads the tuple over five columns.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
raw_data[["YR", "MON", "DAY", "HR", "WK"]] = raw_data[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

## Feature Engineering

* Create binary data: CALL_TYPE_A, CALL_TYPE_B, CALL_TYPE_C (the DAY_TYPE_A, DAY_TYPE_B, DAY_TYPE_C and MISSING indicators are currently commented out)

In [88]:
# Keep only the trips whose MISSING_DATA flag is False: the test data has no
# trips with missing points, so such trips are removed from the training data.

raw_data = raw_data[raw_data['MISSING_DATA'] == False]

In [89]:
# Drop trips whose computed duration (LEN) is 0.
# .copy() detaches the filtered result from the original frame so that the many
# column assignments in the following cells do not raise SettingWithCopyWarning.
raw_data = raw_data[raw_data['LEN'] != 0].copy()

In [90]:
# Helpers for 0/1 indicators of the values "A", "B" and "C"; used below to build CALL_TYPE_A, CALL_TYPE_B, CALL_TYPE_C
def create_binary_type_a(x):
    """Return 1 when x equals the string "A", otherwise 0."""
    return 1 if x == "A" else 0
    
def create_binary_type_b(x):
    """Return 1 when x equals the string "B", otherwise 0."""
    return 1 if x == "B" else 0

def create_binary_type_c(x):
    """Return 1 when x equals the string "C", otherwise 0."""
    return 1 if x == "C" else 0
# One 0/1 indicator column per CALL_TYPE value ("A", "B", "C")
raw_data['CALL_TYPE_A'] = raw_data['CALL_TYPE'].apply(create_binary_type_a)
raw_data['CALL_TYPE_B'] = raw_data['CALL_TYPE'].apply(create_binary_type_b)
raw_data['CALL_TYPE_C'] = raw_data['CALL_TYPE'].apply(create_binary_type_c)

In [91]:
# construct MISSING DATA binary variable
# def create_binary_missing(x):
#     if x is True:
#         return 1
#     else:
#         return 0

# raw_data['MISSING'] = raw_data['MISSING_DATA'].apply(create_binary_missing)

* Create interaction features: CALL_A_DAY_A, CALL_A_DAY_B, CALL_A_DAY_C, CALL_B_DAY_A, CALL_B_DAY_B, CALL_B_DAY_C (currently commented out, since the DAY_TYPE indicators are not built)

In [92]:
# raw_data['CALL_A_DAY_A'] = raw_data['CALL_TYPE_A'] * raw_data['DAY_TYPE_A']
# raw_data['CALL_A_DAY_B'] = raw_data['CALL_TYPE_A'] * raw_data['DAY_TYPE_B']
# raw_data['CALL_A_DAY_C'] = raw_data['CALL_TYPE_A'] * raw_data['DAY_TYPE_C']
# raw_data['CALL_B_DAY_A'] = raw_data['CALL_TYPE_B'] * raw_data['DAY_TYPE_A']
# raw_data['CALL_B_DAY_B'] = raw_data['CALL_TYPE_B'] * raw_data['DAY_TYPE_B']
# raw_data['CALL_B_DAY_C'] = raw_data['CALL_TYPE_B'] * raw_data['DAY_TYPE_C']

* Create time-based features: MORNING, AFTERNOON, EVENING, NIGHT, S1, S2, S3, S4

In [93]:
# deal with day variables
def morning(x):
    """Return 1 when the hour x lies in [8, 12), otherwise 0."""
    return 1 if 8 <= x < 12 else 0

def afternoon(x):
    """Return 1 when the hour x lies in [12, 18], otherwise 0.

    The lower bound is inclusive: `morning` stops before 12, so a strict
    `x > 12` here would leave hour 12 outside every time-of-day bucket.
    """
    if x >= 12 and x <= 18:
        return 1
    else:
        return 0

def evening(x):
    """Return 1 when the hour x lies in (18, 24], otherwise 0."""
    return 1 if 18 < x <= 24 else 0

def night(x):
    """Return 1 when the hour x lies in [0, 8), otherwise 0."""
    return 1 if 0 <= x < 8 else 0
    
# 0/1 time-of-day indicators computed from the HR (hour) column
raw_data['MORNING'] = raw_data['HR'].apply(morning)
raw_data['AFTERNOON'] = raw_data['HR'].apply(afternoon)
raw_data['EVENING'] = raw_data['HR'].apply(evening)
raw_data['NIGHT'] = raw_data['HR'].apply(night)

In [94]:
def season1(x):
    """Return 1 when the month x lies in [1, 4) (January-March), otherwise 0."""
    return 1 if 1 <= x < 4 else 0

def season2(x):
    """Return 1 when the month x lies in [4, 7) (April-June), otherwise 0."""
    return 1 if 4 <= x < 7 else 0
    
def season3(x):
    """Return 1 when the month x lies in [7, 10) (July-September), otherwise 0."""
    return 1 if 7 <= x < 10 else 0
    
def season4(x):
    """Return 1 when the month x lies in [10, 12] (October-December), otherwise 0."""
    return 1 if 10 <= x <= 12 else 0

# 0/1 quarter-of-year indicators computed from MON
# (S1 = months 1-3, S2 = months 4-6, S3 = months 7-9, S4 = months 10-12)
raw_data['S1'] = raw_data['MON'].apply(season1)
raw_data['S2'] = raw_data['MON'].apply(season2)
raw_data['S3'] = raw_data['MON'].apply(season3)
raw_data['S4'] = raw_data['MON'].apply(season4)

In [95]:
# process datetime data in more details

def hour_sin(x):
    """Sine of the hour x mapped onto a circle with a period of 24."""
    angle = 2 * np.pi * x / 24.0
    return np.sin(angle)

def hour_cos(x):
    """Cosine of the hour x mapped onto a circle with a period of 24."""
    angle = 2 * np.pi * x / 24.0
    return np.cos(angle)

def day_sin(x):
    """Sine of the day-of-month x mapped onto a circle with a fixed period of 30."""
    angle = 2 * np.pi * x / 30.0
    return np.sin(angle)

def day_cos(x):
    """Cosine of the day-of-month x mapped onto a circle with a fixed period of 30."""
    angle = 2 * np.pi * x / 30.0
    return np.cos(angle)

def week_sin(x):
    """Sine of the weekday x mapped onto a circle with a period of 7."""
    angle = 2 * np.pi * x / 7.0
    return np.sin(angle)

def week_cos(x):
    """Cosine of the weekday x mapped onto a circle with a period of 7."""
    angle = 2 * np.pi * x / 7.0
    return np.cos(angle)

def month_sin(x):
    """Sine of the month x mapped onto a circle with a period of 12."""
    angle = 2 * np.pi * x / 12.0
    return np.sin(angle)

def month_cos(x):
    """Cosine of the month x mapped onto a circle with a period of 12."""
    angle = 2 * np.pi * x / 12.0
    return np.cos(angle)

def year_dummy(x):
    """Return 0 when the year x is 2013 and 1 for any other value."""
    return 0 if x == 2013 else 1

# raw_data["HR_SIN"] = raw_data['HR'].apply(hour_sin)
# raw_data["HR_COS"] = raw_data['HR'].apply(hour_cos)
# raw_data["DAY_SIN"] = raw_data['DAY'].apply(day_sin)
# raw_data["DAY_COS"] = raw_data['DAY'].apply(day_cos)
# raw_data["WK_SIN"] = raw_data['WK'].apply(week_sin)
# raw_data["WK_COS"] = raw_data['WK'].apply(week_cos)
# raw_data["MON_SIN"] = raw_data['MON'].apply(month_sin)
# raw_data["MON_COS"] = raw_data['MON'].apply(month_cos)
# Year indicator: 0 for trips in 2013, 1 for any other year
raw_data["YR_DUMMY"] = raw_data['YR'].apply(year_dummy)


In [96]:
# Interaction features: every CALL_TYPE indicator multiplied by every time-of-day
# indicator, then by every season indicator. Each inner tuple is
# (suffix of the new column, name of the existing indicator column).
# The loop order reproduces the original column order: all time-of-day products
# for A, B, C first (CALL_A_MORN ... CALL_C_NIT), then all season products
# (CALL_A_S1 ... CALL_C_S4).
for _suffix_to_source in (
    (("MORN", "MORNING"), ("AFTER", "AFTERNOON"), ("EVE", "EVENING"), ("NIT", "NIGHT")),
    (("S1", "S1"), ("S2", "S2"), ("S3", "S3"), ("S4", "S4")),
):
    for _call in ("A", "B", "C"):
        for _suffix, _source in _suffix_to_source:
            raw_data["CALL_" + _call + "_" + _suffix] = raw_data["CALL_TYPE_" + _call] * raw_data[_source]

# raw_data['DAY_A_MORN'] = raw_data['DAY_TYPE_A'] * raw_data['MORNING']
# raw_data['DAY_A_AFTER'] = raw_data['DAY_TYPE_A'] * raw_data['AFTERNOON']
# raw_data['DAY_A_EVE'] = raw_data['DAY_TYPE_A'] * raw_data['EVENING']
# raw_data['DAY_A_NIT'] = raw_data['DAY_TYPE_A'] * raw_data['NIGHT']
# raw_data['DAY_B_MORN'] = raw_data['DAY_TYPE_B'] * raw_data['MORNING']
# raw_data['DAY_B_AFTER'] = raw_data['DAY_TYPE_B'] * raw_data['AFTERNOON']
# raw_data['DAY_B_EVE'] = raw_data['DAY_TYPE_B'] * raw_data['EVENING']
# raw_data['DAY_B_NIT'] = raw_data['DAY_TYPE_B'] * raw_data['NIGHT']
# raw_data['DAY_C_MORN'] = raw_data['DAY_TYPE_C'] * raw_data['MORNING']
# raw_data['DAY_C_AFTER'] = raw_data['DAY_TYPE_C'] * raw_data['AFTERNOON']
# raw_data['DAY_C_EVE'] = raw_data['DAY_TYPE_C'] * raw_data['EVENING']
# raw_data['DAY_C_NIT'] = raw_data['DAY_TYPE_C'] * raw_data['NIGHT']

# raw_data['DAY_A_S1'] = raw_data['DAY_TYPE_A'] * raw_data['S1']
# raw_data['DAY_A_S2'] = raw_data['DAY_TYPE_A'] * raw_data['S2']
# raw_data['DAY_A_S3'] = raw_data['DAY_TYPE_A'] * raw_data['S3']
# raw_data['DAY_A_S4'] = raw_data['DAY_TYPE_A'] * raw_data['S4']
# raw_data['DAY_B_S1'] = raw_data['DAY_TYPE_B'] * raw_data['S1']
# raw_data['DAY_B_S2'] = raw_data['DAY_TYPE_B'] * raw_data['S2']
# raw_data['DAY_B_S3'] = raw_data['DAY_TYPE_B'] * raw_data['S3']
# raw_data['DAY_B_S4'] = raw_data['DAY_TYPE_B'] * raw_data['S4']
# raw_data['DAY_C_S1'] = raw_data['DAY_TYPE_C'] * raw_data['S1']
# raw_data['DAY_C_S2'] = raw_data['DAY_TYPE_C'] * raw_data['S2']
# raw_data['DAY_C_S3'] = raw_data['DAY_TYPE_C'] * raw_data['S3']
# raw_data['DAY_C_S4'] = raw_data['DAY_TYPE_C'] * raw_data['S4']


# raw_data['CALL_A_MISS'] = raw_data['CALL_TYPE_A'] * raw_data['MISSING']
# raw_data['CALL_B_MISS'] = raw_data['CALL_TYPE_B'] * raw_data['MISSING']
# raw_data['CALL_C_MISS'] = raw_data['CALL_TYPE_C'] * raw_data['MISSING']

# raw_data['DAY_A_MISS'] = raw_data['DAY_TYPE_A'] * raw_data['MISSING'] 
# raw_data['DAY_B_MISS'] = raw_data['DAY_TYPE_B'] * raw_data['MISSING'] 
# raw_data['DAY_C_MISS'] = raw_data['DAY_TYPE_C'] * raw_data['MISSING'] 


In [97]:
# Sorted list of the distinct TAXI_ID values present in the training data
unique_taxi_id = np.unique(raw_data["TAXI_ID"]).tolist()
# TAXI_ID -> its position in unique_taxi_id. A dict lookup replaces the per-row
# list.index() scan, which walked the whole list for every trip.
taxi_id_to_cat = {taxi_id: position for position, taxi_id in enumerate(unique_taxi_id)}

def set_id_category(x):
    """Return the integer category of taxi id x (its position in unique_taxi_id).

    Raises
    ------
    ValueError
        If x is not one of the taxi ids collected in unique_taxi_id
        (the same exception type list.index() raised before).
    """
    try:
        return taxi_id_to_cat[x]
    except KeyError:
        raise ValueError("{!r} is not in list".format(x)) from None

raw_data["TAXI_ID_CAT"] = raw_data['TAXI_ID'].apply(set_id_category) 


In [98]:
# Show the column names present after the feature-engineering cells above
raw_data.columns

Index(['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'LEN', 'YR', 'MON',
       'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C',
       'MORNING', 'AFTERNOON', 'EVENING', 'NIGHT', 'S1', 'S2', 'S3', 'S4',
       'YR_DUMMY', 'CALL_A_MORN', 'CALL_A_AFTER', 'CALL_A_EVE', 'CALL_A_NIT',
       'CALL_B_MORN', 'CALL_B_AFTER', 'CALL_B_EVE', 'CALL_B_NIT',
       'CALL_C_MORN', 'CALL_C_AFTER', 'CALL_C_EVE', 'CALL_C_NIT', 'CALL_A_S1',
       'CALL_A_S2', 'CALL_A_S3', 'CALL_A_S4', 'CALL_B_S1', 'CALL_B_S2',
       'CALL_B_S3', 'CALL_B_S4', 'CALL_C_S1', 'CALL_C_S2', 'CALL_C_S3',
       'CALL_C_S4', 'TAXI_ID_CAT'],
      dtype='object')

In [99]:
# new_data = raw_data.drop(['TRIP_ID','CALL_TYPE','TAXI_ID','TIMESTAMP','DAY_TYPE','MISSING_DATA', 'POLYLINE','YR', 'MON','DAY', 'HR', 'WK'],axis=1)

# Drop the raw identifier/source columns and YR; unlike the commented-out
# variant above, MON, DAY, HR and WK are kept.
new_data = raw_data.drop(['TRIP_ID','CALL_TYPE','TAXI_ID','TIMESTAMP','DAY_TYPE','MISSING_DATA', 'POLYLINE','YR'],axis=1)

In [100]:
# Replace every NaN with 0 (in place). The stand helpers defined later treat an
# ORIGIN_STAND of 0 as "no stand", so 0 doubles as the missing-value marker.
new_data.fillna(0,inplace=True)

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,LEN,MON,DAY,HR,WK,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C,...,CALL_A_S4,CALL_B_S1,CALL_B_S2,CALL_B_S3,CALL_B_S4,CALL_C_S1,CALL_C_S2,CALL_C_S3,CALL_C_S4,TAXI_ID_CAT
131,8939.0,0.0,930,7,1,10,0,1,0,0,...,0,0,0,0,0,0,0,0,0,74
183,8939.0,0.0,540,7,1,11,0,1,0,0,...,0,0,0,0,0,0,0,0,0,258
208,6654.0,0.0,1395,7,1,10,0,1,0,0,...,0,0,0,0,0,0,0,0,0,118
463,6584.0,0.0,1035,7,1,12,0,1,0,0,...,0,0,0,0,0,0,0,0,0,206
726,14045.0,0.0,675,7,1,15,0,1,0,0,...,0,0,0,0,0,0,0,0,0,129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709199,6654.0,0.0,1200,7,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,93
1709976,4619.0,0.0,645,7,1,5,1,1,0,0,...,0,0,0,0,0,0,0,0,0,258
1710435,2024.0,0.0,255,7,1,7,1,1,0,0,...,0,0,0,0,0,0,0,0,0,238
1710451,2024.0,0.0,195,7,1,7,1,1,0,0,...,0,0,0,0,0,0,0,0,0,407


In [108]:
# Keep only the rows whose ORIGIN_CALL value occurs in more than 100 rows.
# NaNs were replaced by 0 in the previous cell, so rows without a caller id
# are grouped under ORIGIN_CALL == 0.
groupby_df = new_data.groupby("ORIGIN_CALL").filter(lambda x: (len(x) > 100))
unique_phone_id = np.unique(groupby_df["ORIGIN_CALL"]).tolist()
unique_phone_id = [int(x) for x in unique_phone_id]
# Caller id -> 1-based position in unique_phone_id; 0 stays reserved for ids
# that are not in the list. A dict lookup replaces the two linear list scans
# (`in` followed by `.index`) that were performed for every row.
phone_id_to_cat = {phone_id: position + 1 for position, phone_id in enumerate(unique_phone_id)}

def set_phone_category(x):
    """Return the category of caller id x: its 1-based position in
    unique_phone_id, or 0 when int(x) is not in that list."""
    return phone_id_to_cat.get(int(x), 0)

new_data["ORIGIN_CALL_CAT"] = new_data['ORIGIN_CALL'].apply(set_phone_category) 

In [110]:
import math
# Sorted distinct ORIGIN_STAND values found in new_data, converted to int
# (missing stands were filled with 0 earlier, so 0 is one of the values)
unique_stand_id = np.unique(new_data["ORIGIN_STAND"]).tolist()
unique_stand_id = [int(x) for x in unique_stand_id]

# Taxi-stand metadata indexed by stand ID; the helpers below read its
# "Descricao", "Latitude" and "Longitude" columns
stand_location_info = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv",index_col="ID")
def set_stand_locate(x):
    """Return the "Descricao" entry of stand x from stand_location_info,
    or None when int(x) is 0 (the marker used for a missing stand)."""
    stand_id = int(x)
    if stand_id == 0:
        return None
    return stand_location_info.loc[stand_id, "Descricao"]

def set_stand_category(x):
    """Return the position of stand id int(x) within unique_stand_id.

    list.index raises ValueError when the id is not in the list.
    """
    stand_id = int(x)
    return unique_stand_id.index(stand_id)

# def set_stand_lat(x):
#     if int(x) == 0:
#         return 0
#     else:
#         lat = stand_location_info.loc[int(x),"Latitude"]
#     return lat

# def set_stand_lot(x):
#     if int(x) == 0:
#         return 0
#     else:
#         lot = stand_location_info.loc[int(x),"Longitude"]
#     return lot

def calculate_euclidean_dist(lat1, lon1, lat2, lon2):
    """Return the straight-line (Euclidean) distance between the points
    (lat1, lon1) and (lat2, lon2), in the same units as the inputs."""
    d_lat = lat2 - lat1
    d_lon = lon2 - lon1
    return math.sqrt(d_lat ** 2 + d_lon ** 2)

def calculate_distance(x):
    """Return the Euclidean distance, in raw latitude/longitude units, between
    stand x and the stand with ID 1, both looked up in stand_location_info.

    Returns -1 when int(x) is 0, the marker used for a missing stand.
    """
    stand_id = int(x)
    if stand_id == 0:
        return -1

    # Coordinates of the requested stand
    stand_lat = stand_location_info.loc[stand_id, "Latitude"]
    stand_lon = stand_location_info.loc[stand_id, "Longitude"]

    # Coordinates of the reference stand (ID 1)
    ref_lat = stand_location_info.loc[1, "Latitude"]
    ref_lon = stand_location_info.loc[1, "Longitude"]

    return calculate_euclidean_dist(stand_lat, stand_lon, ref_lat, ref_lon)
    

# new_data["STAND_LOCATE"] = new_data['ORIGIN_STAND'].apply(set_stand_locate)
# Encode each stand as its position in unique_stand_id, and add its distance
# to stand 1 (-1 when the trip has no stand)
new_data["ORIGIN_STAND_CAT"] = new_data['ORIGIN_STAND'].apply(set_stand_category)
new_data["DISTANCE"] = new_data['ORIGIN_STAND'].apply(calculate_distance)


In [111]:
# The raw id columns are no longer needed now that ORIGIN_CALL_CAT,
# ORIGIN_STAND_CAT and DISTANCE have been derived from them
new_data = new_data.drop(['ORIGIN_CALL','ORIGIN_STAND'],axis=1)

## Make target encoding

In [76]:
# import xam

# encoder = xam.feature_extraction.BayesianTargetEncoder(
#     columns=['ORIGIN_CALL_CAT'],
#     prior_weight=3,
#     suffix=''
# )

# new_data[['ORIGIN_CALL_CAT']] = encoder.fit_transform(new_data[['ORIGIN_CALL_CAT']],new_data['LEN'] )


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[name + self.suffix] = x.map(posteriors).fillna(self.prior_).astype(float)


## Standardize the raw data

In [112]:
from sklearn.preprocessing import StandardScaler


# Scaler object; fitted on the training data below and reused (transform only)
# when the test data is processed
scalar = StandardScaler()

# new_data[['HR_SIN', 'HR_COS', 'DAY_SIN', 'DAY_COS','MON_SIN', 
#           'MON_COS', 'YR_DUMMY', 'WK_SIN', 'WK_COS','ORIGIN_CALL_CAT','DISTANCE']] = scalar.fit_transform(new_data[['HR_SIN', 'HR_COS', 'DAY_SIN', 'DAY_COS','MON_SIN', 'MON_COS', 
#                                                                                        'YR_DUMMY', 'WK_SIN', 'WK_COS','ORIGIN_CALL_CAT','DISTANCE']])
# Fit the scaler on the training DISTANCE column (which includes the -1
# "no stand" marker) and replace the column with its standardized values
new_data[['DISTANCE']] = scalar.fit_transform(new_data[['DISTANCE']])

# new_data[['HR_SIN', 'HR_COS', 'DAY_SIN', 'DAY_COS','MON_SIN', 
#           'MON_COS', 'YR_DUMMY', 'WK_SIN', 'WK_COS','ORIGIN_CALL_CAT','ORIGIN_STAND_CAT','TAXI_ID_CAT']] = scalar.fit_transform(new_data[['HR_SIN', 'HR_COS', 'DAY_SIN', 'DAY_COS','MON_SIN', 'MON_COS', 
#                                                                                        'YR_DUMMY', 'WK_SIN', 'WK_COS','ORIGIN_CALL_CAT','ORIGIN_STAND_CAT','TAXI_ID_CAT']])

## Save the engineered training features

In [113]:
# Write the engineered training features to disk (to_csv also writes the
# DataFrame index as the first, unnamed column)
new_data.to_csv("feature_eng_cat.csv")

## Process test data

In [114]:
# Load the public test set; the cells below apply the same feature steps to it
test_data = pd.read_csv("test_public.csv")

In [115]:
# Display the raw test frame
test_data

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA
0,T1,B,,15.0,20000542,1408039037,A,False
1,T2,B,,57.0,20000108,1408038611,A,False
2,T3,B,,15.0,20000370,1408038568,A,False
3,T4,B,,53.0,20000492,1408039090,A,False
4,T5,B,,18.0,20000621,1408039177,A,False
...,...,...,...,...,...,...,...,...
315,T323,A,70885.0,,20000430,1419171485,A,False
316,T324,B,,53.0,20000020,1419170802,A,False
317,T325,C,,,20000207,1419172121,A,False
318,T326,A,76232.0,,20000667,1419171980,A,False


In [116]:
# Same date-part and CALL_TYPE indicator features as for the training data
test_data[["YR", "MON", "DAY", "HR", "WK"]] = test_data[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
test_data['CALL_TYPE_A'] = test_data['CALL_TYPE'].apply(create_binary_type_a)
test_data['CALL_TYPE_B'] = test_data['CALL_TYPE'].apply(create_binary_type_b)
test_data['CALL_TYPE_C'] = test_data['CALL_TYPE'].apply(create_binary_type_c)

# test_data['DAY_TYPE_A'] = test_data['DAY_TYPE'].apply(create_binary_type_a)
# test_data['DAY_TYPE_B'] = test_data['DAY_TYPE'].apply(create_binary_type_b)
# test_data['DAY_TYPE_C'] = test_data['DAY_TYPE'].apply(create_binary_type_c)
# test_data['MISSING'] = test_data['MISSING_DATA'].apply(create_binary_missing)

# test_data['CALL_A_DAY_A'] = test_data['CALL_TYPE_A'] * test_data['DAY_TYPE_A']
# test_data['CALL_A_DAY_B'] = test_data['CALL_TYPE_A'] * test_data['DAY_TYPE_B']
# test_data['CALL_A_DAY_C'] = test_data['CALL_TYPE_A'] * test_data['DAY_TYPE_C']
# test_data['CALL_B_DAY_A'] = test_data['CALL_TYPE_B'] * test_data['DAY_TYPE_A']
# test_data['CALL_B_DAY_B'] = test_data['CALL_TYPE_B'] * test_data['DAY_TYPE_B']
# test_data['CALL_B_DAY_C'] = test_data['CALL_TYPE_B'] * test_data['DAY_TYPE_C']

# 0/1 time-of-day indicators computed from HR
test_data['MORNING'] = test_data['HR'].apply(morning)
test_data['AFTERNOON'] = test_data['HR'].apply(afternoon)
test_data['EVENING'] = test_data['HR'].apply(evening)
test_data['NIGHT'] = test_data['HR'].apply(night)

# 0/1 quarter-of-year indicators computed from MON
test_data['S1'] = test_data['MON'].apply(season1)
test_data['S2'] = test_data['MON'].apply(season2)
test_data['S3'] = test_data['MON'].apply(season3)
test_data['S4'] = test_data['MON'].apply(season4)

# test_data["HR_SIN"] = test_data['HR'].apply(hour_sin)
# test_data["HR_COS"] = test_data['HR'].apply(hour_cos)
# test_data["DAY_SIN"] = test_data['DAY'].apply(day_sin)
# test_data["DAY_COS"] = test_data['DAY'].apply(day_cos)
# test_data["WK_SIN"] = test_data['WK'].apply(week_sin)
# test_data["WK_COS"] = test_data['WK'].apply(week_cos)
# test_data["MON_SIN"] = test_data['MON'].apply(month_sin)
# test_data["MON_COS"] = test_data['MON'].apply(month_cos)
# Year indicator: 0 for trips in 2013, 1 for any other year
test_data["YR_DUMMY"] = test_data['YR'].apply(year_dummy)

# Interaction features for the test data: every CALL_TYPE indicator multiplied
# by every time-of-day indicator, then by every season indicator. Each inner
# tuple is (suffix of the new column, name of the existing indicator column).
# The loop order reproduces the original column order: all time-of-day products
# for A, B, C first (CALL_A_MORN ... CALL_C_NIT), then all season products
# (CALL_A_S1 ... CALL_C_S4).
for _suffix_to_source in (
    (("MORN", "MORNING"), ("AFTER", "AFTERNOON"), ("EVE", "EVENING"), ("NIT", "NIGHT")),
    (("S1", "S1"), ("S2", "S2"), ("S3", "S3"), ("S4", "S4")),
):
    for _call in ("A", "B", "C"):
        for _suffix, _source in _suffix_to_source:
            test_data["CALL_" + _call + "_" + _suffix] = test_data["CALL_TYPE_" + _call] * test_data[_source]

# test_data['DAY_A_MORN'] = test_data['DAY_TYPE_A'] * test_data['MORNING']
# test_data['DAY_A_AFTER'] = test_data['DAY_TYPE_A'] * test_data['AFTERNOON']
# test_data['DAY_A_EVE'] = test_data['DAY_TYPE_A'] * test_data['EVENING']
# test_data['DAY_A_NIT'] = test_data['DAY_TYPE_A'] * test_data['NIGHT']
# test_data['DAY_B_MORN'] = test_data['DAY_TYPE_B'] * test_data['MORNING']
# test_data['DAY_B_AFTER'] = test_data['DAY_TYPE_B'] * test_data['AFTERNOON']
# test_data['DAY_B_EVE'] = test_data['DAY_TYPE_B'] * test_data['EVENING']
# test_data['DAY_B_NIT'] = test_data['DAY_TYPE_B'] * test_data['NIGHT']
# test_data['DAY_C_MORN'] = test_data['DAY_TYPE_C'] * test_data['MORNING']
# test_data['DAY_C_AFTER'] = test_data['DAY_TYPE_C'] * test_data['AFTERNOON']
# test_data['DAY_C_EVE'] = test_data['DAY_TYPE_C'] * test_data['EVENING']
# test_data['DAY_C_NIT'] = test_data['DAY_TYPE_C'] * test_data['NIGHT']

# test_data['DAY_A_S1'] = test_data['DAY_TYPE_A'] * test_data['S1']
# test_data['DAY_A_S2'] = test_data['DAY_TYPE_A'] * test_data['S2']
# test_data['DAY_A_S3'] = test_data['DAY_TYPE_A'] * test_data['S3']
# test_data['DAY_A_S4'] = test_data['DAY_TYPE_A'] * test_data['S4']
# test_data['DAY_B_S1'] = test_data['DAY_TYPE_B'] * test_data['S1']
# test_data['DAY_B_S2'] = test_data['DAY_TYPE_B'] * test_data['S2']
# test_data['DAY_B_S3'] = test_data['DAY_TYPE_B'] * test_data['S3']
# test_data['DAY_B_S4'] = test_data['DAY_TYPE_B'] * test_data['S4']
# test_data['DAY_C_S1'] = test_data['DAY_TYPE_C'] * test_data['S1']
# test_data['DAY_C_S2'] = test_data['DAY_TYPE_C'] * test_data['S2']
# test_data['DAY_C_S3'] = test_data['DAY_TYPE_C'] * test_data['S3']
# test_data['DAY_C_S4'] = test_data['DAY_TYPE_C'] * test_data['S4']


# test_data['CALL_A_MISS'] = test_data['CALL_TYPE_A'] * test_data['MISSING']
# test_data['CALL_B_MISS'] = test_data['CALL_TYPE_B'] * test_data['MISSING']
# test_data['CALL_C_MISS'] = test_data['CALL_TYPE_C'] * test_data['MISSING']

# test_data['DAY_A_MISS'] = test_data['DAY_TYPE_A'] * test_data['MISSING'] 
# test_data['DAY_B_MISS'] = test_data['DAY_TYPE_B'] * test_data['MISSING'] 
# test_data['DAY_C_MISS'] = test_data['DAY_TYPE_C'] * test_data['MISSING'] 

# Reuse the taxi-id encoding built from the training data; set_id_category
# raises ValueError for a TAXI_ID that did not occur in the training data
test_data["TAXI_ID_CAT"] = test_data['TAXI_ID'].apply(set_id_category)

# one_hot_test = pd.get_dummies(test_data["TAXI_ID_CAT"])

# test_new_data = test_data.join(one_hot)

# new_data = test_data.drop(['TRIP_ID','CALL_TYPE','TAXI_ID','TIMESTAMP','DAY_TYPE','MISSING_DATA','YR', 'MON','DAY', 'HR', 'WK'],axis=1)
# NOTE: this rebinds new_data to the test features, so the training feature
# frame built above is no longer reachable under this name after this cell
new_data = test_data.drop(['TRIP_ID','CALL_TYPE','TAXI_ID','TIMESTAMP','DAY_TYPE','MISSING_DATA','YR'],axis=1)

In [117]:
# Replace every NaN with 0 (in place); 0 marks a missing ORIGIN_CALL / ORIGIN_STAND
new_data.fillna(0,inplace=True)
def set_phone_category(x):
    """Return the category of caller id x for the test data.

    The category is the 1-based position of int(x) in unique_phone_id, or 0
    when the id is not in that list. The `+ 1` must match the encoder used for
    the training data: without it every known caller is shifted by one
    category and the first known caller collides with the 0 "unknown" code.
    """
    if int(x) in unique_phone_id:
        return unique_phone_id.index(int(x)) + 1
    else:
        return 0

def set_stand_category(x):
    """Return the position of stand id int(x) within unique_stand_id.

    list.index raises ValueError when the id is not in the list.
    """
    stand_id = int(x)
    return unique_stand_id.index(stand_id)

# Encode caller and stand ids with the lists built from the training data, add
# the distance to stand 1, then drop the raw id columns
new_data["ORIGIN_CALL_CAT"] = new_data['ORIGIN_CALL'].apply(set_phone_category)
new_data["ORIGIN_STAND_CAT"] = new_data['ORIGIN_STAND'].apply(set_stand_category) 
new_data["DISTANCE"] = new_data['ORIGIN_STAND'].apply(calculate_distance)
new_data = new_data.drop(['ORIGIN_CALL','ORIGIN_STAND'],axis=1)

# new_data[['ORIGIN_CALL_CAT']] = encoder.transform(new_data[['ORIGIN_CALL_CAT']])

# new_data[['HR_SIN', 'HR_COS', 'DAY_SIN', 'DAY_COS','MON_SIN', 
#           'MON_COS', 'YR_DUMMY', 'WK_SIN', 'WK_COS',
#           'ORIGIN_CALL_CAT','ORIGIN_STAND_CAT','TAXI_ID_CAT']] = scalar.transform(new_data[['HR_SIN', 'HR_COS', 'DAY_SIN', 'DAY_COS','MON_SIN', 'MON_COS', 
#                                                                                        'YR_DUMMY', 'WK_SIN', 'WK_COS','ORIGIN_CALL_CAT','ORIGIN_STAND_CAT','TAXI_ID_CAT']])

# new_data[['HR_SIN', 'HR_COS', 'DAY_SIN', 'DAY_COS','MON_SIN', 
#           'MON_COS', 'YR_DUMMY', 'WK_SIN', 'WK_COS','ORIGIN_CALL_CAT','DISTANCE'
#           ]] = scalar.transform(new_data[['HR_SIN', 'HR_COS', 'DAY_SIN', 'DAY_COS','MON_SIN', 'MON_COS', 
#                                                                                        'YR_DUMMY', 'WK_SIN', 'WK_COS','ORIGIN_CALL_CAT','DISTANCE']])

# Standardize DISTANCE with the scaler fitted on the training data
# (transform only, no re-fit)
new_data[['DISTANCE']] = scalar.transform(new_data[['DISTANCE']])

In [83]:
# Write the engineered test features to disk (to_csv also writes the DataFrame
# index as the first, unnamed column)
new_data.to_csv("test_public_features_cat.csv")