### This script is used to characterize individuals' daily activity-travel patterns as categorical time series using data from NHTS 2017

In [None]:
#import libraries
from savReaderWriter import *
from pandas import *
import numpy as np
from collections import Counter
from dateutil.parser import parse
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
#Read data files
# Load the 2017 trip file (trippub.csv) into the DataFrame `trp`; read_csv is
# reached through the star imports at the top of the file (presumably pandas).
# The path is an absolute, machine-specific location and must be adjusted
# before running elsewhere.
trp = read_csv('C:/Users/jiz13007/Documents/Profile of Existing Travelers and Exploring Transforability for AV futures/NHTS Datasets/2017/trippub.csv')
# Show (rows, columns) as a quick check that the file loaded.
print trp.shape

In [None]:
# Preview the household/person/trip identifiers and the timing columns for the first ten rows.
print trp[['HOUSEID','PERSONID','TDTRPNUM','STRTTIME','ENDTIME','TRVLCMIN']].head(10)

In [None]:
# First is to check if all trips made by a person starts from 1
trp['CHECK_TRPNUM']=trp.groupby(['HOUSEID','PERSONID'])['TDTRPNUM'].transform('min')
print sum(trp['CHECK_TRPNUM']==1), sum(trp['CHECK_TRPNUM']!=1) # no issue was found, all person trip IDs start from 1

### Create attributes

In [None]:
# Codebook notes for the timing variables, kept as bare string literals;
# they document the columns and do not modify any data.
'''STRTTIME:Trip start Time
0000- 2359'''
'''ENDTIME: Trip end time'''
'''TRVLCMIN: Trip duration in minutes
-9: Not ascertain
0-1200'''

In [None]:
# Print the observed minimum and maximum of each timing column.
print trp['STRTTIME'].min(), trp['STRTTIME'].max()
print trp['ENDTIME'].min(), trp['ENDTIME'].max()
print trp['TRVLCMIN'].min(),trp['TRVLCMIN'].max()

In [None]:
# Note on the survey-day window, kept as a bare string literal (no effect on the data).
''' Survey starts from 4:00 am to 3:59 am of the next day'''

In [None]:
trp['STRTTIME_R']=[a if a>=400 else (a+2400) for a in trp['STRTTIME'].tolist()]
trp['ENDTIME_R']=[a if a>=400 else (a+2400) for a in trp['ENDTIME'].tolist()]
print trp['STRTTIME_R'].min(), trp['ENDTIME_R'].min()
print trp['STRTTIME_R'].max(), trp['ENDTIME_R'].max()

In [None]:
# Convert STRTTIME for calculating activity duration
trp['STRT_HR']=trp['STRTTIME_R']//100
trp['STRT_MIN']=trp['STRTTIME_R']%100
trp['END_HR']=trp['ENDTIME_R']//100
trp['END_MIN']=trp['ENDTIME_R']%100
trp['STRTTIME_CONVERT']=trp['STRT_HR']*60+trp['STRT_MIN']
trp['ENDTIME_CONVERT']=trp['END_HR']*60+trp['END_MIN']
print trp['STRT_HR'].min(), trp['STRT_HR'].max()
print trp['STRT_MIN'].min(), trp['STRT_MIN'].max()
print trp['END_HR'].min(), trp['END_HR'].max()
print trp['END_MIN'].min(), trp['END_MIN'].max()
print trp['STRTTIME_CONVERT'].min(), trp['STRTTIME_CONVERT'].max()
print trp['ENDTIME_CONVERT'].min(), trp['ENDTIME_CONVERT'].max()
print trp[['TDTRPNUM','STRTTIME','START_HR','START_MIN','STRTTIME_CONVERT','ENDTIME','END_HR','END_MIN','ENDTIME_CONVERT']].head()

In [None]:
# Check if TRVLCMIN matches with endtime-strtime
trp['CHECK_TRVL']=trp['ENDTIME_CONVERT']-trp['STRTTIME_CONVERT']
print trp['CHECK_TRVL'].min(), trp['CHECK_TRVL'].max()
print sum((trp['CHECK_TRVL']!=trp['TRVLCMIN'])&(trp['TRVLCMIN']!=-9))
print trp.loc[(trp['CHECK_TRVL']!=trp['TRVLCMIN']),('HOUSEID','PERSONID','TDTRPNUM')].head()
print sum(trp['CHECK_TRVL']<=0), sum(trp['TRVLCMIN']==-9)

# NEED TO REMOVE ALL PERSONS WITH NEGATIVE OR ZERO TRIP DURATION

In [None]:
# Codebook note for WHYTRP1S, kept as a bare string literal (no effect on the data).
'''WHYTRP1S: Trip purpose summary
1- Home
10- Work
20- School
30- Medical
40- Shopping
50- Social
70- Transport someone
80- Meals
97- Something elase'''

In [None]:
## mandatory
mandatory_filter=trp['WHYTRP1S'].isin([10,20])
print sum(mandatory_filter)
trp.loc[mandatory_filter, 'TYPE']=2

## maintenance
maintenance_filter=trp['WHYTRP1S'].isin([30,40,70,80])
print sum(maintenance_filter)
trp.loc[maintenance_filter, 'TYPE']=3

## discretionary
discretionary_filter=trp['WHYTRP1S'].isin([50,97])
print sum(discretionary_filter)
trp.loc[discretionary_filter, 'TYPE']=4

# home
home_filter=trp['WHYTRP1S']==1
print sum(home_filter)
trp.loc[home_filter, 'TYPE']=1

print trp['TYPE'].value_counts().sort_index()
print trp['TYPE'].isnull().sum()
print trp['WHYTRP1S'].value_counts().sort_index()

In [None]:
# Codebook note for TRPTRANS, kept as a bare string literal (no effect on the data).
'''TRPTRANS: Transportation mode used on trip
-9- Not ascertained
-8- I don't know
-7- I prefer not to answer
1- walk
2- bicycle
3- Car
4- SUV
5- Van
6- Pickup truck
7- Golf cart/segway
8- motorcycle
9- RV
10- school bus
11- public or commuter bus
12- paratransit 
13- private/charter/tour/shuttle bus
14- city to city bus
15- armtrack
16- subway/elevated light rail
17-Taxi
18- Rental car
19- Airplane
20- Boat
97- Something else
'''

In [None]:
#auto
auto_filter=trp['TRPTRANS'].isin([3,4,5,6,18])
print sum(auto_filter)
trp.loc[auto_filter,'MODE']=1
#public transit
public_filter=trp['TRPTRANS'].isin([11,14,15,16])
print sum(public_filter)
trp.loc[public_filter,'MODE']=2
#non-motorized
nonmotor_filter=trp['TRPTRANS'].isin([1,2])
print sum(nonmotor_filter)
trp.loc[nonmotor_filter,'MODE']=3
#other mode
other_filter=trp['TRPTRANS'].isin([7,8,9,10,12,13,17,19,20,97])
print sum(other_filter)
trp.loc[other_filter,'MODE']=4
print trp['MODE'].value_counts().sort_index()
print trp['MODE'].isnull().sum(),'invalid mode records need to be removed'
print trp['TRPTRANS'].value_counts().sort_index()

In [None]:
## create filter to inidcate invalid records
purfilter=trp['TYPE'].isnull()
print sum(purfilter), 'invalid purpose filter'
modefilter=trp['MODE'].isnull()
print sum(modefilter), 'invalid mode filter'
trpminfilter=(trp['CHECK_TRVL']<=0)
print sum(trpminfilter), 'invalid duration'

trp.loc[(purfilter|modefilter|trpminfilter),'invalid_indicator']=1
trp['invalid_indicator'].replace(np.nan, 0, inplace=True)
print sum(trp['invalid_indicator']==1),'invalid trips'

## mark all trips made by persons who had one or more invalid record
trp['invalid_per']=trp.groupby(['HOUSEID','PERSONID'])['invalid_indicator'].transform('max')
print sum(trp['invalid_per']==1), 'all trips need to be removed'

In [None]:
# remove invalid trips and create new dataframe
trp2=trp.loc[trp['invalid_per']==0,:]
print trp2.shape, trp.shape, len(trp)-len(trp2)
print trp2['STRTTIME_CONVERT'].max(), 'latest start time'
print trp2['ENDTIME_CONVERT'].max(),'latest arrivial time'
print trp2['CHECK_TRVL'].min(),'shortest trip duration'
print trp2['CHECK_TRVL'].max(), 'longest trip duration'
print trp2['TYPE'].isnull().sum(), 'nan activity types'
print trp2['MODE'].isnull().sum(), 'nan mode types'
print trp2['STRTTIME_CONVERT'].isnull().sum(), 'nan start time'
print trp2['ENDTIME_CONVERT'].isnull().sum(), 'nan end time'
print trp2['CHECK_TRVL'].isnull().sum(), 'nan travel time'

## Recode activity dwell time

In [None]:
# Order trips by household, person and trip number. The sorted frame is rebound
# instead of sorted in place: trp2 comes from filtering trp, and in-place
# modification of such a selection can trigger pandas' SettingWithCopyWarning.
trp2=trp2.sort_values(['HOUSEID','PERSONID','TDTRPNUM'], ascending=[True,True,True])

In [None]:
# DWELTIME2: the reported dwell time, except that the code -9 is replaced by
# the minutes remaining between the trip's arrival and minute 1680 (= 28 * 60).
trp2['DWELTIME2']=np.where(trp2['DWELTIME']!=-9, trp2['DWELTIME'], 1680-trp2['ENDTIME_CONVERT'])
# FIRST_STRT: for trip number 1, the minutes between minute 240 (= 4 * 60) and
# the trip's start; 0 for every other trip.
trp2['FIRST_STRT']=np.where(trp2['TDTRPNUM']==1, trp2['STRTTIME_CONVERT']-240, 0)

In [None]:
# Preview the raw and derived timing columns side by side for the first ten trips.
print trp2[['HOUSEID','PERSONID','TDTRPNUM','STRTTIME','STRTTIME_CONVERT','ENDTIME','ENDTIME_CONVERT','TRVLCMIN','DWELTIME','DWELTIME2','FIRST_STRT']].head(10)

In [None]:
### check if the trip equals to 1440
trp2['SUM_DWEL']=trp2.groupby(['HOUSEID','PERSONID'])['DWELTIME2'].transform('sum')
trp2['SUM_TRVL']=trp2.groupby(['HOUSEID','PERSONID'])['CHECK_TRVL'].transform('sum')
trp2['SUM_FIR']=trp2.groupby(['HOUSEID','PERSONID'])['FIRST_STRT'].transform('first')
trp2['CHECK_DUR']=trp2['SUM_DWEL']+trp2['SUM_TRVL']+trp2['SUM_FIR']
print trp2['CHECK_DUR'].value_counts()
## remove the persons with invalid duration
trp3=trp2.loc[trp2['CHECK_DUR']==1440]
print trp3.shape

In [None]:
## remove the persons with invalid duration 
trp2['ZERO_DUR']=trp2.groupby(['HOUSEID','PERSONID'])['DWELTIME2'].transform('min')
print sum(trp2['ZERO_DUR']==0)
trp3=trp2.loc[(trp2['CHECK_DUR']==1440)|(trp2['ZERO_DUR']==0)]
print trp3.shape

In [None]:
# Recode activity type so that mode and actvity code can be difficiated
trp3.loc[trp3['MODE']==1,'MODE_RECODE']='A'
trp3.loc[trp3['MODE']==2,'MODE_RECODE']='B'
trp3.loc[trp3['MODE']==3,'MODE_RECODE']='C'
trp3.loc[trp3['MODE']==4,'MODE_RECODE']='D'
trp3.loc[trp3['TYPE']==1,'TYPE_RECODE']='E'
trp3.loc[trp3['TYPE']==2,'TYPE_RECODE']='F'
trp3.loc[trp3['TYPE']==3,'TYPE_RECODE']='G'
trp3.loc[trp3['TYPE']==4,'TYPE_RECODE']='H'
print trp3['TYPE_RECODE'].value_counts()
print trp3['TYPE'].value_counts()
print trp3['MODE_RECODE'].value_counts()
print trp3['MODE'].value_counts()
print trp3['TYPE_RECODE'].isnull().sum(),trp3['TYPE'].isnull().sum()
print trp3['MODE_RECODE'].isnull().sum(),trp3['MODE'].isnull().sum()

In [None]:
## Next is to create sequence/time series data for each individual
trp3['TRIPSEQ']=[str(a)*b for a,b in zip (trp3['MODE_RECODE'], trp3['CHECK_TRVL'])]
trp3['ACTSEQ']=[str(a)*b for a,b in zip (trp3['TYPE_RECODE'], trp3['DWELTIME2'])]
trp3['FIRSEQ']=['T'*b for b in trp3['FIRST_STRT']]
print trp3[['TDTRPNUM','MODE_RECODE','CHECK_TRVL','DWELTIME2','TRIPSEQ','ACTSEQ','FIRST_STRT','FIRSEQ']].head()

In [None]:
trp3['TRPSEQ_ALL']=trp3['FIRSEQ'].map(str)+trp3['TRIPSEQ'].map(str)
x=trp3.loc[0,'FIRSEQ']
x1=''.join(x)
print len(x1)
y=trp3.loc[0,'TRIPSEQ']
y1=''.join(y)
print len(y1)
z=trp3.loc[0,'TRPSEQ_ALL']
z1=''.join(z)
print len(z1)
print trp3[['FIRST_STRT','TRVLCMIN','DWELTIME2']].head()

In [None]:
trp3['ACTIVITY']=trp3['TRPSEQ_ALL'].map(str)+trp3['ACTSEQ'].map(str)
print trp3[['TDTRPNUM','TRPSEQ_ALL','ACTSEQ','ACTIVITY']].head()
perseq=trp3.groupby(['HOUSEID','PERSONID'])['ACTIVITY'].apply(list)
print perseq.head()

In [None]:
perseq=DataFrame(perseq)
perseq.reset_index(inplace=True)
print perseq.head()
print perseq.shape

In [None]:
# check if all sequences have length of 1440
indexLS=perseq.index.tolist()
for i in indexLS:
    seq=perseq.loc[i,'ACTIVITY'] # list of sequence
    seq1=''.join(seq)
    cnt=len(seq1)
    perseq.loc[i,'TXTSEQUENCE']=seq1
    perseq.loc[i,'SEQ_LENGTH']=cnt

print perseq['SEQ_LENGTH'].value_counts()

### Write Data

In [None]:
# Save the per-person sequences and the cleaned trip table as pickle files.
# Both paths are absolute and machine-specific; adjust them before running elsewhere.
perseq.to_pickle('C:/Users/jiz13007/Documents/Pattern Recognition/NHTS sequence/2017_sequence.pkl')
trp3.to_pickle('C:/Users/jiz13007/Documents/Pattern Recognition/NHTS sequence/2017_trips.pkl')