# Convert trips into activities and create commuting matrices
1. Check typical trip chains of the travel survey.
2. Design filters for preparing the data for activities.
3. Extract activities from trip chains.

In [1]:
# Load IPython's autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2
# NOTE(review): machine-specific absolute path; every "dbs/..." path used later
# in this notebook is relative to this working directory.
%cd D:\mobi-social-segregation-se

D:\mobi-social-segregation-se


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import datetime as dt
from tqdm import tqdm

In [3]:
# Load the trip records, keep one row per (sub_id, trip_id, trip_main_id),
# restrict to the columns used for the activity extraction and order the rows
# by participant and trip.
trip_keys = ['sub_id', 'trip_id', 'trip_main_id']
trip_columns = ["sub_id", "date", 'trip_id', "trip_main_id", 'purpose_main', 'purpose',
                'origin_main_time', 'desti_main_time', 'origin_main_deso', 'desti_main_deso',
                'trip_main_mode']
df = (
    pd.read_csv("dbs/survey/day_trips.csv", encoding="latin-1")
    .drop_duplicates(subset=trip_keys)
    .loc[:, trip_columns]
    .sort_values(by=trip_keys)
)
print(df.sub_id.nunique())

38258


Drop those participants with incomplete information.

In [4]:
def completeness_check(data):
    """Flag whether a set of records is free of missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Records to inspect.

    Returns
    -------
    pandas.Series
        A single entry ``complt``: 1 when no cell of ``data`` is null, else 0.
    """
    has_missing = data.isnull().values.any()
    return pd.Series({'complt': 0 if has_missing else 1})

In [5]:
# Register tqdm's `progress_apply` on pandas objects, then compute the
# completeness flag once per participant.
tqdm.pandas()
df_complt = (
    df.groupby("sub_id")
    .progress_apply(completeness_check)
    .reset_index()
)

100%|██████████| 38258/38258 [00:21<00:00, 1753.96it/s]


In [6]:
# Keep only the participants whose records are all complete. `.copy()` detaches
# the result from the original frame so the dtype change below acts on an
# independent object rather than on a slice.
complete_ids = df_complt.loc[df_complt.complt == 1, 'sub_id']
df = df.loc[df.sub_id.isin(complete_ids), :].copy()
# Replace the columns outright with `astype`: assigning through
# `df.loc[:, var] = ...` writes into the existing column and can leave its
# previous dtype in place, so the integer cast would silently not happen.
df = df.astype({var: int for var in ['purpose', 'purpose_main', 'origin_main_time',
                                     'desti_main_time', 'trip_main_mode']})
print('Valid participants: %s'%df.sub_id.nunique())
df.head()

Valid participants: 27122


Unnamed: 0,sub_id,date,trip_id,trip_main_id,purpose_main,purpose,origin_main_time,desti_main_time,origin_main_deso,desti_main_deso,trip_main_mode
3930,20110111008,2011-01-08,1,1,25,25,1815,2359,2581A0020,2284C1060,88
3931,20110111013,2011-01-10,1,1,2,2,830,850,0180C2670,0180C4430,10
3934,20110111013,2011-01-10,2,1,16,2,1130,1140,0180C4430,0180C4870,1
3935,20110111013,2011-01-10,2,2,2,2,1220,1230,0180C4870,0180C4430,1
3936,20110111013,2011-01-10,3,1,16,2,1700,1705,0180C4430,0180C4430,1


purpose_main: 2 = commuting (XW), 3 = School (S), and the rest = Other (O)

Recoding it to 2, 3, and 0

In [16]:
# Collapse every purpose code other than 2 and 3 into 0.
# NOTE(review): 0 is also the code that `trips2activities` gives to the first
# and last activity of the day and that `pur_dict` labels 'Home', so after this
# recode 'Other' activities cannot be told apart from 'Home' - confirm intent.
keeps_code = df['purpose_main'].isin([2, 3])
df.loc[:, 'purpose_main'] = df['purpose_main'].where(keeps_code, 0)

## 1. Trip chains

In [26]:
# Trip chain of one example participant.
df_test = df[df['sub_id'] == 20110111013]
df_test

Unnamed: 0,sub_id,date,trip_id,trip_main_id,purpose_main,purpose,origin_main_time,desti_main_time,origin_main_deso,desti_main_deso,trip_main_mode
3931,20110111013,2011-01-10,1,1,2,2,830,850,0180C2670,0180C4430,10
3934,20110111013,2011-01-10,2,1,16,2,1130,1140,0180C4430,0180C4870,1
3935,20110111013,2011-01-10,2,2,2,2,1220,1230,0180C4870,0180C4430,1
3936,20110111013,2011-01-10,3,1,16,2,1700,1705,0180C4430,0180C4430,1
3937,20110111013,2011-01-10,3,2,2,2,1720,1725,0180C4430,0180C4430,1
3938,20110111013,2011-01-10,4,1,2,2,2000,2020,0180C4430,0180C2670,10


In [27]:
# Trip chain of one example participant.
df_test = df[df['sub_id'] == 20110111036]
df_test

Unnamed: 0,sub_id,date,trip_id,trip_main_id,purpose_main,purpose,origin_main_time,desti_main_time,origin_main_deso,desti_main_deso,trip_main_mode
3969,20110111036,2011-01-09,1,1,25,25,730,745,1481C1310,1480C1610,501
3972,20110111036,2011-01-09,2,1,7,7,1620,1630,1480C1610,1480C2460,11
3974,20110111036,2011-01-09,2,2,25,7,1631,1830,1480C2460,1480C2350,1
3976,20110111036,2011-01-09,2,3,7,7,1835,1850,1480C2350,1480C1610,1
3980,20110111036,2011-01-09,3,1,2,2,2100,2110,1480C1610,1481C1310,501


In [28]:
# Trip chain of one example participant.
df_test = df[df['sub_id'] == 20110111049]
df_test

Unnamed: 0,sub_id,date,trip_id,trip_main_id,purpose_main,purpose,origin_main_time,desti_main_time,origin_main_deso,desti_main_deso,trip_main_mode
4007,20110111049,2011-01-04,1,1,2,2,720,740,1401C1090,1402C1180,501
4009,20110111049,2011-01-04,2,1,2,2,1600,1620,1402C1180,1401C1090,501
4010,20110111049,2011-01-04,3,1,17,17,1645,1652,1401C1090,1401C1020,1
4011,20110111049,2011-01-04,3,2,17,17,1820,1827,1401C1020,1401C1090,1


In [7]:
# Trip chain of one example participant.
df_test = df[df['sub_id'] == 20110111072]
df_test

Unnamed: 0,sub_id,date,trip_id,trip_main_id,purpose_main,purpose,origin_main_time,desti_main_time,origin_main_deso,desti_main_deso,trip_main_mode
4052,20110111072,2011-01-04,1,1,6,6,1100,1102,1401C1140,1401C1140,1
3975,20110111072,2011-01-04,1,2,6,6,1115,1120,1401C1140,1401C1140,1
3979,20110111072,2011-01-04,2,1,17,17,1300,1400,1401C1140,1401C1140,1


In [30]:
# Trip chain of one example participant.
df_test = df[df['sub_id'] == 20110121017]
df_test

Unnamed: 0,sub_id,date,trip_id,trip_main_id,purpose_main,purpose,origin_main_time,desti_main_time,origin_main_deso,desti_main_deso,trip_main_mode
4110,20110121017,2011-01-07,1,1,7,23,1800,1845,1490C1350,1490C1070,121
4115,20110121017,2011-01-07,1,2,23,23,1920,1945,1490C1070,1490C1350,516
4116,20110121017,2011-01-07,2,1,6,6,2030,2031,1490C1350,1490C1120,1
6,20110121017,2011-01-07,2,2,6,6,2040,2041,1490C1120,1490C1350,1


## 2. Extract individuals for activities

The individuals satisfying a combination of the below conditions will be selected for the activity extraction.

1. The last trip is commuting.
2. The last trip is a round trip.
3. The first trip and the last trip have the same purpose.
4. The first origin and the last destination are in the same DeSO zone

In [7]:
def individual_process(data):
    """Summarise how one participant's trip records start and end.

    The first and last rows of ``data`` are treated as the first and last trip
    records, so the rows are expected to be in travel order.

    Parameters
    ----------
    data : pandas.DataFrame
        Trip records with the columns ``purpose``, ``trip_id``,
        ``origin_main_deso`` and ``desti_main_deso``.

    Returns
    -------
    pandas.Series
        Four 0/1 flags:
        ``same_zone`` - origin of the first row equals destination of the last row;
        ``comm_end``  - purpose of the last row is 2;
        ``round_end`` - the highest ``trip_id`` occurs on more than one row;
        ``same_pur``  - purpose of the first row equals purpose of the last row.
    """
    purpose_seq = data['purpose'].values
    final_trip_id = data['trip_id'].max()
    rows_in_final_trip = (data['trip_id'] == final_trip_id).sum()
    start_zone = data['origin_main_deso'].values[0]
    end_zone = data['desti_main_deso'].values[-1]
    flags = {
        'same_zone': int(start_zone == end_zone),
        'comm_end': int(purpose_seq[-1] == 2),
        'round_end': int(rows_in_final_trip > 1),
        'same_pur': int(purpose_seq[0] == purpose_seq[-1]),
    }
    return pd.Series(flags)

In [8]:
# One row of pattern flags per participant.
df_act_patterns = (
    df.groupby('sub_id')
    .apply(individual_process)
    .reset_index()
)
df_act_patterns.head()

Unnamed: 0,sub_id,same_zone,comm_end,round_end,same_pur
0,20110111008,0,0,0,1
1,20110111013,1,1,0,1
2,20110111016,1,0,1,1
3,20110111020,1,1,1,1
4,20110111023,1,0,1,1


In [9]:
# 'any' is 1 when at least one of comm_end, round_end and same_pur is set.
end_flags = df_act_patterns[['comm_end', 'round_end', 'same_pur']]
df_act_patterns.loc[:, 'any'] = (end_flags.sum(axis=1) > 0).astype('int64')

In [10]:
# Percentage of participants for which each flag equals 1.
n_participants = len(df_act_patterns)
share_same_zone = int((df_act_patterns['same_zone'] == 1).sum()) / n_participants * 100
share_comm_end = int((df_act_patterns['comm_end'] == 1).sum()) / n_participants * 100
share_round_end = int((df_act_patterns['round_end'] == 1).sum()) / n_participants * 100
share_same_pur = int((df_act_patterns['same_pur'] == 1).sum()) / n_participants * 100
print("Share of participants with same start and end DeSO zone: %.2f"%share_same_zone)
print("Share of participants with same start and end purpose: %.2f"%share_same_pur)
print("Share of participants with last trip being commuting: %.2f"%share_comm_end)
print("Share of participants with last trip being a round trip: %.2f"%share_round_end)

Share of participants with same start and end DeSO zone: 90.78
Share of participants with same start and end purpose: 71.09
Share of participants with last trip being commuting: 22.30
Share of participants with last trip being a round trip: 58.18


### 2.1 Participants filter
Participants with same start and end DeSO zone AND any of the three conditions is met: same start and end purpose, last trip being commuting, and last trip being a round trip.

In [11]:
# Participants that start and end in the same zone and meet any end condition.
is_selected = df_act_patterns['same_zone'].eq(1) & df_act_patterns['any'].eq(1)
selected_inds = df_act_patterns.loc[is_selected, 'sub_id'].values
print("Share of participants selected: %.2f"%(len(selected_inds) / len(df_act_patterns) * 100))

Share of participants selected: 84.43


In [12]:
# Independent copy of the selected participants' trip records.
df_sub = df[df['sub_id'].isin(selected_inds)].copy()

## 3. Convert trips into time series of activities
### 3.1 Time processing

In [15]:
df_sub.head()

Unnamed: 0,sub_id,date,trip_id,trip_main_id,purpose_main,purpose,origin_main_time,desti_main_time,origin_main_deso,desti_main_deso,trip_main_mode
3931,20110111013,2011-01-10,1,1,2,2,830,850,0180C2670,0180C4430,10
3934,20110111013,2011-01-10,2,1,16,2,1130,1140,0180C4430,0180C4870,1
3935,20110111013,2011-01-10,2,2,2,2,1220,1230,0180C4870,0180C4430,1
3936,20110111013,2011-01-10,3,1,16,2,1700,1705,0180C4430,0180C4430,1
3937,20110111013,2011-01-10,3,2,2,2,1720,1725,0180C4430,0180C4430,1


In [16]:
def time_padding(x):
    """Left-pad a clock-time string with zeros to four characters.

    Parameters
    ----------
    x : str
        Time written as one to four characters, e.g. ``'830'`` for 08:30.

    Returns
    -------
    str
        ``x`` padded on the left with ``'0'`` to length 4 (``'830'`` -> ``'0830'``).

    Raises
    ------
    ValueError
        If ``x`` is empty or longer than four characters. Such input used to
        fall through and return ``None``, which only surfaced later as an
        unrelated error when the value was combined with the date.
    """
    if not 1 <= len(x) <= 4:
        raise ValueError("Expected a 1-4 character time string, got %r" % (x,))
    return x.rjust(4, '0')

In [17]:
# Build 'YYYY-MM-DD HHMM' strings from the survey date and the zero-padded
# departure / arrival clock times.
origin_hhmm = df_sub['origin_main_time'].astype(str).map(time_padding)
desti_hhmm = df_sub['desti_main_time'].astype(str).map(time_padding)
df_sub.loc[:, 'origin_main_T'] = df_sub['date'] + ' ' + origin_hhmm
df_sub.loc[:, 'desti_main_T'] = df_sub['date'] + ' ' + desti_hhmm

In [18]:
# Parse the 'YYYY-MM-DD HHMM' strings into timestamps. The columns are replaced
# by direct assignment so they end up with a datetime dtype; writing the parsed
# values back through `df_sub.loc[:, col] = ...` fills the existing string
# column and can leave it as an object column. `pd.to_datetime` is also
# vectorised and leaves already-parsed columns unchanged when the cell is re-run.
df_sub['origin_main_T'] = pd.to_datetime(df_sub['origin_main_T'], format='%Y-%m-%d %H%M')
df_sub['desti_main_T'] = pd.to_datetime(df_sub['desti_main_T'], format='%Y-%m-%d %H%M')

In [19]:
# An arrival earlier than the departure means the trip ran past midnight:
# move that arrival to the following day.
crosses_midnight = df_sub['desti_main_T'] < df_sub['origin_main_T']
df_sub.loc[crosses_midnight, 'desti_main_T'] += dt.timedelta(minutes=1440)
# Travel time in minutes.
df_sub['travel_time'] = (df_sub['desti_main_T'] - df_sub['origin_main_T']) / dt.timedelta(minutes=1)

### 3.2 Create time series

In [15]:
# Trips of one participant, used as a worked example below.
df_sub_example = df_sub[df_sub['sub_id'] == 20110111013].copy()
df_sub_example

Unnamed: 0,sub_id,date,trip_id,trip_main_id,purpose_main,purpose,origin_main_time,desti_main_time,origin_main_deso,desti_main_deso,trip_main_mode,origin_main_T,desti_main_T,travel_time
3931,20110111013,2011-01-10,1,1,2,2,830,850,0180C2670,0180C4430,10,2011-01-10 08:30:00,2011-01-10 08:50:00,20.0
3934,20110111013,2011-01-10,2,1,16,2,1130,1140,0180C4430,0180C4870,1,2011-01-10 11:30:00,2011-01-10 11:40:00,10.0
3935,20110111013,2011-01-10,2,2,2,2,1220,1230,0180C4870,0180C4430,1,2011-01-10 12:20:00,2011-01-10 12:30:00,10.0
3936,20110111013,2011-01-10,3,1,16,2,1700,1705,0180C4430,0180C4430,1,2011-01-10 17:00:00,2011-01-10 17:05:00,5.0
3937,20110111013,2011-01-10,3,2,2,2,1720,1725,0180C4430,0180C4430,1,2011-01-10 17:20:00,2011-01-10 17:25:00,5.0
3938,20110111013,2011-01-10,4,1,2,2,2000,2020,0180C4430,0180C2670,10,2011-01-10 20:00:00,2011-01-10 20:20:00,20.0


In [117]:
# Survey date (string), first departure, last arrival, and the calendar date
# of the last arrival (string).
date_s = df_sub_example['date'].iloc[0]
time_s = df_sub_example['origin_main_T'].iloc[0]
time_e = df_sub_example['desti_main_T'].iloc[-1]
date_e = str(time_e).split(' ')[0]

In [118]:
date_s, time_s, time_e, date_e

('2011-01-10',
 Timestamp('2011-01-10 08:30:00'),
 Timestamp('2011-01-10 20:20:00'),
 '2011-01-10')

In [138]:
day_fmt = '%Y-%m-%d'
one_day = dt.timedelta(minutes=1440)
if date_s != date_e:
    # Last arrival falls on another date: the window runs from the midnight
    # that opens the date 24 hours before the last arrival to the midnight
    # that opens the arrival date.
    date_s_r = str(time_e - one_day).split(' ')[0]
    time_start = datetime.strptime(str(date_s_r), day_fmt)
    time_end = datetime.strptime(date_e, day_fmt)
else:
    # Everything happens on the survey date: 00:00 of that date to 00:00 of the next.
    time_start = datetime.strptime(str(date_s), day_fmt)
    date_e_r = str(time_e + one_day).split(' ')[0]
    time_end = datetime.strptime(date_e_r, day_fmt)
# Activities start at each arrival (plus the window start) and end at each
# departure (plus the window end).
activity_start_times = np.insert(df_sub_example['desti_main_T'].values, 0, time_start, axis=0)
activity_end_times = np.insert(df_sub_example['origin_main_T'].values, len(df_sub_example), time_end, axis=0)

In [139]:
# Purpose 0 for the opening activity; each later activity takes the purpose of
# the trip that led to it, except the closing one, which is reset to 0.
purposes = np.insert(df_sub_example['purpose_main'].values, 0, 0, axis=0)
purposes[-1] = 0

In [140]:
# Assemble the example activity table.
df_act = pd.DataFrame({
    'act_start': activity_start_times,
    'act_end': activity_end_times,
    'purpose': purposes,
})
df_act

Unnamed: 0,act_start,act_end,purpose
0,2011-01-10 00:00:00,2011-01-10 08:30:00,0
1,2011-01-10 08:50:00,2011-01-10 11:30:00,2
2,2011-01-10 11:40:00,2011-01-10 12:20:00,16
3,2011-01-10 12:30:00,2011-01-10 17:00:00,2
4,2011-01-10 17:05:00,2011-01-10 17:20:00,16
5,2011-01-10 17:25:00,2011-01-10 20:00:00,2
6,2011-01-10 20:20:00,2011-01-11 00:00:00,0


### 3.2.1 Activity conversion

In [20]:
def trips2activities(data):
    """Turn one participant's trips into the activities between them.

    Every activity starts at a trip's arrival (``desti_main_T``) and ends at the
    next trip's departure (``origin_main_T``). An opening activity runs from the
    start of a 24-hour window to the first departure, and the closing activity
    runs from the last arrival to the end of that window.

    The window is bounded by midnights. When the last arrival is on the survey
    ``date`` it covers that date. Otherwise it ends at 00:00 of the last
    arrival's date and starts at 00:00 of the date 24 hours before the last
    arrival; the closing activity then has a non-positive duration.

    Parameters
    ----------
    data : pandas.DataFrame
        Trips with the columns ``date`` ('YYYY-MM-DD' string), ``origin_main_T``,
        ``desti_main_T``, ``purpose_main``, ``origin_main_deso`` and
        ``desti_main_deso``. The first and last rows are used as the first and
        last trip, so rows are expected to be in travel order.

    Returns
    -------
    pandas.DataFrame
        ``len(data) + 1`` rows with the columns ``act_start``, ``act_end``,
        ``purpose`` (0 for the opening and closing activity, otherwise the
        ``purpose_main`` of the trip that arrived) and ``zone`` (origin zone of
        the first trip, then each trip's destination zone).
    """
    day_fmt = '%Y-%m-%d'
    one_day = dt.timedelta(minutes=1440)

    # Bounds of the 24-hour window
    survey_day = data['date'].iloc[0]
    last_arrival = data['desti_main_T'].iloc[-1]
    arrival_day = str(last_arrival).split(' ')[0]
    if survey_day != arrival_day:
        shifted_day = str(last_arrival - one_day).split(' ')[0]
        window_open = datetime.strptime(str(shifted_day), day_fmt)
        window_close = datetime.strptime(arrival_day, day_fmt)
    else:
        window_open = datetime.strptime(str(survey_day), day_fmt)
        next_day = str(last_arrival + one_day).split(' ')[0]
        window_close = datetime.strptime(next_day, day_fmt)

    # Activity start / end times
    starts = np.insert(data['desti_main_T'].values, 0, window_open, axis=0)
    ends = np.insert(data['origin_main_T'].values, len(data), window_close, axis=0)

    # Purposes: 0 before the first trip and after the last one
    act_purposes = np.insert(data['purpose_main'].values, 0, 0, axis=0)
    act_purposes[-1] = 0

    # Zones: where the day starts, then every destination
    act_zones = np.insert(data['desti_main_deso'].values, 0, data['origin_main_deso'].iloc[0], axis=0)

    return pd.DataFrame({
        'act_start': starts,
        'act_end': ends,
        'purpose': act_purposes,
        'zone': act_zones,
    })

In [21]:
# Register tqdm's `progress_apply`, then convert every participant's trips
# into activities.
tqdm.pandas()
df_act = (
    df_sub.groupby('sub_id')
    .progress_apply(trips2activities)
    .reset_index()
)

100%|██████████| 22899/22899 [01:37<00:00, 233.80it/s]


In [22]:
# Drop the unnamed inner index level that `reset_index()` exposed as 'level_1'.
# `errors='ignore'` keeps the cell re-runnable: without it a second run raises
# KeyError because the column is already gone.
df_act = df_act.drop(columns=['level_1'], errors='ignore')

In [23]:
# Vectorised time features (replaces per-row Python lambdas).
# `pd.to_datetime` is a no-op for datetime columns and makes sure the `.dt`
# accessor is available.
act_start = pd.to_datetime(df_act['act_start'])
act_end = pd.to_datetime(df_act['act_end'])
# Start / end expressed as minutes after midnight.
df_act['h_s'] = act_start.dt.hour * 60 + act_start.dt.minute
df_act['h_e'] = act_end.dt.hour * 60 + act_end.dt.minute
# Activity duration in minutes.
df_act['dur'] = (act_end - act_start).dt.total_seconds() / 60
df_act.head()

Unnamed: 0,sub_id,act_start,act_end,purpose,zone,h_s,h_e,dur
0,20110111013,2011-01-10 00:00:00,2011-01-10 08:30:00,0,0180C2670,0,510,510.0
1,20110111013,2011-01-10 08:50:00,2011-01-10 11:30:00,2,0180C4430,530,690,160.0
2,20110111013,2011-01-10 11:40:00,2011-01-10 12:20:00,16,0180C4870,700,740,40.0
3,20110111013,2011-01-10 12:30:00,2011-01-10 17:00:00,2,0180C4430,750,1020,270.0
4,20110111013,2011-01-10 17:05:00,2011-01-10 17:20:00,16,0180C4430,1025,1040,15.0


In [24]:
# Remove every participant that has at least one activity with a zero or
# negative duration.
invalid_ids = df_act.loc[df_act['dur'] <= 0, 'sub_id'].unique()
df_act = df_act.loc[~df_act['sub_id'].isin(invalid_ids), :]

In [28]:
# Human-readable purpose labels; any code missing from the mapping is 'Other'.
# `assign` returns a new frame, so no value is written into the filtered slice
# produced by the previous cell (which triggers SettingWithCopyWarning) and the
# cell gives the same result when re-run.
pur_dict = {0: 'Home', 2: 'Work', 3: 'School'}
df_act = df_act.assign(Purpose=df_act['purpose'].map(pur_dict).fillna('Other'))

In [29]:
# Write the activity table to disk without the row index.
df_act.to_csv('dbs/survey/day_act.csv', index=False)

## 4. Create commuting matrices
1. Municipality-level
2. DeSO-level

In [10]:
# Reload the survey and keep one row per participant with the weight and the
# home / workplace locations.
home_work_columns = ["sub_id", "sub_weight", 'home_deso', 'home_municipality',
                     'workplace_deso', 'workplace_municipality']
df = (
    pd.read_csv("dbs/survey/day_trips.csv", encoding="latin-1")
    .drop_duplicates(subset=['sub_id'])
    .loc[:, home_work_columns]
)
len(df)

38258

### 4.1 Municipality level

In [13]:
# Participants with both a home and a workplace municipality.
has_both = df['home_municipality'].notna() & df['workplace_municipality'].notna()
df_mu = df[has_both]
len(df_mu)

19666

In [14]:
# Sum the participant weights per home -> workplace municipality pair and save.
df_od = (
    df_mu.groupby(["home_municipality", "workplace_municipality"])["sub_weight"]
    .sum()
    .reset_index()
    .rename(columns={"home_municipality": "ozone",
                     "workplace_municipality": "dzone",
                     "sub_weight": "sv_commute"})
)
df_od.to_csv('dbs/survey/commute_od_municipality.csv', index=False, encoding='latin-1')

### 4.2 DeSO zone level

In [15]:
# Participants with both a home and a workplace DeSO zone.
has_both = df['home_deso'].notna() & df['workplace_deso'].notna()
df_mu = df[has_both]
len(df_mu)

16200

In [16]:
# Sum the participant weights per home -> workplace DeSO zone pair and save.
df_od = (
    df_mu.groupby(["home_deso", "workplace_deso"])["sub_weight"]
    .sum()
    .reset_index()
    .rename(columns={"home_deso": "ozone",
                     "workplace_deso": "dzone",
                     "sub_weight": "sv_commute"})
)
df_od.to_csv('dbs/survey/commute_od_deso.csv', index=False, encoding='latin-1')

## 5. Create overall matrices
1. Municipality-level
2. DeSO-level

In [3]:
# Reload the survey and keep the weight plus the trip origin / destination
# locations.
# NOTE(review): de-duplicating on `sub_id` alone keeps only the first row of
# each participant, so the matrices below count one origin-destination pair per
# person rather than every trip - confirm this is intended for the "overall"
# matrices.
od_columns = ["sub_id", "sub_weight", 'origin_main_deso', 'desti_main_deso',
              'origin_main_municipality', 'desti_main_municipality']
df = (
    pd.read_csv("dbs/survey/day_trips.csv", encoding="latin-1")
    .drop_duplicates(subset=['sub_id'])
    .loc[:, od_columns]
)
len(df)

38258

### 5.1 Municipality level

In [4]:
# Rows with both an origin and a destination municipality.
has_both = df['origin_main_municipality'].notna() & df['desti_main_municipality'].notna()
df_mu = df[has_both]
len(df_mu)

37831

In [9]:
# Sum the weights per origin -> destination municipality pair, store the
# municipality codes as integers and save.
df_od = (
    df_mu.groupby(["origin_main_municipality", "desti_main_municipality"])["sub_weight"]
    .sum()
    .reset_index()
    .rename(columns={"origin_main_municipality": "ozone",
                     "desti_main_municipality": "dzone",
                     "sub_weight": "sv"})
    .astype({'ozone': 'int', 'dzone': 'int'})
)
df_od.to_csv('dbs/survey/od_municipality.csv', index=False, encoding='latin-1')

In [11]:
# Rows with both an origin and a destination DeSO zone.
has_both = df['origin_main_deso'].notna() & df['desti_main_deso'].notna()
df_mu = df[has_both]
len(df_mu)

29879

In [13]:
# Sum the weights per origin -> destination DeSO zone pair and save.
df_od = (
    df_mu.groupby(["origin_main_deso", "desti_main_deso"])["sub_weight"]
    .sum()
    .reset_index()
    .rename(columns={"origin_main_deso": "ozone",
                     "desti_main_deso": "dzone",
                     "sub_weight": "sv"})
)
df_od.to_csv('dbs/survey/od_deso.csv', index=False, encoding='latin-1')