In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.miscmodels.ordinal_model import OrderedModel

from sklearn import preprocessing

## 数据预处理

### move表中的自变量提取

In [121]:
move = pd.read_csv('data/move_201958.csv',parse_dates=['stime', 'etime'])

In [122]:
# 排序
move = move.sort_values(by=['pid', 'stime']).reset_index(drop=True)

# 市内出行，删除飞机和铁路
move = move.drop(index = move[(move['mode'].isin([2,3]))].index.tolist())

# 为出行加入周信息,保留完整周信息（19~35周）
move['week'] = [data.week for data in move['etime']]
move = move.query('week>=19 and week<=35')

In [123]:
# 构建地铁哑变量
move['subway'] = pd.get_dummies(move['mode'],columns='mode',prefix='mode')['mode_4']

In [124]:
move.head()

Unnamed: 0,pid,stime,etime,mode,time,distance,week,subway
10,90221,2019-05-07 11:44:26,2019-05-07 12:09:41,1,1515,0,19,0
11,90221,2019-05-07 13:39:58,2019-05-07 13:41:28,1,90,2828,19,0
12,90221,2019-05-07 16:42:16,2019-05-07 16:43:47,1,91,2828,19,0
13,90221,2019-05-08 15:18:43,2019-05-08 15:42:31,1,1428,4301,19,0
14,90221,2019-05-08 21:52:21,2019-05-08 22:32:40,1,2419,4301,19,0


In [125]:
# 分组求每个周轨迹中总出行时间和/平均出行时间/出行总距离/出行平均距离/地铁占比
move_group = move.groupby(['pid','week']).agg(time_sum=('time','sum'),
                                 time_mean=('time','mean'),
                                 distance_sum=('distance','sum'),
                                 distance_mean=('distance','mean'),
                                 subway_ratio=('subway',lambda x: sum(x)/len(x))).reset_index()

In [126]:
move_group

Unnamed: 0,pid,week,time_sum,time_mean,distance_sum,distance_mean,subway_ratio
0,90221,19,37722,1714.636364,162497,7386.227273,0.000000
1,90221,20,30951,1719.500000,60064,3336.888889,0.000000
2,90221,21,11680,687.058824,118519,6971.705882,0.000000
3,90221,22,22097,849.884615,154483,5941.653846,0.000000
4,90221,23,7487,415.944444,118921,6606.722222,0.000000
...,...,...,...,...,...,...,...
3190,38092985,31,8856,885.600000,84582,8458.200000,0.200000
3191,38092985,32,4596,383.000000,91472,7622.666667,0.000000
3192,38092985,33,40716,1938.857143,352256,16774.095238,0.238095
3193,38092985,34,7492,499.466667,70075,4671.666667,0.333333


### act表中的自变量提取

In [6]:
act = pd.read_csv('data/act_201958.csv',parse_dates=['t_start', 't_end'])

In [7]:
# 删除居家活动
act = act.drop(index = act[(act['ptype']==0)].index.tolist())

# 计算出行时长
act_begin = [data.hour*60+data.minute for data in act['t_start']]
act_end = [data.hour*60+data.minute for data in act['t_end']]
act['act_duration'] = np.array(act_end) - np.array(act_begin)

In [8]:
# 构建工作哑变量
act['work'] = pd.get_dummies(act['ptype'],columns='ptype',prefix='ptype')['ptype_2']

In [9]:
act.head()

Unnamed: 0,pid,date,t_start,t_end,poi_id,community_id,ptype,longitude,latitude,week,weekday,month,act_duration,work
0,90221,2019-05-06 00:00:00,2019-05-06 07:28:00,2019-05-06 22:51:56,1,2291,2,114.185092,22.651382,19,0,5,923,1
3,90221,2019-05-07 00:00:00,2019-05-07 13:41:28,2019-05-07 16:42:16,1,2291,2,114.185092,22.651382,19,1,5,181,1
6,90221,2019-05-08 00:00:00,2019-05-08 15:42:31,2019-05-08 21:52:21,3,2291,3,114.203721,22.614077,19,2,5,370,0
8,90221,2019-05-09 00:00:00,2019-05-09 10:29:46,2019-05-09 12:12:52,1,2291,2,114.185092,22.651382,19,3,5,103,1
10,90221,2019-05-09 00:00:00,2019-05-09 13:55:28,2019-05-09 15:23:39,1,2291,2,114.185092,22.651382,19,3,5,88,1


In [10]:
# 分组求每个周外出活动中平均驻留时间/外出工作活动占比（既作为自变量，也用于了解其就业状况）
act_group = act.groupby(['pid','week']).agg(duration_mean=('act_duration','mean'),
                                 work_ratio=('work',lambda x: sum(x)/len(x))).reset_index()

In [132]:
act_group

Unnamed: 0,pid,week,duration_mean,work_ratio
0,90221,19,260.571429,0.857143
1,90221,20,160.800000,0.800000
2,90221,21,190.800000,1.000000
3,90221,22,127.000000,0.571429
4,90221,24,141.750000,0.000000
...,...,...,...,...
1300,37544410,31,87.500000,0.000000
1301,38092985,19,212.333333,0.000000
1302,38092985,20,351.000000,0.000000
1303,38092985,21,221.153846,0.000000


### 个人属性自变量

In [2]:
#导入个人信息
pid_attri = pd.read_csv('data/pid_attri.csv')
pid_attri.head()

Unnamed: 0,pid,gender,age,arpu,brand
0,9620253,2,8,108.0,华为
1,9620253,2,8,113.92,华为
2,9620253,2,8,148.0,华为
3,9620253,2,8,110.88,华为
4,16350246,1,10,28.0,华为


In [3]:
pid_uniattri = pid_attri.groupby('pid').agg(gender=('gender','mean'),
                          age=('age','mean')).reset_index()

In [4]:
pid_uniattri

Unnamed: 0,pid,gender,age
0,90221,1,11
1,117999,2,9
2,684918,2,8
3,935307,1,7
4,1209601,1,9
...,...,...,...
187,37089525,2,5
188,37129431,1,6
189,37166199,2,8
190,37544410,1,8


In [136]:
act_group

Unnamed: 0,pid,week,duration_mean,work_ratio
0,90221,19,260.571429,0.857143
1,90221,20,160.800000,0.800000
2,90221,21,190.800000,1.000000
3,90221,22,127.000000,0.571429
4,90221,24,141.750000,0.000000
...,...,...,...,...
1300,37544410,31,87.500000,0.000000
1301,38092985,19,212.333333,0.000000
1302,38092985,20,351.000000,0.000000
1303,38092985,21,221.153846,0.000000


In [11]:
# 添加就业状况
pid_work = act_group.groupby('pid')['work_ratio'].agg('mean').reset_index()

In [12]:
pid_uniattri['employment'] = [0 if data==0 else 1 for data in pid_work['work_ratio']]

In [13]:
pid_uniattri

Unnamed: 0,pid,gender,age,employment
0,90221,1,11,1
1,117999,2,9,1
2,684918,2,8,0
3,935307,1,7,1
4,1209601,1,9,0
...,...,...,...,...
187,37089525,2,5,0
188,37129431,1,6,1
189,37166199,2,8,1
190,37544410,1,8,0


### 轨迹中的自变量提取
1. 活动顺序，包括HWH \ HOH \ HOWH \ HWOH
2. 出行链，即一次出行的驻留次数
3. 出行频次，counts of home tour

In [140]:
trajectory = np.loadtxt('data/trajectory_in_week_3ptype_2019_58.txt')
len(trajectory)

1316

In [141]:
# step2_pattern_clustering_v2的函数
def act_interval(t):
    starts = [0]
    ends = []
    labels = []
    start= 0
    for i in range(1,len(t)):
        if t[i]!=t[i-1]:
            end = i-1
            ends.append(end)
            labels.append(t[i-1])
            start = i
            starts.append(start)
    #最后一段记录
    end = i
    ends.append(end)
    labels.append(t[i])
    
    return starts,ends,labels

In [143]:
# 获取简单出行链数量、平均出行链驻留次数（出行是home tour）、周出行频次
def exact_chain(labels):
    labels = np.array(labels)
    #删除出行标签
    labels = np.delete(labels,np.where(labels==1))
    #获取0索引
    home_index = np.argwhere(labels==0)
    home_index = home_index.reshape(len(home_index))
    tour_count = 0
    tour_stop = 0
    hwh = 0; hoh = 0; h2h_dif = 0; h2h_same = 0;
    for i in range(0,len(home_index)-1):
        # 定位一个home tour
        i_beg = home_index[i]
        i_end = home_index[i+1]
        if (i_end > i_beg+1): #确保中间有出行
            # 周出行频次累加
            tour_count += 1
            # 驻留次数累加
            tour_stop += i_end - i_beg -1
            # 判断出行链成分
            tour = labels[i_beg+1:i_end]
            if (len(tour)<=2):
                if (tour == [2]).all():
                    hwh += 1
                elif (tour == [3]).all():
                    hoh += 1
                elif (tour==[2,3]).all() or (tour==[3,2]).all():
                    h2h_dif += 1
                else:
                    h2h_same += 1
    
    # home-home的出行不算出行
    if tour_count==0:
        stop_ratio = 0
    else:
        stop_ratio = tour_stop/tour_count
                    
    #返回各简单出行链数量、平均出行驻留次数、总出行驻留次数、周出行频次
    return hwh,hoh,h2h_dif,h2h_same,stop_ratio,tour_stop,tour_count    

In [145]:
HWH=[]; HOH=[]; H2H_dif=[]; H2H_same=[]; Avg_Stop=[]; Count_Tour=[]; Count_Stop=[]
for index,data in enumerate(trajectory):
    _,_,labels = act_interval(data)
    hwh,hoh,h2h_dif,h2h_same,stop_ratio,tour_stop,tour_count = exact_chain(labels)
    HWH.append(hwh)
    HOH.append(hoh)
    H2H_dif.append(h2h_dif)
    H2H_same.append(h2h_same)
    Avg_Stop.append(stop_ratio)
    Count_Stop.append(tour_stop)
    Count_Tour.append(tour_count)

In [146]:
Tour_attri = {'HWH_count':HWH,
             'HOH_count':HOH,
             'H2H_dif_count':H2H_dif,
             'H2H_same_count':H2H_same,
             'Stop_mean':Avg_Stop,
              'Stop_count':Count_Stop,
             'Tour_count':Count_Tour}

Tour_attri = pd.DataFrame(Tour_attri)

In [147]:
Tour_attri

Unnamed: 0,HWH_count,HOH_count,H2H_dif_count,H2H_same_count,Stop_mean,Stop_count,Tour_count
0,6,1,0,0,1.000000,7,7
1,4,1,0,0,1.000000,5,5
2,4,0,0,0,1.000000,4,4
3,4,3,2,0,1.222222,11,9
4,0,3,0,0,1.000000,3,3
...,...,...,...,...,...,...,...
1311,0,2,0,0,1.500000,3,2
1312,0,7,0,0,1.142857,8,7
1313,0,3,0,0,1.500000,6,4
1314,0,8,0,0,1.000000,8,8


### 特征合并

In [14]:
pid_week = pd.read_csv('data/pid_week_201958.csv')
pid_week.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316 entries, 0 to 1315
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   pid     1316 non-null   int64
 1   week    1316 non-null   int64
dtypes: int64(2)
memory usage: 20.7 KB


In [15]:
# 特征合并
Data = pd.merge(pid_week,pid_uniattri,how='left',on='pid')

In [16]:
Data = pd.merge(Data,move_group,how='left',on=['pid','week'])

NameError: name 'move_group' is not defined

In [161]:
Data = pd.merge(Data,act_group,how='left',on=['pid','week'])

In [162]:
Data[['HWH_count','HOH_count','H2H_dif_count','H2H_same_count','Stop_mean','Stop_count','Tour_count']]=Tour_attri

In [163]:
Data = Data.fillna(0)

In [164]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1316 entries, 0 to 1315
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pid             1316 non-null   int64  
 1   week            1316 non-null   int64  
 2   gender          1316 non-null   int64  
 3   age             1316 non-null   int64  
 4   employment      1316 non-null   int64  
 5   time_sum        1316 non-null   float64
 6   time_mean       1316 non-null   float64
 7   distance_sum    1316 non-null   float64
 8   distance_mean   1316 non-null   float64
 9   subway_ratio    1316 non-null   float64
 10  duration_mean   1316 non-null   float64
 11  work_ratio      1316 non-null   float64
 12  HWH_count       1316 non-null   int64  
 13  HOH_count       1316 non-null   int64  
 14  H2H_dif_count   1316 non-null   int64  
 15  H2H_same_count  1316 non-null   int64  
 16  Stop_mean       1316 non-null   float64
 17  Stop_count      1316 non-null   i

In [165]:
Data.to_csv('data/regression_variable.csv',index=False)

### 标签划分

In [17]:
data = pd.read_csv('data/regression_variable.csv')
data['H2H_count'] = data['H2H_same_count']+data['H2H_dif_count']
data.head()

Unnamed: 0,pid,week,gender,age,employment,time_sum,time_mean,distance_sum,distance_mean,subway_ratio,duration_mean,work_ratio,HWH_count,HOH_count,H2H_dif_count,H2H_same_count,Stop_mean,Stop_count,Tour_count,H2H_count
0,90221,19,1,11,1,37722.0,1714.636364,162497.0,7386.227273,0.0,260.571429,0.857143,6,1,0,0,1.0,7,7,0
1,90221,20,1,11,1,30951.0,1719.5,60064.0,3336.888889,0.0,160.8,0.8,4,1,0,0,1.0,5,5,0
2,90221,21,1,11,1,11680.0,687.058824,118519.0,6971.705882,0.0,190.8,1.0,4,0,0,0,1.0,4,4,0
3,90221,22,1,11,1,22097.0,849.884615,154483.0,5941.653846,0.0,127.0,0.571429,4,3,2,0,1.222222,11,9,2
4,90221,24,1,11,1,3455.0,265.769231,28516.0,2193.538462,0.0,141.75,0.0,0,3,0,0,1.0,3,3,0


In [18]:
trajectory_labels = np.loadtxt('data/trajectory_labels.txt')
pd.value_counts(trajectory_labels)

2.0    421
1.0    402
3.0    259
5.0    178
4.0     52
0.0      4
dtype: int64

In [19]:
data0 = pd.concat([data.iloc[np.where(trajectory_labels == 0)],data.iloc[np.where(trajectory_labels == 4)]]) #0和4归为一类
data1 = data.iloc[np.where(trajectory_labels==1)]
data2 = data.iloc[np.where(trajectory_labels==2)]
data3 = data.iloc[np.where(trajectory_labels==3)]
data5 = data.iloc[np.where(trajectory_labels==5)]

In [23]:
pd.value_counts(data['employment'])/len(data)

1    0.765957
0    0.234043
Name: employment, dtype: float64

In [25]:
pd.value_counts(data0['employment'])/len(data0)

0    0.535714
1    0.464286
Name: employment, dtype: float64

In [26]:
pd.value_counts(data1['employment'])/len(data1)

1    0.905473
0    0.094527
Name: employment, dtype: float64

In [27]:
pd.value_counts(data2['employment'])/len(data2)

1    0.748219
0    0.251781
Name: employment, dtype: float64

In [28]:
pd.value_counts(data3['employment'])/len(data3)

1    0.810811
0    0.189189
Name: employment, dtype: float64

In [29]:
pd.value_counts(data5['employment'])/len(data5)

1    0.522472
0    0.477528
Name: employment, dtype: float64

## 泊松回归拟合出行频次

### data0 = pattern 5

In [20]:
glm0 = smf.glm('Tour_count~gender + age + employment + time_mean + subway_ratio + duration_mean + work_ratio + HWH_count + HOH_count + H2H_count + Stop_mean', data0, family=sm.families.Poisson())
res_quan0 = glm0.fit()
print(res_quan0.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             Tour_count   No. Observations:                   56
Model:                            GLM   Df Residuals:                       44
Model Family:                 Poisson   Df Model:                           11
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -71.617
Date:                Mon, 25 Apr 2022   Deviance:                       6.0838
Time:                        11:06:17   Pearson chi2:                     4.44
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -0.7624      0.582     -1.311

In [21]:
res_quan0.aic

167.23438066754147

### data1 = pattern 1

In [144]:
glm1 = smf.glm('Tour_count~gender + age + employment + time_mean + subway_ratio + duration_mean + work_ratio + HWH_count + HOH_count + H2H_count + Stop_mean', data1, family=sm.families.Poisson())
res_quan1 = glm1.fit()
print(res_quan1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             Tour_count   No. Observations:                  402
Model:                            GLM   Df Residuals:                      390
Model Family:                 Poisson   Df Model:                           11
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -772.28
Date:                Sun, 24 Apr 2022   Deviance:                       20.734
Time:                        22:20:33   Pearson chi2:                     20.5
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.7738      0.259      2.988

In [145]:
res_quan1.aic

1568.5631409404346

### data2 = pattern 3

In [146]:
glm2 = smf.glm('Tour_count~gender + age + employment + time_mean + subway_ratio + duration_mean + work_ratio + HWH_count + HOH_count + H2H_count + Stop_mean', data2, family=sm.families.Poisson())
res_quan2 = glm2.fit()
print(res_quan2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             Tour_count   No. Observations:                  421
Model:                            GLM   Df Residuals:                      409
Model Family:                 Poisson   Df Model:                           11
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -716.29
Date:                Sun, 24 Apr 2022   Deviance:                       28.572
Time:                        22:20:49   Pearson chi2:                     26.1
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.1092      0.200      0.545

In [147]:
res_quan2.aic

1456.58791248135

### data 3 = pattern 2

In [148]:
glm3 = smf.glm('Tour_count~gender + age + employment + time_mean + subway_ratio + duration_mean + work_ratio + HWH_count + HOH_count + H2H_count + Stop_mean', data3, family=sm.families.Poisson())
res_quan3 = glm3.fit()
print(res_quan3.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             Tour_count   No. Observations:                  259
Model:                            GLM   Df Residuals:                      247
Model Family:                 Poisson   Df Model:                           11
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -477.04
Date:                Sun, 24 Apr 2022   Deviance:                       8.1746
Time:                        22:21:13   Pearson chi2:                     8.15
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.2717      0.310      0.877

In [45]:
res_quan3.aic

978.0845431256271

### data 5 = pattern 4

In [149]:
glm5= smf.glm('Tour_count~gender + age + employment + time_mean + subway_ratio + duration_mean + work_ratio + HWH_count + HOH_count + H2H_count + Stop_mean', data5, family=sm.families.Poisson())
res_quan5 = glm5.fit()
print(res_quan5.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             Tour_count   No. Observations:                  178
Model:                            GLM   Df Residuals:                      166
Model Family:                 Poisson   Df Model:                           11
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -225.91
Date:                Sun, 24 Apr 2022   Deviance:                       15.121
Time:                        22:21:22   Pearson chi2:                     12.3
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -0.6675      0.366     -1.826

In [150]:
res_quan5.aic

475.8139454713387

##  Ordered Probit model拟合出行链

In [137]:
data.head()

Unnamed: 0,pid,week,gender,age,employment,time_sum,time_mean,distance_sum,distance_mean,subway_ratio,duration_mean,work_ratio,HWH_count,HOH_count,H2H_dif_count,H2H_same_count,Stop_mean,Stop_count,Tour_count,H2H_count
0,90221,19,1,11,1,37722.0,1714.636364,162497.0,7386.227273,0.0,260.571429,0.857143,6,1,0,0,1.0,7,7,0
1,90221,20,1,11,1,30951.0,1719.5,60064.0,3336.888889,0.0,160.8,0.8,4,1,0,0,1.0,5,5,0
2,90221,21,1,11,1,11680.0,687.058824,118519.0,6971.705882,0.0,190.8,1.0,4,0,0,0,1.0,4,4,0
3,90221,22,1,11,1,22097.0,849.884615,154483.0,5941.653846,0.0,127.0,0.571429,4,3,2,0,1.222222,11,9,2
4,90221,24,1,11,1,3455.0,265.769231,28516.0,2193.538462,0.0,141.75,0.0,0,3,0,0,1.0,3,3,0


In [47]:
#cols = ['gender','age','employment','time_mean','subway_ratio','duration_mean','work_ratio','HWH_count','HOH_count','H2H_dif_count','H2H_same_count','Tour_count']
cols = ['gender','age','employment','time_mean','subway_ratio','work_ratio','duration_mean','HWH_count','HOH_count','H2H_count']
#cols = ['gender','age','employment','time_mean','subway_ratio','duration_mean','Tour_count']

### data0 = pattern 5

In [59]:
#cols = ['gender','age','employment','time_mean','subway_ratio','duration_mean','HWH_count','HOH_count','H2H_dif_count']
X = data0[cols]
y = data0['Stop_count']
prob_y = data0['Stop_count']/max(data0['Stop_count'])
category_y = pd.cut(y,bins=6,labels=[0,1,2,3,4,5])
#order_prob_y = category_y/max(category_y)

In [153]:
probit_mode0=sm.Probit(prob_y,X,missing='raise')
result0=probit_mode0.fit()
print(result0.summary())

         Current function value: 0.107478
         Iterations: 35
                          Probit Regression Results                           
Dep. Variable:             Stop_count   No. Observations:                   56
Model:                         Probit   Df Residuals:                       47
Method:                           MLE   Df Model:                            8
Date:                Sun, 24 Apr 2022   Pseudo R-squ.:                  0.7352
Time:                        22:21:34   Log-Likelihood:                -6.0188
converged:                      False   LL-Null:                       -22.726
Covariance Type:            nonrobust   LLR p-value:                 5.184e-05
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
gender           -8.7361      3.033     -2.880      0.004     -14.681      -2.792
age              -3.2893      0.819     -4.016      0.00



In [60]:
mod_prob0 = OrderedModel(category_y,
                        X,
                        distr='probit')

res_prob0 = mod_prob0.fit(method='bfgs')
res_prob0.summary()

Optimization terminated successfully.
         Current function value: 0.385239
         Iterations: 73
         Function evaluations: 79
         Gradient evaluations: 79


0,1,2,3
Dep. Variable:,Stop_count,Log-Likelihood:,-21.573
Model:,OrderedModel,AIC:,73.15
Method:,Maximum Likelihood,BIC:,103.5
Date:,"Mon, 25 Apr 2022",,
Time:,11:43:02,,
No. Observations:,56,,
Df Residuals:,41,,
Df Model:,15,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gender,0.1098,0.596,0.184,0.854,-1.059,1.279
age,-0.0072,0.122,-0.059,0.953,-0.247,0.233
employment,0.1042,0.552,0.189,0.850,-0.978,1.187
time_mean,-8.231e-05,0.000,-0.325,0.746,-0.001,0.000
subway_ratio,1.2204,1.384,0.882,0.378,-1.493,3.934
work_ratio,0.6470,3.228,0.200,0.841,-5.679,6.973
duration_mean,0.0048,0.002,2.067,0.039,0.000,0.009
HWH_count,3.4561,1.020,3.387,0.001,1.456,5.456
HOH_count,2.5430,0.538,4.729,0.000,1.489,3.597


In [50]:
res_prob0.aic

73.14681157782364

### data 1 = pattern 1

In [40]:
X = data1[cols]
y = data1['Stop_count']
category_y = pd.cut(y,bins=6,labels=[0,1,2,3,4,5])
#stand_y = preprocessing.normalize(np.array(y).reshape(-1, 1),axis=0)

In [156]:
probit_model1=sm.Probit(y,X,missing='raise')
result1=probit_model1.fit()
print(result1.summary())

         Current function value: 0.361346
         Iterations: 35
                          Probit Regression Results                           
Dep. Variable:             Stop_count   No. Observations:                  402
Model:                         Probit   Df Residuals:                      393
Method:                           MLE   Df Model:                            8
Date:                Sun, 24 Apr 2022   Pseudo R-squ.:                 0.08796
Time:                        22:21:43   Log-Likelihood:                -145.26
converged:                      False   LL-Null:                       -159.27
Covariance Type:            nonrobust   LLR p-value:                 0.0004706
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
gender           -2.2854        nan        nan        nan         nan         nan
age               0.2380      0.032      7.408      0.00



In [41]:
mod_prob1 = OrderedModel(category_y,
                        X,
                        distr='probit')

res_prob1 = mod_prob1.fit(method='bfgs')
res_prob1.summary()

Optimization terminated successfully.
         Current function value: 0.994070
         Iterations: 57
         Function evaluations: 63
         Gradient evaluations: 63


0,1,2,3
Dep. Variable:,Stop_count,Log-Likelihood:,-399.62
Model:,OrderedModel,AIC:,829.2
Method:,Maximum Likelihood,BIC:,889.2
Date:,"Mon, 25 Apr 2022",,
Time:,11:29:08,,
No. Observations:,402,,
Df Residuals:,387,,
Df Model:,15,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gender,-0.4121,0.135,-3.059,0.002,-0.676,-0.148
age,-0.0172,0.026,-0.650,0.516,-0.069,0.035
employment,1.5814,0.280,5.657,0.000,1.034,2.129
time_mean,-0.0002,7.09e-05,-3.136,0.002,-0.000,-8.34e-05
subway_ratio,0.1203,0.302,0.398,0.691,-0.472,0.713
work_ratio,-0.4509,0.633,-0.712,0.476,-1.692,0.790
duration_mean,-0.0054,0.001,-7.199,0.000,-0.007,-0.004
HWH_count,-0.1774,0.068,-2.603,0.009,-0.311,-0.044
HOH_count,0.0306,0.036,0.843,0.399,-0.041,0.102


### data 2 = pattern 3

In [170]:
#cols = ['gender','age','employment','time_mean','subway_ratio','duration_mean','Tour_count']
cols = ['gender','age','employment','time_mean','subway_ratio']

In [42]:
X = data2[cols]
y = data2['Stop_count']
category_y = pd.cut(y,bins=6,labels=[0,1,2,3,4,5])
#stand_y = preprocessing.normalize(np.array(categogy_y).reshape(-1, 1),axis=0)

In [172]:
probit_model2=sm.Probit(y,X,missing = 'raise')
result2=probit_model2.fit()
print(result2.summary())

         Current function value: 0.462636
         Iterations: 35
                          Probit Regression Results                           
Dep. Variable:             Stop_count   No. Observations:                  421
Model:                         Probit   Df Residuals:                      416
Method:                           MLE   Df Model:                            4
Date:                Sun, 24 Apr 2022   Pseudo R-squ.:                -0.08387
Time:                        22:22:36   Log-Likelihood:                -194.77
converged:                      False   LL-Null:                       -179.70
Covariance Type:            nonrobust   LLR p-value:                     1.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
gender          -6.2440      0.486    -12.861      0.000      -7.196      -5.292
age              0.4029      0.070      5.724      0.000   



In [43]:
mod_prob2 = OrderedModel(category_y,
                        X,
                        distr='probit')

res_prob2 = mod_prob2.fit(method='bfgs')
res_prob2.summary()

Optimization terminated successfully.
         Current function value: 1.169604
         Iterations: 51
         Function evaluations: 57
         Gradient evaluations: 57


0,1,2,3
Dep. Variable:,Stop_count,Log-Likelihood:,-492.4
Model:,OrderedModel,AIC:,1015.0
Method:,Maximum Likelihood,BIC:,1075.0
Date:,"Mon, 25 Apr 2022",,
Time:,11:29:47,,
No. Observations:,421,,
Df Residuals:,406,,
Df Model:,15,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gender,-0.2211,0.122,-1.814,0.070,-0.460,0.018
age,0.0008,0.023,0.033,0.973,-0.045,0.046
employment,0.8506,0.161,5.286,0.000,0.535,1.166
time_mean,1.368e-05,3.72e-05,0.368,0.713,-5.92e-05,8.65e-05
subway_ratio,0.1939,0.327,0.592,0.554,-0.448,0.835
work_ratio,-0.0844,0.488,-0.173,0.863,-1.040,0.871
duration_mean,-0.0016,0.000,-3.736,0.000,-0.002,-0.001
HWH_count,0.2357,0.078,3.032,0.002,0.083,0.388
HOH_count,0.3400,0.043,7.854,0.000,0.255,0.425


### data 3 = pattern 2

In [164]:
cols = ['gender','age','employment','time_mean','subway_ratio','duration_mean','Tour_count']
#cols = ['gender','age','employment','time_mean','subway_ratio']

In [52]:
X = data3[cols]
y = data3['Stop_count']
category_y = pd.cut(y,bins=6,labels=[0,1,2,3,4,5])
#stand_y = preprocessing.normalize(np.array(categogy_y).reshape(-1, 1),axis=0)

In [166]:
probit_model3=sm.Probit(y,X,missing = 'raise')
result3=probit_model3.fit()
print(result3.summary())

         Current function value: 0.466534
         Iterations: 35
                          Probit Regression Results                           
Dep. Variable:             Stop_count   No. Observations:                  259
Model:                         Probit   Df Residuals:                      252
Method:                           MLE   Df Model:                            6
Date:                Sun, 24 Apr 2022   Pseudo R-squ.:                  0.3139
Time:                        22:22:15   Log-Likelihood:                -120.83
converged:                      False   LL-Null:                       -176.11
Covariance Type:            nonrobust   LLR p-value:                 1.557e-21
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
gender           -2.1790        nan        nan        nan         nan         nan
age              -1.6148      0.214     -7.560      0.00



In [53]:
mod_prob3 = OrderedModel(category_y,
                        X,
                        distr='probit')

res_prob3 = mod_prob3.fit(method='bfgs')
res_prob3.summary()

Optimization terminated successfully.
         Current function value: 1.234362
         Iterations: 50
         Function evaluations: 54
         Gradient evaluations: 54


0,1,2,3
Dep. Variable:,Stop_count,Log-Likelihood:,-319.7
Model:,OrderedModel,AIC:,669.4
Method:,Maximum Likelihood,BIC:,722.8
Date:,"Mon, 25 Apr 2022",,
Time:,11:31:34,,
No. Observations:,259,,
Df Residuals:,244,,
Df Model:,15,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gender,-0.4328,0.156,-2.772,0.006,-0.739,-0.127
age,-0.0630,0.034,-1.841,0.066,-0.130,0.004
employment,1.5327,0.239,6.415,0.000,1.064,2.001
time_mean,-4.121e-06,6.46e-05,-0.064,0.949,-0.000,0.000
subway_ratio,0.3683,0.419,0.879,0.380,-0.453,1.190
work_ratio,0.2854,0.784,0.364,0.716,-1.250,1.821
duration_mean,-0.0043,0.001,-6.838,0.000,-0.006,-0.003
HWH_count,0.0673,0.094,0.719,0.472,-0.116,0.251
HOH_count,0.1526,0.061,2.491,0.013,0.033,0.273


### data 5 = pattern 4

In [55]:
X = data5[cols]
y = data5['Stop_count']
category_y = pd.cut(y,bins=6,labels=[0,1,2,3,4,5])
#stand_y = preprocessing.normalize(np.array(categogy_y).reshape(-1, 1),axis=0)

In [59]:
probit_model5=sm.Probit(y,X,missing = 'raise')
result5=probit_model5.fit()
print(result5.summary())

         Current function value: 0.107478
         Iterations: 35
                          Probit Regression Results                           
Dep. Variable:             Stop_count   No. Observations:                   56
Model:                         Probit   Df Residuals:                       47
Method:                           MLE   Df Model:                            8
Date:                Sun, 24 Apr 2022   Pseudo R-squ.:                  0.7352
Time:                        11:41:55   Log-Likelihood:                -6.0188
converged:                      False   LL-Null:                       -22.726
Covariance Type:            nonrobust   LLR p-value:                 5.184e-05
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
gender           -8.7361      3.033     -2.880      0.004     -14.681      -2.792
age              -3.2893      0.819     -4.016      0.00



In [56]:
mod_prob5 = OrderedModel(category_y,
                        X,
                        distr='probit')

res_prob5 = mod_prob5.fit(method='bfgs')
res_prob5.summary()

Optimization terminated successfully.
         Current function value: 1.173056
         Iterations: 53
         Function evaluations: 60
         Gradient evaluations: 60


0,1,2,3
Dep. Variable:,Stop_count,Log-Likelihood:,-208.8
Model:,OrderedModel,AIC:,447.6
Method:,Maximum Likelihood,BIC:,495.3
Date:,"Mon, 25 Apr 2022",,
Time:,11:39:49,,
No. Observations:,178,,
Df Residuals:,163,,
Df Model:,15,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
gender,-0.1353,0.200,-0.677,0.498,-0.527,0.256
age,-0.0404,0.036,-1.131,0.258,-0.110,0.030
employment,0.4858,0.204,2.383,0.017,0.086,0.885
time_mean,-4.834e-05,4.39e-05,-1.102,0.270,-0.000,3.76e-05
subway_ratio,-0.0990,0.452,-0.219,0.827,-0.985,0.787
work_ratio,0.0393,0.601,0.065,0.948,-1.138,1.217
duration_mean,-0.0008,0.000,-2.028,0.043,-0.002,-2.71e-05
HWH_count,0.7135,0.281,2.537,0.011,0.162,1.265
HOH_count,0.8188,0.110,7.468,0.000,0.604,1.034


## MNLogit

In [36]:
iris = sm.datasets.get_rdataset('iris', 'datasets')
y = iris.data.Species
pd.value_counts(y)

virginica     50
setosa        50
versicolor    50
Name: Species, dtype: int64

In [40]:
x = iris.data.iloc[:, 0]
x = sm.add_constant(x, prepend = False)
x

Unnamed: 0,Sepal.Length,const
0,5.1,1.0
1,4.9,1.0
2,4.7,1.0
3,4.6,1.0
4,5.0,1.0
...,...,...
145,6.7,1.0
146,6.3,1.0
147,6.5,1.0
148,6.2,1.0


In [2]:
#导入数据
data = pd.read_csv('data/vaccinum_survey.csv')
data.head()

Unnamed: 0,x11,x12,x13,x14,x15,y17,y18,y19
0,1,2,1,4,2,0,0,2
1,3,2,1,3,4,0,0,1
2,1,4,2,1,2,1,1,0
3,1,1,3,3,3,0,0,2
4,1,4,2,3,4,1,1,0


In [86]:
features = ['x11','x12','x13','x14','x15']
#X = pd.get_dummies(data[features],columns=features,prefix=features)
X = data[features]
y17 = data['y17'].astype(np.object)
y18 = data['y18'].astype(np.object)
y19 = data['y19'].astype(np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y17 = data['y17'].astype(np.object)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y18 = data['y18'].astype(np.object)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y19 = data['y19'].astype(np.object)


In [87]:
#X = sm.add_constant(X,prepend = False)

In [88]:
# y17
mnlogit_mod17 = sm.MNLogit(y17, X)
mnlogit_fit17 = mnlogit_mod17.fit_regularized()

print (mnlogit_fit17.summary())

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.8618784052006642
            Iterations: 111
            Function evaluations: 113
            Gradient evaluations: 111
                          MNLogit Regression Results                          
Dep. Variable:                    y17   No. Observations:                  314
Model:                        MNLogit   Df Residuals:                      299
Method:                           MLE   Df Model:                           12
Date:                Wed, 20 Apr 2022   Pseudo R-squ.:                  0.1064
Time:                        21:23:39   Log-Likelihood:                -270.63
converged:                       True   LL-Null:                       -302.85
Covariance Type:            nonrobust   LLR p-value:                 3.455e-09
     y17=1       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------

In [89]:
# y18
mnlogit_mod18 = sm.MNLogit(y18, X)
mnlogit_fit18 = mnlogit_mod18.fit_regularized()

print (mnlogit_fit18.summary())

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.8208713611029315
            Iterations: 114
            Function evaluations: 115
            Gradient evaluations: 114
                          MNLogit Regression Results                          
Dep. Variable:                    y18   No. Observations:                  314
Model:                        MNLogit   Df Residuals:                      294
Method:                           MLE   Df Model:                           16
Date:                Wed, 20 Apr 2022   Pseudo R-squ.:                  0.1051
Time:                        21:24:05   Log-Likelihood:                -257.75
converged:                       True   LL-Null:                       -288.04
Covariance Type:            nonrobust   LLR p-value:                 4.204e-07
     y18=1       coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------