# Characterizing self-reported navigation ability using demographic information

## Ordinal Logistic Regression

### read data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
cd /content/drive/My Drive/Data/

/content/drive/.shortcut-targets-by-id/1pS1W_QFv_TreubUVne2-kLNgsBSGwhhw/Data


In [3]:
# Load the user table from the current working directory.
df_raw = pd.read_csv('2019user_cleaned.csv')

In [4]:
# Keep only the outcome (navigating_skills) and the demographic predictors used below.
model_columns = [
    'age', 'education', 'gender', 'hand',
    'home_environment', 'navigating_skills', 'sleep', 'travel_time',
]
df = df_raw[model_columns].copy()

In [5]:
# Integer codes for the categorical predictors.
# Pooled levels: education -> {high-school, no-formal} = 0 and {college, university} = 1;
# home_environment -> {suburbs, mixed} = 1.
recode_map = {
    "gender": {'f': 0, 'm': 1},
    "hand": {'right': 0, 'left': 1},
    "education": {"high-school": 0, "no-formal": 0, "college": 1, "university": 1},
    "travel_time": {"less-30-mins": 0, "30-mins-to-1-hour": 1, "hour-plus": 2},
    "home_environment": {"rural": 0, "suburbs": 1, "mixed": 1, "city": 2},
}
# Work on a copy so `df` keeps the original string labels.
df_model = df.copy().replace(recode_map)

In [6]:
# Preview the first rows after recoding.
df_model.head(n=5)

Unnamed: 0,age,education,gender,hand,home_environment,navigating_skills,sleep,travel_time
0,68.0,1,0,0,2,bad,8.0,0
1,18.0,0,1,1,1,good,7.0,0
2,18.0,0,1,1,2,very-good,7.0,2
3,21.0,1,0,0,1,good,7.0,1
4,36.0,1,1,0,0,very-good,5.0,1


In [7]:
from sklearn.preprocessing import StandardScaler

# Standardise (zero mean, unit variance) the numeric and ordinal-coded predictors;
# the binary-coded columns are left as they are.
scaled_cols = ['age', 'sleep', 'home_environment', 'travel_time']
sc = StandardScaler()
df_model[scaled_cols] = sc.fit_transform(df_model[scaled_cols])

In [8]:
# Declare the outcome as an ordered categorical, from worst to best self-rating,
# so the ordinal model sees the level order explicitly.
nav_dtype = pd.CategoricalDtype(
    categories=['very-bad', 'bad', 'good', 'very-good'],
    ordered=True,
)
df_model['navigating_skills'] = df_model['navigating_skills'].astype(nav_dtype)

In [9]:
# Preview the first rows after standardising.
df_model.head(n=5)

Unnamed: 0,age,education,gender,hand,home_environment,navigating_skills,sleep,travel_time
0,2.123644,1,0,0,1.263252,bad,0.676731,-1.086344
1,-1.145045,0,1,1,-0.205049,good,-0.072897,-1.086344
2,-1.145045,0,1,1,1.263252,very-good,-0.072897,1.437202
3,-0.948923,1,0,0,-0.205049,good,-0.072897,0.175429
4,0.031683,1,1,0,-1.67335,very-good,-1.572153,0.175429


# Ordinal Regression Model


**[Ordinal Regression](https://www.statsmodels.org/devel/generated/statsmodels.miscmodels.ordinal_model.OrderedModel.html)**

[Sample Code](https://www.statsmodels.org/devel/examples/notebooks/generated/ordinal_regression.html)

[Logistic Regression](https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/)

[Probit Regression](https://stats.idre.ucla.edu/stata/dae/probit-regression/)

In [10]:
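# Uncomment only if the installed statsmodels does not provide `OrderedModel`
# (this installs the development version straight from GitHub):
#!pip3 install git+https://github.com/statsmodels/statsmodels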

In [11]:
import scipy.stats as stats

from statsmodels.miscmodels.ordinal_model import OrderedModel

In [12]:
# age & gender interaction
age_gender_logit = OrderedModel.from_formula("navigating_skills ~ age*gender", df_model,
                                      distr='logit')
res_a_g_logit = age_gender_logit.fit(method='bfgs')
print(res_a_g_logit.summary())

Optimization terminated successfully.
         Current function value: 0.984056
         Iterations: 21
         Function evaluations: 22
         Gradient evaluations: 22
                             OrderedModel Results                             
Dep. Variable:      navigating_skills   Log-Likelihood:            -8.9496e+05
Model:                   OrderedModel   AIC:                         1.790e+06
Method:            Maximum Likelihood   BIC:                         1.790e+06
Date:                Thu, 21 Oct 2021                                         
Time:                        18:02:39                                         
No. Observations:              909456                                         
Df Residuals:                  909450                                         
Df Model:                           6                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------

In [13]:
# Ordered logit with the age x gender interaction plus all remaining demographic predictors.
full_formula = (
    "navigating_skills ~ age*gender"
    "+hand+sleep+education+travel_time+home_environment"
)
age_gender_logit_al = OrderedModel.from_formula(
    full_formula,
    data=df_model,
    distr='logit',
)
res_a_g_logit_al = age_gender_logit_al.fit(method='bfgs')
print(res_a_g_logit_al.summary())

Optimization terminated successfully.
         Current function value: 0.978821
         Iterations: 34
         Function evaluations: 35
         Gradient evaluations: 35
                             OrderedModel Results                             
Dep. Variable:      navigating_skills   Log-Likelihood:            -8.9019e+05
Model:                   OrderedModel   AIC:                         1.780e+06
Method:            Maximum Likelihood   BIC:                         1.781e+06
Date:                Thu, 21 Oct 2021                                         
Time:                        18:06:03                                         
No. Observations:              909456                                         
Df Residuals:                  909445                                         
Df Model:                          11                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------

### Individual-Level Prediction

In [14]:
# Fitted parameters of the full model: the predictor coefficients come first,
# followed by the `a/b` threshold entries between adjacent outcome levels.
res_a_g_logit_al.params

age                 0.024392
gender              0.899292
age:gender          0.148210
hand               -0.020951
sleep               0.066574
education           0.129803
travel_time         0.186393
home_environment   -0.030488
very-bad/bad       -3.465974
bad/good            0.712730
good/very-good      1.040188
dtype: float64

In [15]:
# In-sample prediction: class probabilities for the same rows the model was fitted on.
# The design matrix is rebuilt by hand, so its column order must match the order of the
# coefficients in `res_a_g_logit_al.params` (the threshold parameters follow them).
exog_cols = ['age', 'gender', 'age:gender', 'hand', 'sleep',
             'education', 'travel_time', 'home_environment']
test = df_model.drop(columns=['navigating_skills'])
test['age:gender'] = test['age'] * test['gender']
# Pass a plain ndarray: with a DataFrame the linear predictor comes back as a pandas
# Series, and statsmodels' `xb[:, None]` indexing on it triggers a pandas warning
# (multi-dimensional Series indexing is not supported by newer pandas).
ynewpred = res_a_g_logit_al.model.predict(
    res_a_g_logit_al.params,
    exog=test[exog_cols].to_numpy(),
    which="prob",
)

  xb = xb[:, None]


In [16]:
# Predicted class for each row = index of the most probable outcome level.
# Vectorised argmax over the probability columns replaces a Python loop over every row;
# ties resolve to the lowest index, exactly as the per-row np.argmax did.
y_class_raw = np.argmax(np.asarray(ynewpred), axis=1).tolist()

In [17]:
# Integer level for each outcome label, in the same order as the categorical's categories,
# so the values line up with the column indices of the predicted probabilities.
nav_map = {"very-bad": 0, "bad": 1, "good": 2, "very-good": 3}
# `map` + integer cast gives a genuinely numeric column. `Series.replace` on a categorical
# column only renames the categories (the result stays categorical) and is deprecated in
# recent pandas. The cast raises if any label is missing or not in `nav_map`, rather than
# letting NaNs slip into the evaluation.
df_model['nav_level'] = df_model['navigating_skills'].map(nav_map).astype('int64')

In [18]:
# Observed outcome levels as a plain Python list, aligned row-for-row with the predictions.
nav_level_values = df_model['nav_level'].values
true_class = nav_level_values.tolist()

In [20]:
from sklearn.metrics import confusion_matrix

# Rows are the observed levels, columns the predicted levels (sklearn convention).
nav_confusion = confusion_matrix(y_true=true_class, y_pred=y_class_raw)
nav_confusion

array([[     0,      0,  17731,    425],
       [     0,      0,  99060,   2881],
       [     0,      0, 481638,  26518],
       [     0,      0, 252931,  28272]])

In [21]:
# Overall accuracy = share of rows whose predicted level equals the observed level.
# Computed from the label vectors instead of hand-copying counts from the printed
# confusion matrix, so the figure stays correct if the data or the model changes.
float(np.mean(np.asarray(true_class) == np.asarray(y_class_raw)))

0.5606758325856336