In [None]:
#Import libraries
import os, json, time, pandas_profiling, warnings
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np
from datetime import date, datetime
import calendar
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams

#from docx import Document
#from docx.shared import Inches

#from mlxtend.frequent_patterns import apriori
#from mlxtend.frequent_patterns import association_rules

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style"))

pd.set_option('display.float_format', lambda x: '%.2f' % x)
warnings.filterwarnings('ignore')


%matplotlib inline
pd.set_option('display.max_columns', 500)
#distance plot - titles in plots
rcParams['axes.titlepad'] = 45
rcParams['font.size'] = 16


In [None]:
%%javascript
// Replace the output area's `_should_scroll` check with a function that always
// returns false, whatever `lines` is.
// NOTE(review): presumably this keeps long outputs from being placed in a
// scrollable box; `_should_scroll` is a private frontend hook, so confirm it
// still exists in the notebook frontend in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Settings - supported values of `analysis` (the keys of ANALYSIS_CONFIGS):
# complete
# merged
# merged-no-outliers_quant_001
# merged-no-outliers_quant_002
analysis = 'merged-no-outliers_quant_001'

# Input pickle, image folder and report name for every analysis variant.
# Selecting through this table makes `analysis` the single switch, instead of
# hand-editing commented-out branches.
# The 'no-outliers' variant was already disabled in the original notebook
# (legs 'all_legs_final_ds_user_info_no_outlier.pkl', images 'img_nooutliers/',
# report 'Results_01.05_30.07_nooutliers.docx') and is intentionally not selectable.
ANALYSIS_CONFIGS = {
    'complete': {
        'legs': 'all_legs_final_ds_user_info.pkl',
        'img_path': 'img/',
        'report_name': 'Results_01.05_15.10.docx',
    },
    'merged': {
        'legs': 'all_legs_merged_1.pkl',
        'img_path': 'img_merged/',
        'report_name': 'Results_01.05_15.10_merged.docx',
    },
    'merged-no-outliers_quant_001': {
        'legs': 'all_legs_merged_no_outlier_0.01.pkl',
        'img_path': 'img_merged_nooutliers/',
        'report_name': 'Results_01.05_15.10_merged_nooutliers_0.01.docx',
    },
    'merged-no-outliers_quant_002': {
        'legs': 'all_legs_merged_no_outlier_quant_002.pkl',
        'img_path': 'img_merged-no-outliers_quant_002/',
        'report_name': 'Results_01.05_30.07_merged-no-outliers_quant_002.docx',
    },
}

if analysis not in ANALYSIS_CONFIGS:
    raise ValueError(
        'Unknown analysis %r; expected one of %s' % (analysis, sorted(ANALYSIS_CONFIGS))
    )

legs = ANALYSIS_CONFIGS[analysis]['legs']
img_path = ANALYSIS_CONFIGS[analysis]['img_path']
# The docx report export is disabled in this notebook, so no `report_name`
# global is created; the name stays available in ANALYSIS_CONFIGS.
#report_name = ANALYSIS_CONFIGS[analysis]['report_name']

# exist_ok=True creates the folder only when missing, without a separate
# exists() check that could race with another process.
os.makedirs(img_path, exist_ok=True)


#Global variables
cutting_date = '2019-05-01' # remove trips and data published before this date
meta_data_path = '../../data-campaigns/meta-data/'
input_path = '../../out_2019.10.15/'
report_path = '../reports/'

### Read data

- `all_legs_final_ds_user_info`: all data about trips, legs and users
- `trips_users_df`: match trip-user with date info
- `trips_df`: original df with trip info
- `values_from_trip`: for each leg the values for Productivity (paid work + personal tasks), Enjoyment, Fitness

In [None]:
# Read the pre-processed datasets; every file lives under `input_path`.
def _read_input(file_name):
    """Unpickle `file_name` located in the `input_path` folder."""
    return pd.read_pickle(input_path + file_name)

all_legs_final_ds_user_info = _read_input(legs)
trips_users_df = _read_input('trips_users_df.pkl')
# users_df_with_trips = pd.read_pickle(out_path + 'pre-processed_ds/users_df_with_trips.pkl')
trips_df = _read_input('trips_df_geoinfo.pkl')
values_from_trip = _read_input('values_from_trip.pkl')

# Quick look at the values-from-trip table: dimensions, then the first rows.
print(values_from_trip.shape)
values_from_trip.head()

### Preprocessing on `values_from_trip`

In [None]:
# Available categories ['Paid_work', 'Personal_tasks', 'Enjoyment', 'Fitness', 'Unknown']
# remove unknown from the categories
tmp0 = values_from_trip[values_from_trip.valueFromTrip != 'Unknown']

### Create a new df with this structure:
# legid, Enjoyment, Fitness, Paid_work, Personal_tasks, wastedTime, Productivity

# select only the columns we need
tmp = tmp0[['legid', 'valueFromTrip', 'value']]
# one row per leg, one column per category: legid, E, F, Pw, Pt
# (pivot raises if a (legid, category) pair occurs more than once)
tmp2 = tmp.pivot(index='legid', columns='valueFromTrip', values='value').reset_index()
# add the WT column; how='left' keeps every pivoted leg
tmp3 = pd.merge(tmp2, all_legs_final_ds_user_info[['legid', 'wastedTime']], on='legid', how='left')
# remove rows with NaN in WT
tmp4 = tmp3[tmp3.wastedTime.notna()]

# Convert WT to a numeric dtype BEFORE the range filter, so the comparison in
# between() is always numeric, and work on an explicit copy so the column
# assignments never target a view of tmp3/tmp4.
tmp5 = tmp4.copy()
tmp5['wastedTime'] = pd.to_numeric(tmp5['wastedTime'])
# keep only values of WT in [1, 5] (both bounds inclusive)
tmp5 = tmp5[tmp5['wastedTime'].between(1, 5)].copy()
# round WT to whole numbers (the dtype is left as produced by np.round)
tmp5['wastedTime'] = np.round(tmp5['wastedTime'])

# merge Paid_work and Personal_tasks into Productivity
# (!!) considering the MAXIMUM value; max(axis=1) skips NaN, so Productivity
# is NaN only when both source columns are NaN
tmp5['Productivity'] = tmp5[['Paid_work', 'Personal_tasks']].max(axis=1)

values_from_trip2 = tmp5.copy()
print('Final shape:', values_from_trip2.shape)
values_from_trip2.head()

In [None]:
# Write the per-leg table to a CSV file in the current working directory
# (the DataFrame index is written as the first column).
values_from_trip2.to_csv(path_or_buf='values_from_trip2.csv')


In [None]:
# Number of legs, per wastedTime level, whose Enjoyment, Fitness and
# Productivity are all equal to 0.
all_zero = (values_from_trip2[['Enjoyment', 'Fitness', 'Productivity']] == 0).all(axis=1)
test = (
    values_from_trip2[all_zero]
    .groupby('wastedTime')
    .size()
    .reset_index(name='#leg000')
)
test

In [None]:
import mord
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge


# Predictors: the three value-from-trip scores; target: the wastedTime rating.
X = values_from_trip2[['Enjoyment', 'Fitness', 'Productivity']]
y = values_from_trip2['wastedTime']


# NOTE(review): the model is fitted and evaluated on the same rows (no
# train/test split), so the predictions below are in-sample.
mul_lr = mord.OrdinalRidge(alpha=0.001,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    max_iter=None,
    tol=0.001,
    solver='auto').fit(X, y)

# Only the last expression of a cell is rendered, so intermediate results are
# printed / displayed explicitly instead of being left as bare expressions.
print('Coefficients: ', mul_lr.coef_)
# Adds (or overwrites) the 'pred' column on values_from_trip2.
values_from_trip2['pred'] = mul_lr.predict(X)
display(values_from_trip2[values_from_trip2['wastedTime'] == 1].head(10))
# Distinct predicted values.
values_from_trip2['pred'].unique()

## Correlation and Association analysis

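1. Distribution of all the variables
2. Conditional distribution of Productivity, Enjoyment and Fitness (PEF) with respect to wastedTime (WT)
3. Average WT for each value of the PEF sum
4. Chi-squared test of association and Cramér's V - each of P, E and F against WT
5. Comparison of the average P, E and F across WT levels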

***Distribution of all the variables***

In [None]:
## Distribution of Wasted Time variable - absolute and relative frequencies
wt_counts = values_from_trip2.wastedTime.value_counts()
# absolute counts as a one-column frame, then the share of all legs alongside
tmp = wt_counts.to_frame()
tmp['rel_wastedTime'] = wt_counts / len(values_from_trip2)
tmp

In [None]:
# ## General distributions of variables
# from matplotlib import rcParams
# rcParams['axes.titlepad'] =5

# fig, axs = plt.subplots(2,3, figsize=(15,7))
# plt.subplots_adjust(top=1)

# for idx,ax in list(enumerate(axs.flat)):
#     print(idx)
#     col_name = list(values_from_trip2.columns)[idx+1]
# col_name

In [None]:
## General distributions of variables
from matplotlib import rcParams
rcParams['axes.titlepad'] =5

# Columns are selected by NAME: positional access (iloc[:, idx+1]) silently
# plots different variables as soon as the column order of values_from_trip2
# changes. The list follows the order produced by the preprocessing cell.
dist_columns = ['Enjoyment', 'Fitness', 'Paid_work', 'Personal_tasks',
                'wastedTime', 'Productivity']

fig, axs = plt.subplots(2,3, figsize=(15,7))
plt.subplots_adjust(top=1)

for ax, col_name in zip(axs.flat, dist_columns):
    col_values = values_from_trip2[col_name]
    # Equal weight 1/N per row, so bar heights are relative frequencies.
    weights = np.full(len(col_values), 1. / len(col_values))
    ax.hist(col_values, weights=weights)
    ax.set_title(col_name)

    n_levels = len(col_values.unique())
    if col_name == 'wastedTime':
        # wastedTime starts at 1, so ticks and the left limit start at 1.
        ax.set_xticks(range(1, n_levels + 1))
        ax.set_xlim(left=1)
    else:
        # NOTE(review): assumes the levels are 0..n_levels-1 - confirm.
        ax.set_xticks(range(n_levels))

***Conditional distribution of PEF wrt WT***

In [None]:
# One Enjoyment histogram per wastedTime level; y-axes are not shared between panels.
cond_plot = sns.FacetGrid(values_from_trip2, col='wastedTime', sharey=False).map(plt.hist, 'Enjoyment');

In [None]:
# One Fitness histogram per wastedTime level; y-axes are not shared between panels.
cond_plot = sns.FacetGrid(values_from_trip2, col='wastedTime', sharey=False).map(plt.hist, 'Fitness');

In [None]:
# One Productivity histogram per wastedTime level; y-axes are not shared between panels.
cond_plot = sns.FacetGrid(values_from_trip2, col='wastedTime', sharey=False).map(plt.hist, 'Productivity');

***Average of WT wrt PEF sum***

In [None]:
# add the sum (sum(axis=1) skips NaN, so a missing score counts as 0)
values_from_trip2['PEF'] = values_from_trip2[['Enjoyment', 'Fitness', 'Productivity']].sum(axis=1)
# Mean WT for each PEF sum. Only wastedTime is averaged: `legid` is an
# identifier, so its mean is meaningless, and averaging a non-numeric column
# raises in recent pandas versions.
pef_sum = values_from_trip2.groupby('PEF')[['wastedTime']].mean()
pef_sum

### Interpretation: legs with sum of Enjoyment, Fitness and Productivity equal to 0 
# have 3 as wastedTime *on average*.

***Chi-squared association and Cramer's V***

Evaluate the association between:
- Enjoyment and wastedTime
- Fitness and wastedTime
- Productivity and wastedTime

Ref: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

Cramer's V:
- 0: no association
- 1: complete association

In [None]:
from scipy.stats import chi2_contingency

def cramer_v(tab, correction=True):
    """Return Cramér's V for a two-way contingency table.

    V = sqrt((chi2 / n) / min(k - 1, r - 1)), where chi2 is the chi-squared
    statistic of the table, n the total count, and r, k the numbers of rows
    and columns.

    Parameters
    ----------
    tab : 2-D array-like (e.g. the DataFrame returned by ``pd.crosstab``)
        Observed frequencies.
    correction : bool, default True
        Forwarded to ``scipy.stats.chi2_contingency``. With the default, scipy
        applies Yates' continuity correction to tables with one degree of
        freedom (2x2), which keeps V below 1 even for a perfect association;
        pass ``False`` for the uncorrected statistic.

    Returns
    -------
    float
        Cramér's V, or ``nan`` when the table has a single row or column
        (the statistic is undefined there).
    """
    # Works for DataFrames and plain arrays alike.
    observed = np.asarray(tab)
    r, k = observed.shape
    min_dim = min((k - 1), (r - 1))
    if min_dim == 0:
        # Avoid a division by zero on degenerate tables.
        return np.nan

    chi2 = chi2_contingency(observed, correction=correction)[0]
    n = observed.sum()
    phi2 = chi2 / n

    return np.sqrt(phi2 / min_dim)

# Build each contingency table (wastedTime x variable) once and reuse it for
# both Cramér's V and the chi-squared p-value.
ct_enj = pd.crosstab(values_from_trip2.wastedTime, values_from_trip2.Enjoyment)
ct_fit = pd.crosstab(values_from_trip2.wastedTime, values_from_trip2.Fitness)
ct_pro = pd.crosstab(values_from_trip2.wastedTime, values_from_trip2.Productivity)

CV_enj = cramer_v(ct_enj)
CV_fit = cramer_v(ct_fit)
CV_pro = cramer_v(ct_pro)
print("Cramer's V")
print('E:', CV_enj, ' - F:', CV_fit, ' - P:', CV_pro)
print()

# Element [1] of chi2_contingency's result is the p-value.
print('chi squared test')
print('E:', chi2_contingency(ct_enj)[1],
      ' - F:', chi2_contingency(ct_fit)[1],
      ' - P:', chi2_contingency(ct_pro)[1])

### Interpretation:
# Cramer's V is about 0.3 for Enjoyment vs wastedTime (value reported by the
# original analysis), on a scale from 0 (no association) to 1 (complete association).

## Chi-Squared test of independence
# H0: the two variables are independent (no association)
# H1: the two variables are associated
# A p-value below the chosen significance level rejects H0. The original
# analysis reports that H0 is rejected, i.e. the association with wastedTime
# is statistically significant.

***Comparison on average WT versus PEF***

In [None]:
# Mean Enjoyment for every wastedTime level.
pd.pivot_table(values_from_trip2, values='Enjoyment', index='wastedTime', aggfunc='mean')
# Reading of the table (author's note):
# - wastedTime 1,2   -> Enjoyment is 0 *on average*
# - wastedTime 3,4,5 -> Enjoyment is 1 *on average*

In [None]:
# Mean Fitness for every wastedTime level.
pd.pivot_table(values_from_trip2, values='Fitness', index='wastedTime', aggfunc='mean')
# Reading of the table (author's note):
# - wastedTime 1,2,3 -> Fitness is 0 *on average*
# - wastedTime 4,5   -> Fitness is 1 *on average*

In [None]:
# Mean Productivity for every wastedTime level, rounded to whole numbers.
pd.pivot_table(values_from_trip2, values='Productivity', index='wastedTime', aggfunc='mean').round()
# Reading of the table (author's note):
# - wastedTime 1,2   -> Productivity is 0 *on average*
# - wastedTime 3,4,5 -> Productivity is 1 *on average*

In [None]:
# Mean of Enjoyment, Fitness and Productivity for every wastedTime level.
values_from_trip2.groupby('wastedTime')[['Enjoyment', 'Fitness', 'Productivity']].mean()
# Reading of the table (author's note), *on average*:
# - wastedTime 1 -> 0 for PEF
# - wastedTime 2 -> 0 for PEF
# - wastedTime 3 -> 0 for F and 1 for PE
# - wastedTime 4 -> 1 for PEF
# - wastedTime 5 -> 1 for PEF

### Example: Walking dataset

Considering only legs with `transp_category` equal to `Walking` 

In [None]:
# Transport category to analyse (must match a value of `transp_category` exactly).
transp_cat = 'Walking'
x = all_legs_final_ds_user_info[['legid', 'transp_category']]
# Attach the transport category to every leg of values_from_trip2.
trasnp = pd.merge(values_from_trip2, x, on='legid', how='left')
print(trasnp.transp_category.unique())
# Take an explicit copy of the filtered rows: a later cell assigns a column
# on `trasnp`, which must not target a slice of the merged frame.
trasnp = trasnp[trasnp.transp_category == transp_cat].copy()
trasnp.head(3)

In [None]:
# Long format: one row per (leg, score), with wastedTime kept as the id column,
# the score name in 'element' and the score value in 'Val'.
df = pd.melt(
    trasnp[['Enjoyment', 'Fitness', 'Productivity', 'wastedTime']],
    id_vars='wastedTime',
    var_name='element',
    value_name='Val',
)
df.head()

In [None]:
# Number of rows for every (wastedTime, element, Val) combination.
df1 = (
    df.groupby(['wastedTime', 'element', 'Val'])
    .size()
    .reset_index(name='freq')
)
df1.head()

In [None]:

# One panel per wastedTime level (1..5): frequency of each score value ('Val')
# for Enjoyment, Fitness and Productivity.
fig, axs = plt.subplots(1,5, figsize=(15,7))
# plt.subplots_adjust(top=1)

for idx, ax in enumerate(axs.flat):
    wt_level = idx + 1
    # Draw directly on the axes created by plt.subplots (ax=ax) instead of
    # re-selecting them through the pyplot state machine.
    sns.barplot(data=df1[df1['wastedTime'] == wt_level],
                x="element", y='freq', hue='Val', ax=ax)
    # The x axis shows the score name ('element'); the wastedTime level is
    # given by the panel title.
    ax.set(xlabel='element', ylabel='Freq')
    ax.set_title('WastedTime ' + str(wt_level), y=1.)

plt.tight_layout()

    

In [None]:
# Frequency rows for wastedTime level 1 only.
df1.loc[df1['wastedTime'].eq(1)]

In [None]:
# cond_plot = sns.FacetGrid(data=df1, col='wastedTime', hue='element', sharey=False) #, hue='CentralAir', col_wrap=4)
# cond_plot.map(sns.barplot,  "Val", 'freq').add_legend()

In [None]:
# cond_plot = sns.FacetGrid(data=trasnp, col='wastedTime', sharey=False) #, hue='CentralAir', col_wrap=4)
# cond_plot.map(plt.hist, 'Fitness');

In [None]:
# cond_plot = sns.FacetGrid(data=trasnp, col='wastedTime', sharey=False) #, hue='CentralAir', col_wrap=4)
# cond_plot.map(plt.hist, 'Productivity');

In [None]:
# Mean of Enjoyment, Fitness and Productivity per wastedTime level (selected transport category only).
trasnp.groupby('wastedTime')[['Enjoyment', 'Fitness', 'Productivity']].mean()
# Reading of the table (author's note): wastedTime 1 -> 0 for PEF *on average*

In [None]:
# Display the current target vector `y`.
# NOTE(review): at this point in a top-to-bottom run `y` still holds the
# wastedTime column of ALL legs (set in the first model cell), not of the
# selected transport category - looks like leftover debugging output; confirm.
y

In [None]:
import mord
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge


# Same model as for all legs, restricted to the selected transport category.
# NOTE: this overwrites the notebook-level X, y and mul_lr.
X = trasnp[['Enjoyment', 'Fitness', 'Productivity']]
y = trasnp['wastedTime']


mul_lr = mord.OrdinalRidge(alpha=0.001,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    max_iter=None,
    tol=0.001,
    solver='auto').fit(X, y)

print('Coefficients: ', mul_lr.coef_)
# assign() returns a new frame, so the 'pred' column is never written into a
# filtered slice of another DataFrame; an existing 'pred' column is replaced.
trasnp = trasnp.assign(pred=mul_lr.predict(X))
# Only the last expression of a cell is rendered, so show the sample explicitly.
display(trasnp[trasnp['wastedTime'] == 1].head(10))
# Distinct predicted values.
trasnp['pred'].unique()

In [None]:
# Attach the transport category to every leg of values_from_trip2
# (left join on legid keeps all legs of values_from_trip2).
x = all_legs_final_ds_user_info.loc[:, ['legid', 'transp_category']]
df_0 = values_from_trip2.merge(x, on='legid', how='left')
df_0.head()

In [None]:
# Keep only the legs whose Enjoyment, Fitness and Productivity are all 0.
zero_scores = (df_0[['Enjoyment', 'Fitness', 'Productivity']] == 0).all(axis=1)
df_0 = df_0[zero_scores]
df_0.head()

In [None]:
# Number of rows of df_0 for each wastedTime level.
df_0.groupby('wastedTime').size()