Objective: Expand NPS Predictive Model based on Flight Control/Treatment pairs to newer datasets and also to more apps than just Excel.

In [1]:
import math
import pandas as pd
import numpy as np
import datetime
import sklearn.tree
from graphviz import Source
from IPython.display import SVG
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFE
import os

from sklearn.linear_model import LogisticRegressionCV, LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats

def transform_rating(rating):
    '''Input: Column of Data with NPS Field (on a scale of 1 to 5...)
    Output: Column with either Promoter/Detractor/Passive label, or the corresponding weights, based on datatype'''
    
    if rating == 5:
        return 100
    elif rating <= 3:
        return -100
    elif rating == 4:
        return 0
    else:
        return np.nan

In [2]:
Word_df = pd.read_csv('Word_updated_dataOct.tsv', sep='\t')

In [3]:
Word_df.head()

Unnamed: 0,OcvId,ProcessSessionId,Date,LongDate,Platform,Product,Rating,SurveyRatingScale,OriginalText,Verbatim,...,Skus,rn,FlightId,WacSessionId,WACSessionID,Application,ApplicationMode,ApplicationLCID,Host,rn1
0,flnps_v2_dda9bf5625245a989e34b176c130e96c,477f17f3-a06d-4d50-8a33-8d9ca38206f4,10/15/2019 2:40:08 AM,2019-10-15,Web,Word,5,5,,,...,,1,canary-docowner-t,477f17f3-a06d-4d50-8a33-8d9ca38206f4,477f17f3-a06d-4d50-8a33-8d9ca38206f4,Word,Unified,en-US,SharePoint Online,1
1,flnps_v2_d82877cc43795053a9dd48af73202391,481f12ae-04b2-4417-a35f-e2a4a08240de,7/30/2019 2:41:00 AM,2019-07-30,Web,Word,4,5,Because i like it,Because i like it,...,,1,NoFlight,481f12ae-04b2-4417-a35f-e2a4a08240de,481f12ae-04b2-4417-a35f-e2a4a08240de,Word,Unified,en-GB,SharePoint Online,1
2,flnps_v2_6b9d9aeb09665579b885b966ef0383c1,4fdd7235-92de-4d5f-9e0c-65a65410d534,10/28/2019 1:40:28 AM,2019-10-28,Web,Word,4,5,,,...,,1,canary-docowner-t,4fdd7235-92de-4d5f-9e0c-65a65410d534,4fdd7235-92de-4d5f-9e0c-65a65410d534,Word,Unified,en-US,OneDriveWOPI,1
3,flnps_v2_86fe37562d1159bea1ead15901571d0c,20aead01-99af-400d-a7ee-50739fa4ddbe,9/23/2019 10:49:34 AM,2019-09-23,Web,Word,5,5,,,...,,1,canary-docowner-t,20aead01-99af-400d-a7ee-50739fa4ddbe,20aead01-99af-400d-a7ee-50739fa4ddbe,Word,Unified,en-US,SharePoint Online,1
4,flnps_v2_7edad9a9c9155ef7abb8b10f5852926c,012a8457-ca90-4c2c-a4cc-d96e61f52c1f,10/15/2019 12:16:01 PM,2019-10-15,Web,Word,5,5,by having autocorrect please],by having autocorrect please],...,,1,NoFlight,012a8457-ca90-4c2c-a4cc-d96e61f52c1f,012a8457-ca90-4c2c-a4cc-d96e61f52c1f,Word,Unified,en-US,SharePoint Online,1


In [4]:
Excel_df = pd.read_csv('Excel_updated_dataOct.tsv', sep='\t')
PP_df = pd.read_csv('PowerPoint_updated_dataOct.tsv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
PP_df.head()

Unnamed: 0,OcvId,ProcessSessionId,Date,LongDate,Platform,Product,Rating,SurveyRatingScale,OriginalText,Verbatim,...,Skus,rn,FlightId,WacSessionId,WACSessionID,Application,ApplicationMode,ApplicationLCID,Host,rn1
0,flnps_v2_b59dd6b09c805661bc7face573961b82,477e30ec-a5ad-4052-af7a-5cbedcc7d96b,10/27/2019 2:04:04 PM,2019-10-27,Web,PowerPoint,5,5,"Facilita a edição, é prático e oferece bons de...","Facilitates editing, is practical and offers g...",...,,1,canary-c,477e30ec-a5ad-4052-af7a-5cbedcc7d96b,477e30ec-a5ad-4052-af7a-5cbedcc7d96b,PowerPoint,Unified,pt-BR,OneDriveWOPI,1
1,flnps_v2_b3397471bc1158b9b359291a1da871ab,481cce98-912d-66ac-366b-81cb3731d61a,6/4/2019 1:34:26 PM,2019-06-04,Web,PowerPoint,3,5,,,...,,1,canary-c,481cce98-912d-66ac-366b-81cb3731d61a,481cce98-912d-66ac-366b-81cb3731d61a,PowerPoint,Unified,en-US,SharePoint Online,1
2,flnps_v2_cb448186b6ad52809d8199a77b9772ae,4fd742a2-1196-486b-9518-68b8530a1fe4,5/4/2019 8:10:28 PM,2019-05-04,Web,PowerPoint,5,5,,,...,,1,canary-t,4fd742a2-1196-486b-9518-68b8530a1fe4,4fd742a2-1196-486b-9518-68b8530a1fe4,PowerPoint,Unified,pt-BR,OneDriveWOPI,1
3,flnps_v2_6722df959f745987b2718d2ae4b17606,20aeb498-8576-4915-a6e1-fa4697445d3c,5/31/2019 7:55:06 PM,2019-05-31,Web,PowerPoint,5,5,,,...,,1,canary-t,20aeb498-8576-4915-a6e1-fa4697445d3c,20aeb498-8576-4915-a6e1-fa4697445d3c,PowerPoint,Unified,en-US,SharePoint Online,1
4,flnps_v2_5eeca61dece15306ac756297f59dccbe,01339fec-c71d-4cd9-a486-22a0c14c5a49,7/1/2019 2:25:29 PM,2019-07-01,Web,PowerPoint,3,5,,,...,,1,afd_ignorewaccluster,01339fec-c71d-4cd9-a486-22a0c14c5a49,01339fec-c71d-4cd9-a486-22a0c14c5a49,PowerPoint,Unified,en-US,OneDriveWOPI,1


In [6]:
def make_AB_dataset(df,prefixes):
    '''df: initial input dataframe of flight data
    prefixes: flights have prefixes - this arg is a list for which prefixes we want to filter on'''
    df = df[df['AudienceGroup']=='Production']
    df['Date'] = pd.to_datetime(df['Date'])
    flights = df.FlightId.astype(str).unique()
    controls = [f for f in flights if f.endswith('-c') or f.__contains__('control')]
    treatments = [f for f in flights if f.endswith('-t') or f.__contains__('treatment')]
    neither = [f for f in flights if (f not in controls) and (f not in treatments)]
    union = {'controls':[c.rstrip('control') for c in controls],'treatments':[t.rstrip('treatments') for t in treatments]}
    no_control = list(set(union['treatments'])-set(union['controls']))
    control_treatment_pairs = list(set(union['treatments'])-set(no_control))
    df['FlightPair']=df['FlightId'].astype(str).map(lambda x: x.rstrip('control'))
    df['FlightPair']=df['FlightPair'].map(lambda x: x.rstrip('treatment'))
    #df['FlightPair'].replace('docowner-canary','canary-docowner',inplace=True)
    
    
    ab_df = df[df.FlightId.notnull()]
    ab_df.drop_duplicates(keep='last', inplace=True)
    print(ab_df.shape, ' before filtering out non-pairs')
    ab_df = ab_df[ab_df['FlightPair'].isin(control_treatment_pairs)]
    print(ab_df.shape, ' after filtering out non-pairs')
    ab_df.loc[ab_df.FlightId.str.endswith('-c'),'Group'] = 'Control'
    ab_df.loc[ab_df.FlightId.str.endswith('control'),'Group'] = 'Control'
    ab_df.loc[ab_df.FlightId.str.endswith('-t'),'Group'] = 'Treatment'
    ab_df.loc[ab_df.FlightId.str.endswith('treatment'),'Group'] = 'Treatment'
    ab_df.loc[ab_df.FlightId.str.endswith('-c'),'Flight'] = 0
    ab_df.loc[ab_df.FlightId.str.endswith('control'),'Flight'] = 0
    ab_df.loc[ab_df.FlightId.str.endswith('-t'),'Flight'] = 1
    ab_df.loc[ab_df.FlightId.str.endswith('treatment'),'Flight'] = 1
    ab_df = ab_df[ab_df.Flight.notnull()]
    ab_df['NPS'] = ab_df['Rating'].apply(transform_rating)
    ab_df = ab_df.sort_values(by='Date')
    value_key = ab_df.sort_values(by='Date').groupby(['OcvId'])['NPS'].last().to_dict()
    exp_df = ab_df.groupby(['OcvId','FlightPair'])['Flight'].last().unstack()
    print('Feature Matrix should have ',ab_df.OcvId.nunique(), ' rows and ',ab_df.FlightPair.nunique(),' columns')
    print('Final Shape:',exp_df.shape)
    if prefixes: #i.e. if the input list is empty:
        for p in prefixes:
            exp_df = exp_df.iloc[:,exp_df.columns.str.startswith(p)]
    exp_df['NPS'] = exp_df.index.map(value_key)
    return exp_df.fillna(0)

#excel_df = make_AB_dataset(Excel_df,['xls'])

In [7]:
####### JUST READ THE CSV IF YOUVE RAN ALREADY #######

def get_flight_durations():
    ''' no inputs/arguments, just make sure you have all of the Tabular Flight data files you are using.'''
    df = pd.concat([Excel_df[Excel_df['AudienceGroup']=='Production'][Excel_df.FlightId.notnull()],
                    Word_df[Word_df['AudienceGroup']=='Production'][Word_df.FlightId.notnull()],
                    PP_df[PP_df['AudienceGroup']=='Production'][PP_df.FlightId.notnull()]])
    #df = df[df['AudienceGroup']=='Production']
    df['Date'] = pd.to_datetime(df['Date'])
    flights = df.FlightId.astype(str).unique()
    controls = [f for f in flights if f.endswith('-c') or f.__contains__('control')]
    treatments = [f for f in flights if f.endswith('-t') or f.__contains__('treatment')]
    neither = [f for f in flights if (f not in controls) and (f not in treatments)]
    union = {'controls':[c.rstrip('control') for c in controls],'treatments':[t.rstrip('treatments') for t in treatments]}
    no_control = list(set(union['treatments'])-set(union['controls']))
    control_treatment_pairs = list(set(union['treatments'])-set(no_control))
    df['FlightPair']=df['FlightId'].astype(str).map(lambda x: x.rstrip('control'))
    df['FlightPair']=df['FlightPair'].map(lambda x: x.rstrip('treatment'))
    
    print('Flight Pairs Assigned')
    
    ab_df = df[df.FlightPair.notnull()]
    #ab_df.drop_duplicates(keep='last', inplace=True)
    ab_df = ab_df[ab_df['FlightPair'].isin(control_treatment_pairs)]
    flight_starts = ab_df.sort_values(by='Date').groupby(['FlightPair'])['Date'].first()
    flight_ends = ab_df.sort_values(by='Date').groupby(['FlightPair'])['Date'].last()
    flight_durations = pd.concat([flight_starts,flight_ends],axis=1)
    flight_durations.columns = ['FlightStart','FlightEnd']
    return flight_durations
    
flight_durations = get_flight_durations()

Flight Pairs Assigned


In [7]:
flight_durations.to_csv('FlightsOctober.csv')


In [8]:
excel_df = make_AB_dataset(Excel_df,[])
word_df = make_AB_dataset(Word_df,[])
pp_df = make_AB_dataset(PP_df,[])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(875085, 40)  before filtering out non-pairs
(625273, 40)  after filtering out non-pairs
Feature Matrix should have  23319  rows and  145  columns
Final Shape: (23319, 145)
(1497298, 40)  before filtering out non-pairs
(1075637, 40)  after filtering out non-pairs
Feature Matrix should have  25139  rows and  240  columns
Final Shape: (25139, 240)
(428103, 40)  before filtering out non-pairs
(242723, 40)  after filtering out non-pairs
Feature Matrix should have  18610  rows and  62  columns
Final Shape: (18610, 62)


In [9]:
excel_flights = list(excel_df.columns)
word_flights = list(word_df.columns)
common_flights = list(set(excel_flights).intersection(word_flights))
pp_flights = list(pp_df.columns)
common_flights = list(set(common_flights).intersection(pp_flights))

In [10]:
common_flights.remove('NPS')

In [11]:
common_flights

['canary-',
 'wac-wordimagegroupcontainerfittopage-',
 'wac-box4secureuuid-',
 'wacdash-',
 'wac-wordclearformattingtrackchanges-',
 'wac-owaunifiedapp-',
 'wac-wordcloneatmentionpropertiesinhyperlink-',
 'wac-wordwordoauthtestappaadtokenretrieval-',
 'wac-mergemissingeopfix-',
 'wac-licensingisenabled-',
 'wac-worduseaadforfileupload-',
 'wac-wordeditorcashdashisactivecoauth-',
 'docowneridtestaa-',
 'xls-customsortforallrangetypes-',
 'xls-removescrolltofirstandlastsheettabnavbuttons-',
 'wac-wordsdxpreloadpackagelist-',
 'wac-minimizeintelligentplaceholderwork-',
 'xls-overridebrowsershortcuts-',
 'wac-wordpreserveverbonredirect-',
 'wac-worddefinenewbullettablecellclassname-',
 'docowner-canary-',
 'wac-oauthtestappmsatokenretrieval-',
 'firstrelease-',
 'wac-box4directprinthidesensitiveparameters-',
 'wac-oauthmsatokenretrieval-',
 'canary-user-',
 'wac-worduseoauthfullyfordlp-',
 'canary-docowner-',
 'canary2-',
 'wac-worddeprioritizerichtextcontentcontrolacetateforcoauth-',
 'us

In [12]:
dc = ['docowner-canary-','canary-docowner-',
     'canary2','canary-']

In [13]:
X = {'excel':excel_df.iloc[:,excel_df.columns.str.startswith('xls')],
     'word':word_df.iloc[:,word_df.columns.str.startswith('wac')],
     'pp':pp_df.iloc[:,pp_df.columns.str.startswith('pp')],
     'all':pd.concat([excel_df,word_df,pp_df]).loc[:,common_flights]}
y = {'excel':excel_df['NPS'].replace([100,0],1).replace(-100,0)
     ,'word':word_df['NPS'].replace([100,0],1).replace(-100,0),
     'pp':pp_df['NPS'].replace([100,0],1).replace(-100,0),
     'all':pd.concat([excel_df,word_df,pp_df])['NPS'].replace([100,0],1).replace(-100,0)}

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [14]:
print(X['excel'].shape, y['excel'].shape)
print(X['word'].shape, y['word'].shape)
print(X['pp'].shape, y['pp'].shape)
print(X['all'].shape, y['all'].shape)

(23319, 104) (23319,)
(25139, 220) (25139,)
(18610, 18) (18610,)
(67068, 35) (67068,)


In [15]:
model_perfs = pd.DataFrame(columns=['Logistic','RandomForest','GradientBoosting'])
for key in ['excel','word','pp','all']:
########## WHEN PREDICTORS ARE BINARY (1 for Treatment, 0 otherwise) ###############
    logit = LogisticRegression().fit(X[key],y[key])
    rf = RandomForestClassifier(random_state=0).fit(X[key],y[key])
    gb = xgb.XGBClassifier(random_state=0, n_jobs=4).fit(X[key],y[key])
    
    log_acc = round(logit.score(X[key],y[key]),3)
    rf_acc = round(rf.score(X[key],y[key]),3)
    gb_acc = round(gb.score(X[key],y[key]),3)

    model_perfs.loc[key+'Accuracy']=[log_acc,rf_acc,gb_acc]



In [19]:
model_perfs

Unnamed: 0,Logistic,RandomForest,GradientBoosting
excelAccuracy,0.724,0.839,0.725
wordAccuracy,0.716,0.966,0.716
ppAccuracy,0.755,0.756,0.755
allAccuracy,0.73,0.756,0.73


## Excel

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X['excel'], y['excel'], test_size=0.2)

In [21]:
### ONLY RUN IF THERE IS NO OUTPUT ###
rf = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [50, 100, 250, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10,None],
    'oob_score': [True,False]}

CV_rfc = GridSearchCV(estimator=rf, n_jobs=-1, param_grid=param_grid, verbose=10, scoring='neg_log_loss',cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  3

{'max_depth': 5, 'max_features': 'auto', 'n_estimators': 100, 'oob_score': True} -0.5730808817257358


In [47]:
print(RandomForestClassifier(random_state=0).fit(X_train,
                                           y_train).score(X_test,y_test))


rfc = RandomForestClassifier(random_state = 0, n_estimators = 100,
                      max_depth=5,oob_score=True)
rfc.fit(X_train,y_train)
print(rfc.score(X_test,y_test))

learners = rfc.feature_importances_.argsort()[::-1]

features = pd.DataFrame(X_train.columns[learners], rfc.feature_importances_[learners])
features = features[features.index>0.01]
features



0.6775300171526587
0.7244854202401372


Unnamed: 0,FlightPair
0.063438,xls-ocpsisenabled-
0.053517,xls-wac-
0.052797,xls-licensingisenabled-
0.050381,xls-tabletextcontrastacccheck-
0.048666,xls-keyboardshortcutsdialog-
0.046095,xls-chartfloatingobjectcontrolfallback-
0.045815,xls-mergedecoupled-
0.040933,xls-newcopypasteexperienceacrosssessions-
0.033053,xls-usezonereallocforclonesaveenabled-
0.029782,xls-namedsheetviewspassivetest-


In [23]:
print(LogisticRegression().fit(X_train,y_train).score(X_test,y_test), ' -->...')

for i in [1,2,3,4,5,10,15,20,25,50,75,100]:
    print('Score with Top ',i,' Features:')
    logit = LogisticRegression(solver='liblinear')

    rfe = RFE(logit,i)
    rfe = rfe.fit(X_train,y_train.values.ravel())

    rfe.support_
    rfe.ranking_

    #identified columns Recursive Feature Elimination
    idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                           "columns" : [i for i in X_train.columns],
                           "ranking" : rfe.ranking_,
                          })
    cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


    #separating train and test data
    train_rfe_X = X_train[cols]
    train_rfe_y = y_train.copy()

    test_rfe_X = X_test[cols]
    test_rfe_y = y_test.copy()


    logit_rfe = LogisticRegression(class_weight=None, dual=False, fit_intercept=True,
              intercept_scaling=1, max_iter=100, #n_jobs=4,
              penalty='l2', random_state=None, solver='liblinear', #tol=0.0001,
              verbose=0, warm_start=False)

    logit_rfe.fit(train_rfe_X, train_rfe_y)
    print(logit_rfe.score(test_rfe_X,test_rfe_y))



0.714622641509434  -->...
Score with Top  1  Features:
0.7152658662092625
Score with Top  2  Features:
0.7152658662092625
Score with Top  3  Features:
0.7152658662092625
Score with Top  4  Features:
0.7152658662092625
Score with Top  5  Features:
0.7152658662092625
Score with Top  10  Features:
0.7152658662092625
Score with Top  15  Features:
0.7152658662092625
Score with Top  20  Features:
0.7152658662092625
Score with Top  25  Features:
0.7152658662092625
Score with Top  50  Features:
0.714622641509434
Score with Top  75  Features:
0.714622641509434
Score with Top  100  Features:
0.714622641509434


In [None]:
print(logit.fit(X_train,y_train).score(X_test,y_test), ' -->...')
logit_rfe.score(test_rfe_X,test_rfe_y)

In [48]:
logit = LogisticRegression(solver='liblinear')

rfe = RFE(logit,25)
rfe = rfe.fit(X_train,y_train.values.ravel())

rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : [i for i in X_train.columns],
                       "ranking" : rfe.ranking_,
                      })

cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


In [49]:
cols.extend(features.FlightPair.values.tolist())
cols = list(set(cols))

logit = sm.Logit(y['excel'],X['excel'].loc[:,cols])
flogit = logit.fit()
print(flogit.summary())

coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients.join(flight_durations)

         Current function value: 0.583730
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                23319
Model:                          Logit   Df Residuals:                    23287
Method:                           MLE   Df Model:                           31
Date:                Tue, 12 Nov 2019   Pseudo R-squ.:                0.008407
Time:                        16:58:27   Log-Likelihood:                -13612.
converged:                      False   LL-Null:                       -13727.
                                        LLR p-value:                 2.077e-32
                                                      coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------------
xls-insertdatetimeshortcuts-                       14.7122   1507.978 



Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FlightStart,FlightEnd
xls-freemiumupsellheaderui-,0.382579,0.092857,4.120093,3.787201e-05,0.200583,0.564575,1.466061,1.222115,1.758701,2019-05-01 01:34:07,2019-10-30 23:05:07
xls-ocpsisenabled-,-0.316234,0.159234,-1.985976,0.047036,-0.628327,-0.004142,0.728889,0.533484,0.995867,2019-10-07 19:09:17,2019-10-30 23:40:17
xls-tabletextcontrastacccheck-,0.362393,0.069857,5.187636,2.129807e-07,0.225476,0.499311,1.436764,1.252919,1.647585,2019-05-01 02:10:10,2019-07-15 18:10:07
xls-slrcachecomboboxmenus-,0.463871,0.058488,7.931003,2.173838e-15,0.349236,0.578507,1.590218,1.417984,1.783373,2019-05-01 16:00:05,2019-10-30 23:40:17
xls-newcopypasteexperienceenhancement-,-0.244123,0.09805,-2.489769,0.01278262,-0.436298,-0.051948,0.783392,0.646425,0.949379,2019-06-24 18:59:14,2019-10-30 23:40:17
xls-satoriforthirdpartiesenabled-,0.331966,0.108398,3.062466,0.002195214,0.119509,0.544423,1.393705,1.126944,1.723613,2019-05-01 02:20:05,2019-08-20 12:40:10
xls-namedsheetviewspassivetest-,0.252846,0.048748,5.186802,2.139363e-07,0.157302,0.34839,1.287685,1.170349,1.416785,2019-05-02 14:35:04,2019-10-28 14:20:18
xls-keyboardshortcutsdialog-,0.596113,0.186726,3.192451,0.001410709,0.230137,0.962089,1.815051,1.258773,2.617159,2019-08-12 13:29:08,2019-10-30 23:40:17
xls-slrcachegalleries-,0.396684,0.060121,6.598098,4.164648e-11,0.278849,0.51452,1.486887,1.321608,1.672835,2019-05-31 18:09:09,2019-10-30 23:40:17
xls-ecsocsmocsitimeoutraisesirrecoverableerror-,0.254157,0.059363,4.281404,1.857176e-05,0.137808,0.370506,1.289374,1.147755,1.448468,2019-07-02 06:54:59,2019-08-12 15:19:05


## Word

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X['word'], y['word'], test_size=0.2)

In [None]:
rf = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [50, 100, 250, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10,None],
    'oob_score': [True,False]}

CV_rfc = GridSearchCV(estimator=rf, n_jobs=-1, param_grid=param_grid, verbose=10, scoring='neg_log_loss',cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_score_)

In [None]:
CV_rfc

In [51]:
print(RandomForestClassifier(random_state=0).fit(X_train,
                                           y_train).score(X_test,y_test))
rfc = RandomForestClassifier(random_state = 0, n_estimators = 100,
                      max_depth=5,oob_score=True)

rfc.fit(X_train,y_train)
rfc.score(X_test,y_test)



0.6666666666666666


0.7175815433571997

In [52]:
learners = rfc.feature_importances_.argsort()[::-1]
features = pd.DataFrame(X_train.columns[learners], rfc.feature_importances_[learners])
features = features[features.index>0.01]
features

Unnamed: 0,FlightPair
0.054956,wac-wordaugloopoperations-
0.046067,wac-box4augloopnodeeventsfiringinterval-500ms-
0.044586,wac-wordkeepmaxlastknownheight-
0.041598,wac-box4augloopannotations-
0.039503,wac-ocpsisenabled-
0.031815,wac-wordeditorcashdashisactivecoauth-
0.030587,wac-wordstatefulaugloop-
0.029935,wac-wordreacttaskpaneinfrastructure-
0.026782,wac-box4augloopannotationsattaching-
0.026383,wac-licensingisenabled-


In [58]:
print(LogisticRegression().fit(X_train,y_train).score(X_test,y_test), ' -->...')

for i in [1,2,3,4,5,10,15,20,25,50,75,100]:
    print('Score with Top ',i,' Features:')
    logit = LogisticRegression(solver='liblinear')

    rfe = RFE(logit,i)
    rfe = rfe.fit(X_train,y_train.values.ravel())

    rfe.support_
    rfe.ranking_

    #identified columns Recursive Feature Elimination
    idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                           "columns" : [i for i in X_train.columns],
                           "ranking" : rfe.ranking_,
                          })
    cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


    #separating train and test data
    train_rfe_X = X_train[cols]
    train_rfe_y = y_train.copy()

    test_rfe_X = X_test[cols]
    test_rfe_y = y_test.copy()


    logit_rfe = LogisticRegression(class_weight=None, dual=False, fit_intercept=True,
              intercept_scaling=1, max_iter=100, #n_jobs=4,
              penalty='l2', random_state=None, solver='liblinear', #tol=0.0001,
              verbose=0, warm_start=False)

    logit_rfe.fit(train_rfe_X, train_rfe_y)
    print(logit_rfe.score(test_rfe_X,test_rfe_y))



0.7151949085123309  -->...
Score with Top  1  Features:
0.7175815433571997
Score with Top  2  Features:
0.7175815433571997
Score with Top  3  Features:
0.7175815433571997
Score with Top  4  Features:
0.7175815433571997
Score with Top  5  Features:
0.7175815433571997
Score with Top  10  Features:
0.7175815433571997
Score with Top  15  Features:
0.7175815433571997
Score with Top  20  Features:
0.7175815433571997
Score with Top  25  Features:
0.7175815433571997
Score with Top  50  Features:
0.7167859984089101
Score with Top  75  Features:
0.7165871121718377
Score with Top  100  Features:
0.7165871121718377


In [59]:
logit = LogisticRegression(solver='liblinear')

rfe = RFE(logit,25)
rfe = rfe.fit(X_train,y_train.values.ravel())

rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : [i for i in X_train.columns],
                       "ranking" : rfe.ranking_,
                      })

In [60]:
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()

In [61]:
cols.extend(features.FlightPair.values.tolist())

In [62]:
cols

['wac-box4augloopnodeeventsfiringinterval-500ms-',
 'wac-wordcontextextractionrollout-',
 'wac-worddeletefrominputevent-',
 'wac-worddeprioritizerichtextcontentcontrolacetateforcoauth-',
 'wac-worddownloadcollabfeaturesearlier-',
 'wac-wordeditoratmentionsbookmarkendfix-',
 'wac-wordeditorfirstrunexperiencedatastore-',
 'wac-wordeditorhelptabwhatsnewenabled-',
 'wac-wordeditorserviceverificationloopab-',
 'wac-wordfindsessiononboot-',
 'wac-wordideas-',
 'wac-wordimagegroupcontainerfittopage-',
 'wac-wordkeepmaxlastknownheight-',
 'wac-wordlanguageintelligentsuggestion-',
 'wac-wordlogtasksdetailsfromcpumeter-',
 'wac-wordmergedparagraphwordidmappingimprovements-',
 'wac-wordmergestyletoggleproperties-',
 'wac-wordmixedjustificationparagraphmultiselectjustify-',
 'wac-wordnumberedlistindentation-',
 'wac-wordreactformatpicturetaskpane-',
 'wac-wordstatefulaugloop-',
 'wac-wordupdatedcontextualtabvisuals-',
 'wac-wordupdatedtellmeplaceholder-',
 'wac-wordversionhistoryreactpdf-',
 'wac-

In [63]:
cols = list(set(cols))

logit = sm.Logit(y['word'],X['word'].loc[:,cols])
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients.join(flight_durations)

Optimization terminated successfully.
         Current function value: 0.597685
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                25139
Model:                          Logit   Df Residuals:                    25100
Method:                           MLE   Df Model:                           38
Date:                Tue, 12 Nov 2019   Pseudo R-squ.:              -0.0008320
Time:                        17:21:18   Log-Likelihood:                -15025.
converged:                       True   LL-Null:                       -15013.
                                        LLR p-value:                     1.000
                                                                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------------------------
wac-imagen

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FlightStart,FlightEnd
wac-wordmixedjustificationparagraphmultiselectjustify-,0.583527,0.180165,3.23885,0.001200128,0.23041,0.936643,1.792349,1.259116,2.551403,2019-05-29 12:19:23,2019-09-15 22:30:07
wac-worddeprioritizerichtextcontentcontrolacetateforcoauth-,-0.307274,0.13161,-2.334725,0.01955777,-0.565225,-0.049322,0.735449,0.568232,0.951874,2019-09-27 22:35:22,2019-10-30 22:45:13
wac-wordreacttaskpanewithmlr-,0.229835,0.107599,2.136042,0.03267598,0.018946,0.440724,1.258392,1.019126,1.553832,2019-05-16 13:34:08,2019-09-20 17:39:49
wac-wordeditorfirstrunexperiencedatastore-,-0.1939,0.088409,-2.193216,0.02829181,-0.367179,-0.020622,0.82374,0.692686,0.97959,2019-06-14 02:59:07,2019-08-22 20:29:18
wac-wordideas-,0.258515,0.025913,9.976362,1.934286e-23,0.207727,0.309303,1.295005,1.230877,1.362475,2019-07-14 01:45:06,2019-10-30 23:50:06
wac-wordlogtasksdetailsfromcpumeter-,0.58967,0.194377,3.033633,0.002416281,0.208697,0.970642,1.803393,1.232072,2.639639,2019-06-20 18:59:08,2019-07-05 16:05:08
wac-wordimagegroupcontainerfittopage-,0.505112,0.121122,4.170276,3.042311e-05,0.267717,0.742506,1.65717,1.306977,2.101195,2019-10-09 00:05:40,2019-10-30 23:50:28
wac-wordeditoratmentionsbookmarkendfix-,0.367735,0.112612,3.265517,0.001092643,0.147021,0.58845,1.44446,1.158378,1.801195,2019-10-02 14:34:22,2019-10-30 23:50:28
wac-wordeditorcashdashisactivecoauth-,0.169727,0.062664,2.708538,0.006758045,0.046908,0.292545,1.184981,1.048026,1.339833,2019-09-24 18:20:12,2019-10-30 23:50:28
wac-wordversionhistoryreactpdf-,-0.395999,0.119888,-3.303082,0.0009562858,-0.630974,-0.161023,0.673008,0.532073,0.851272,2019-09-03 18:59:53,2019-10-30 23:30:23


If we filter for just StatSig Predictors from model above, it does not improve Log Likelihood:

In [66]:
logit = sm.Logit(y['word'],X['word'].loc[:,coefficients.index])
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients.join(flight_durations)

Optimization terminated successfully.
         Current function value: 0.598956
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                25139
Model:                          Logit   Df Residuals:                    25123
Method:                           MLE   Df Model:                           15
Date:                Tue, 12 Nov 2019   Pseudo R-squ.:               -0.002959
Time:                        17:24:55   Log-Likelihood:                -15057.
converged:                       True   LL-Null:                       -15013.
                                        LLR p-value:                     1.000
                                                                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------------------------
wac-wordmi

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FlightStart,FlightEnd
wac-wordmixedjustificationparagraphmultiselectjustify-,0.797318,0.109801,7.2615,3.828218e-13,0.582113,1.012524,2.219581,1.789816,2.752539,2019-05-29 12:19:23,2019-09-15 22:30:07
wac-worddeprioritizerichtextcontentcontrolacetateforcoauth-,-0.305205,0.130606,-2.336844,0.01944728,-0.561187,-0.049223,0.736972,0.570531,0.951969,2019-09-27 22:35:22,2019-10-30 22:45:13
wac-wordideas-,0.18973,0.052311,3.62697,0.0002867668,0.087202,0.292257,1.208923,1.091118,1.339447,2019-07-14 01:45:06,2019-10-30 23:50:06
wac-wordlogtasksdetailsfromcpumeter-,0.634619,0.192579,3.295377,0.0009828972,0.257172,1.012066,1.886303,1.293267,2.751279,2019-06-20 18:59:08,2019-07-05 16:05:08
wac-wordimagegroupcontainerfittopage-,0.444843,0.088852,5.006567,5.540946e-07,0.270696,0.618989,1.560245,1.310877,1.85705,2019-10-09 00:05:40,2019-10-30 23:50:28
wac-wordeditoratmentionsbookmarkendfix-,0.409043,0.11023,3.710818,0.0002065902,0.192996,0.625089,1.505376,1.212878,1.868412,2019-10-02 14:34:22,2019-10-30 23:50:28
wac-wordeditorcashdashisactivecoauth-,0.145316,0.052064,2.791103,0.00525287,0.043272,0.247359,1.156405,1.044222,1.280639,2019-09-24 18:20:12,2019-10-30 23:50:28
wac-wordversionhistoryreactpdf-,-0.503149,0.11064,-4.547612,5.425796e-06,-0.720001,-0.286298,0.604623,0.486752,0.751039,2019-09-03 18:59:53,2019-10-30 23:30:23
wac-wordimproveboldexperienceonmacchrome-,-0.532039,0.097025,-5.483501,4.169892e-08,-0.722205,-0.341873,0.587406,0.48568,0.710439,2019-06-24 19:34:36,2019-09-15 10:19:26
wac-wordslrcachecolorpickers-,0.504629,0.034294,14.715008,5.163984e-49,0.437415,0.571843,1.656372,1.548699,1.77153,2019-05-16 00:19:19,2019-10-20 10:14:11


## PowerPoint

In [54]:
print('PowerPoint AllUp MODEL')
logit = sm.Logit(y['pp'],X['pp'])
logit.
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients.join(flight_durations)

PowerPoint AllUp MODEL
Optimization terminated successfully.
         Current function value: 0.567854
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                18610
Model:                          Logit   Df Residuals:                    18592
Method:                           MLE   Df Model:                           17
Date:                Wed, 18 Dec 2019   Pseudo R-squ.:                -0.01950
Time:                        11:17:58   Log-Likelihood:                -10568.
converged:                       True   LL-Null:                       -10366.
                                        LLR p-value:                     1.000
                                                 coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------
ppt-appcommandsquickc

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FlightStart,FlightEnd
ppt-formatpainterapplyremapped-,0.553948,0.045943,12.057366,1.773647e-33,0.463902,0.643994,1.74011,1.590267,1.904071,2019-07-04 02:04:08,2019-09-11 02:49:10
ppt-freemiumupsellheaderui-,-0.175179,0.097833,-1.790595,0.07335833,-0.366928,0.01657,0.839307,0.69286,1.016708,2019-05-01 01:34:07,2019-10-30 23:05:07
ppt-helptabwhatsnewenabled-,0.43941,0.089722,4.897484,9.707166e-07,0.263559,0.615262,1.551792,1.301554,1.850141,2019-05-31 04:59:17,2019-10-30 23:25:11
ppt-linktoslide-,0.210955,0.095621,2.206153,0.0273733,0.023541,0.398369,1.234857,1.02382,1.489393,2019-07-09 22:30:07,2019-10-30 14:30:21
ppt-newcommentdraftingapiisenabled-,0.118622,0.048484,2.446626,0.01442003,0.023595,0.213648,1.125944,1.023876,1.238187,2019-05-03 18:29:05,2019-08-22 20:34:08
ppt-newideas-,0.615018,0.029869,20.590226,3.357974e-94,0.556475,0.673561,1.84969,1.744512,1.961209,2019-05-31 02:04:12,2019-10-30 23:45:13
ppt-pptnewcommentbuttoninpane-,0.790493,0.081474,9.702372,2.9457070000000002e-22,0.630807,0.95018,2.204483,1.879126,2.586174,2019-05-01 00:04:11,2019-07-18 07:46:00
ppt-pptsharedcommentsfluentui-,0.347758,0.043024,8.082956,6.321536e-16,0.263433,0.432083,1.41589,1.30139,1.540463,2019-08-21 14:34:14,2019-10-30 23:45:13
ppt-textformatpainter-,-0.184535,0.088109,-2.094391,0.03622515,-0.357226,-0.011844,0.831491,0.699614,0.988226,2019-05-06 08:05:42,2019-07-11 09:54:30
ppt-wopitokenrefresh-,0.499106,0.069061,7.226994,4.938007e-13,0.363748,0.634463,1.647248,1.438712,1.88601,2019-05-01 00:04:11,2019-10-30 23:24:12


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
rf = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [50, 100, 250, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10,None],
    'oob_score': [True,False]}

CV_rfc = GridSearchCV(estimator=rf, n_jobs=-1, param_grid=param_grid, verbose=10, scoring='neg_log_loss',cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_score_)

In [17]:
coefficients

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB
ppt-formatpainterapplyremapped-,0.553948,0.045943,12.057366,1.773647e-33,0.463902,0.643994,1.74011,1.590267,1.904071
ppt-freemiumupsellheaderui-,-0.175179,0.097833,-1.790595,0.07335833,-0.366928,0.01657,0.839307,0.69286,1.016708
ppt-helptabwhatsnewenabled-,0.43941,0.089722,4.897484,9.707166e-07,0.263559,0.615262,1.551792,1.301554,1.850141
ppt-linktoslide-,0.210955,0.095621,2.206153,0.0273733,0.023541,0.398369,1.234857,1.02382,1.489393
ppt-newcommentdraftingapiisenabled-,0.118622,0.048484,2.446626,0.01442003,0.023595,0.213648,1.125944,1.023876,1.238187
ppt-newideas-,0.615018,0.029869,20.590226,3.357974e-94,0.556475,0.673561,1.84969,1.744512,1.961209
ppt-pptnewcommentbuttoninpane-,0.790493,0.081474,9.702372,2.9457070000000002e-22,0.630807,0.95018,2.204483,1.879126,2.586174
ppt-pptsharedcommentsfluentui-,0.347758,0.043024,8.082956,6.321536e-16,0.263433,0.432083,1.41589,1.30139,1.540463
ppt-textformatpainter-,-0.184535,0.088109,-2.094391,0.03622515,-0.357226,-0.011844,0.831491,0.699614,0.988226
ppt-wopitokenrefresh-,0.499106,0.069061,7.226994,4.938007e-13,0.363748,0.634463,1.647248,1.438712,1.88601


In [20]:
1 - coefficients['Odds Ratio']

ppt-formatpainterapplyremapped-       -0.740110
ppt-freemiumupsellheaderui-            0.160693
ppt-helptabwhatsnewenabled-           -0.551792
ppt-linktoslide-                      -0.234857
ppt-newcommentdraftingapiisenabled-   -0.125944
ppt-newideas-                         -0.849690
ppt-pptnewcommentbuttoninpane-        -1.204483
ppt-pptsharedcommentsfluentui-        -0.415890
ppt-textformatpainter-                 0.168509
ppt-wopitokenrefresh-                 -0.647248
Name: Odds Ratio, dtype: float64

In [34]:
def odd_v_prob(p1,p2):
    '''representing change in probability and corresponding change in odds
    p1: initial probability
    p2: updated probability'''
    p_change = p2-p1
    o1 = p1/(1-p1)
    o2 = p2/(1-p2)
    o_change = o2-o1
    return p_change, o_change

print(odd_v_prob(0.1,0.6))
print(odd_v_prob(0.2,0.7))
print(odd_v_prob(0.3,0.8))

(0.5, 1.3888888888888886)
(0.49999999999999994, 2.083333333333333)
(0.5, 3.571428571428572)


In [35]:
coefficients

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB
ppt-formatpainterapplyremapped-,0.553948,0.045943,12.057366,1.773647e-33,0.463902,0.643994,1.74011,1.590267,1.904071
ppt-freemiumupsellheaderui-,-0.175179,0.097833,-1.790595,0.07335833,-0.366928,0.01657,0.839307,0.69286,1.016708
ppt-helptabwhatsnewenabled-,0.43941,0.089722,4.897484,9.707166e-07,0.263559,0.615262,1.551792,1.301554,1.850141
ppt-linktoslide-,0.210955,0.095621,2.206153,0.0273733,0.023541,0.398369,1.234857,1.02382,1.489393
ppt-newcommentdraftingapiisenabled-,0.118622,0.048484,2.446626,0.01442003,0.023595,0.213648,1.125944,1.023876,1.238187
ppt-newideas-,0.615018,0.029869,20.590226,3.357974e-94,0.556475,0.673561,1.84969,1.744512,1.961209
ppt-pptnewcommentbuttoninpane-,0.790493,0.081474,9.702372,2.9457070000000002e-22,0.630807,0.95018,2.204483,1.879126,2.586174
ppt-pptsharedcommentsfluentui-,0.347758,0.043024,8.082956,6.321536e-16,0.263433,0.432083,1.41589,1.30139,1.540463
ppt-textformatpainter-,-0.184535,0.088109,-2.094391,0.03622515,-0.357226,-0.011844,0.831491,0.699614,0.988226
ppt-wopitokenrefresh-,0.499106,0.069061,7.226994,4.938007e-13,0.363748,0.634463,1.647248,1.438712,1.88601


In [52]:
prob_interpretation = coefficients['Odds Ratio'].round(1)*0.5 #- 0.5

In [53]:
prob_interpretation.mask(prob_interpretation>1,0.99#,0.5
                        )

ppt-formatpainterapplyremapped-        0.85
ppt-freemiumupsellheaderui-            0.40
ppt-helptabwhatsnewenabled-            0.80
ppt-linktoslide-                       0.60
ppt-newcommentdraftingapiisenabled-    0.55
ppt-newideas-                          0.90
ppt-pptnewcommentbuttoninpane-         0.99
ppt-pptsharedcommentsfluentui-         0.70
ppt-textformatpainter-                 0.40
ppt-wopitokenrefresh-                  0.80
Name: Odds Ratio, dtype: float64

## Overall

Flights that appear in each of the three apps

In [42]:
print('SATURATED MODEL')
logit = sm.Logit(y['all'],X['all'])
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients.join(flight_durations)

SATURATED MODEL
Optimization terminated successfully.
         Current function value: 0.593597
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                67068
Model:                          Logit   Df Residuals:                    67034
Method:                           MLE   Df Model:                           33
Date:                Tue, 12 Nov 2019   Pseudo R-squ.:                -0.01688
Time:                        16:45:19   Log-Likelihood:                -39811.
converged:                       True   LL-Null:                       -39150.
                                        LLR p-value:                     1.000
                                                                  coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FlightStart,FlightEnd
wac-minimizeintelligentplaceholderwork-,0.466213,0.131702,3.539903,0.0004002744,0.208081,0.724345,1.593947,1.231313,2.063378,2019-09-03 23:49:07,2019-09-27 11:34:06
wac-wordwordoauthtestappaadtokenretrieval-,0.230984,0.025354,9.110206,8.222533999999999e-20,0.18129,0.280678,1.259839,1.198763,1.324027,2019-06-12 12:04:04,2019-10-30 23:50:06
wac-wordwordoauthtestappmsatokenretrieval-,0.861096,0.152628,5.641779,1.683018e-08,0.56195,1.160243,2.365753,1.754089,3.190707,2019-09-03 18:54:16,2019-10-30 23:50:28
wac-wordimagegroupcontainerfittopage-,0.229404,0.076603,2.994726,0.002746917,0.079265,0.379542,1.25785,1.082492,1.461615,2019-10-09 00:05:40,2019-10-30 23:50:28
wac-wordshowpasteoptionsincontextmenu-,0.309346,0.055141,5.610067,2.022484e-08,0.201271,0.417421,1.362534,1.222956,1.518041,2019-07-08 18:14:05,2019-10-07 10:04:14
wac-worduseaadforfileupload-,0.348401,0.033022,10.55051,5.052222e-26,0.283679,0.413123,1.4168,1.328006,1.511531,2019-07-10 18:59:06,2019-09-10 20:10:08
firstrelease-,0.339599,0.028829,11.77976,4.963447e-32,0.283095,0.396103,1.404385,1.327232,1.486023,2019-05-01 00:04:11,2019-10-30 23:50:28
xls-overridebrowsershortcuts-,0.338301,0.030835,10.971439,5.243130000000001e-28,0.277866,0.398736,1.402563,1.32031,1.489941,2019-08-25 10:09:31,2019-10-30 23:40:17
wac-wordeditorcashdashisactivecoauth-,-0.255894,0.043411,-5.8947,3.753617e-09,-0.340978,-0.17081,0.774224,0.711075,0.842981,2019-09-24 18:20:12,2019-10-30 23:50:28
wac-wordkeepmaxlastknownheight-,0.201369,0.043224,4.658682,3.18241e-06,0.116651,0.286087,1.223076,1.123727,1.331208,2019-09-19 18:34:06,2019-10-30 23:50:28


In [None]:
flight_durations