Objective: Expand NPS Predictive Model based on Flight Control/Treatment pairs to newer datasets and also to more apps than just Excel.

In [2]:
!pip install graphviz

Collecting graphviz
  Downloading https://files.pythonhosted.org/packages/f5/74/dbed754c0abd63768d3a7a7b472da35b08ac442cf87d73d5850a6f32391e/graphviz-0.13.2-py2.py3-none-any.whl
Installing collected packages: graphviz
Successfully installed graphviz-0.13.2


You are using pip version 19.0.3, however version 19.3.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [267]:
def odd_v_prob(p1,p2):
    '''representing change in probability and corresponding change in odds
    p1: initial probability
    p2: updated probability'''
    p_change = p2-p1
    o1 = p1/(1-p1)
    o2 = p2/(1-p2)
    o_change = o2-o1
    return p_change, o_change

print(odd_v_prob(0.1,0.6))
print(odd_v_prob(0.2,0.7))
print(odd_v_prob(0.3,0.8))

(0.5, 1.3888888888888886)
(0.49999999999999994, 2.083333333333333)
(0.5, 3.571428571428572)


In [352]:
import math
import pandas as pd
import numpy as np
import datetime
import sklearn.tree
from graphviz import Source
from IPython.display import SVG
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFE
import os

from sklearn.linear_model import LogisticRegressionCV, LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats

def transform_rating(rating):
    '''Input: Column of Data with NPS Field (on a scale of 1 to 5...)
    Output: Column with either Promoter/Detractor/Passive label, or the corresponding weights, based on datatype'''
    
    if rating == 5:
        return 100
    elif rating <= 3:
        return -100
    elif rating == 4:
        return 0
    else:
        return np.nan

In [353]:
Word_df = pd.read_csv('Word_updated_dataNov.tsv', sep='\t')
Excel_df = pd.read_csv('Excel_updated_dataNov.tsv', sep='\t')
PP_df = pd.read_csv('PowerPoint_updated_dataNov.tsv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [354]:
PP_df.head()

Unnamed: 0,OcvId,ProcessSessionId,Date,LongDate,Platform,Product,Rating,SurveyRatingScale,OriginalText,Verbatim,...,Skus,rn,FlightId,WacSessionId,WACSessionID,Application,ApplicationMode,ApplicationLCID,Host,rn1
0,flnps_v2_b59dd6b09c805661bc7face573961b82,477e30ec-a5ad-4052-af7a-5cbedcc7d96b,10/27/2019 2:04:04 PM,2019-10-27,Web,PowerPoint,5,5,"Facilita a edição, é prático e oferece bons de...","Facilitates editing, is practical and offers g...",...,,1,canary-c,477e30ec-a5ad-4052-af7a-5cbedcc7d96b,477e30ec-a5ad-4052-af7a-5cbedcc7d96b,PowerPoint,Unified,pt-BR,OneDriveWOPI,1
1,flnps_v2_b3397471bc1158b9b359291a1da871ab,481cce98-912d-66ac-366b-81cb3731d61a,6/4/2019 1:34:26 PM,2019-06-04,Web,PowerPoint,3,5,,,...,,1,canary-c,481cce98-912d-66ac-366b-81cb3731d61a,481cce98-912d-66ac-366b-81cb3731d61a,PowerPoint,Unified,en-US,SharePoint Online,1
2,flnps_v2_cb448186b6ad52809d8199a77b9772ae,4fd742a2-1196-486b-9518-68b8530a1fe4,5/4/2019 8:10:28 PM,2019-05-04,Web,PowerPoint,5,5,,,...,,1,canary-t,4fd742a2-1196-486b-9518-68b8530a1fe4,4fd742a2-1196-486b-9518-68b8530a1fe4,PowerPoint,Unified,pt-BR,OneDriveWOPI,1
3,flnps_v2_6722df959f745987b2718d2ae4b17606,20aeb498-8576-4915-a6e1-fa4697445d3c,5/31/2019 7:55:06 PM,2019-05-31,Web,PowerPoint,5,5,,,...,,1,canary-t,20aeb498-8576-4915-a6e1-fa4697445d3c,20aeb498-8576-4915-a6e1-fa4697445d3c,PowerPoint,Unified,en-US,SharePoint Online,1
4,flnps_v2_5eeca61dece15306ac756297f59dccbe,01339fec-c71d-4cd9-a486-22a0c14c5a49,7/1/2019 2:25:29 PM,2019-07-01,Web,PowerPoint,3,5,,,...,,1,afd_ignorewaccluster,01339fec-c71d-4cd9-a486-22a0c14c5a49,01339fec-c71d-4cd9-a486-22a0c14c5a49,PowerPoint,Unified,en-US,OneDriveWOPI,1


In [355]:
def make_AB_dataset(df,prefixes):
    '''df: initial input dataframe of flight data
    prefixes: flights have prefixes - this arg is a list for which prefixes we want to filter on'''
    df = df[df['AudienceGroup']=='Production']
    df['Date'] = pd.to_datetime(df['Date'])
    flights = df.FlightId.astype(str).unique()
    controls = [f for f in flights if f.endswith('-c') or f.__contains__('control')]
    treatments = [f for f in flights if f.endswith('-t') or f.__contains__('treatment')]
    neither = [f for f in flights if (f not in controls) and (f not in treatments)]
    union = {'controls':[c.rstrip('control') for c in controls],'treatments':[t.rstrip('treatments') for t in treatments]}
    no_control = list(set(union['treatments'])-set(union['controls']))
    control_treatment_pairs = list(set(union['treatments'])-set(no_control))
    df['FlightPair']=df['FlightId'].astype(str).map(lambda x: x.rstrip('control'))
    df['FlightPair']=df['FlightPair'].map(lambda x: x.rstrip('treatment'))
    #df['FlightPair'].replace('docowner-canary','canary-docowner',inplace=True)
    
    
    ab_df = df[df.FlightId.notnull()]
    ab_df.drop_duplicates(keep='last', inplace=True)
    print(ab_df.shape, ' before filtering out non-pairs')
    ab_df = ab_df[ab_df['FlightPair'].isin(control_treatment_pairs)]
    print(ab_df.shape, ' after filtering out non-pairs')
    ab_df.loc[ab_df.FlightId.str.endswith('-c'),'Group'] = 'Control'
    ab_df.loc[ab_df.FlightId.str.endswith('control'),'Group'] = 'Control'
    ab_df.loc[ab_df.FlightId.str.endswith('-t'),'Group'] = 'Treatment'
    ab_df.loc[ab_df.FlightId.str.endswith('treatment'),'Group'] = 'Treatment'
    ab_df.loc[ab_df.FlightId.str.endswith('-c'),'Flight'] = 0
    ab_df.loc[ab_df.FlightId.str.endswith('control'),'Flight'] = 0
    ab_df.loc[ab_df.FlightId.str.endswith('-t'),'Flight'] = 1
    ab_df.loc[ab_df.FlightId.str.endswith('treatment'),'Flight'] = 1
    ab_df = ab_df[ab_df.Flight.notnull()]
    ab_df['NPS'] = ab_df['Rating'].apply(transform_rating)
    ab_df = ab_df.sort_values(by='Date')
    value_key = ab_df.sort_values(by='Date').groupby(['OcvId'])['NPS'].last().to_dict()
    exp_df = ab_df.groupby(['OcvId','FlightPair'])['Flight'].last().unstack()
    print('Feature Matrix should have ',ab_df.OcvId.nunique(), ' rows and ',ab_df.FlightPair.nunique(),' columns')
    print('Final Shape:',exp_df.shape)
    if prefixes: #i.e. if the input list is empty:
        for p in prefixes:
            exp_df = exp_df.iloc[:,exp_df.columns.str.startswith(p)]
    exp_df['NPS'] = exp_df.index.map(value_key)
    return exp_df.fillna(0)

#excel_df = make_AB_dataset(Excel_df,['xls'])

In [356]:
####### JUST READ THE CSV IF YOUVE RAN ALREADY #######

def get_flight_durations():
    ''' no inputs/arguments, just make sure you have all of the Tabular Flight data files you are using.'''
    df = pd.concat([Excel_df[Excel_df['AudienceGroup']=='Production'][Excel_df.FlightId.notnull()],
                    Word_df[Word_df['AudienceGroup']=='Production'][Word_df.FlightId.notnull()],
                    PP_df[PP_df['AudienceGroup']=='Production'][PP_df.FlightId.notnull()]])
    #df = df[df['AudienceGroup']=='Production']
    df['Date'] = pd.to_datetime(df['Date'])
    flights = df.FlightId.astype(str).unique()
    controls = [f for f in flights if f.endswith('-c') or f.__contains__('control')]
    treatments = [f for f in flights if f.endswith('-t') or f.__contains__('treatment')]
    neither = [f for f in flights if (f not in controls) and (f not in treatments)]
    union = {'controls':[c.rstrip('control') for c in controls],'treatments':[t.rstrip('treatments') for t in treatments]}
    no_control = list(set(union['treatments'])-set(union['controls']))
    control_treatment_pairs = list(set(union['treatments'])-set(no_control))
    df['FlightPair']=df['FlightId'].astype(str).map(lambda x: x.rstrip('control'))
    df['FlightPair']=df['FlightPair'].map(lambda x: x.rstrip('treatment'))
    
    print('Flight Pairs Assigned')
    
    ab_df = df[df.FlightPair.notnull()]
    #ab_df.drop_duplicates(keep='last', inplace=True)
    ab_df = ab_df[ab_df['FlightPair'].isin(control_treatment_pairs)]
    flight_starts = ab_df.sort_values(by='Date').groupby(['FlightPair'])['Date'].first()
    flight_ends = ab_df.sort_values(by='Date').groupby(['FlightPair'])['Date'].last()
    flight_durations = pd.concat([flight_starts,flight_ends],axis=1)
    flight_durations.columns = ['FlightStart','FlightEnd']
    return flight_durations
    
flight_durations = get_flight_durations()

Flight Pairs Assigned


In [27]:
flight_durations.to_csv('FlightsAsOfNovember.csv')


In [357]:
excel_df = make_AB_dataset(Excel_df,[])
word_df = make_AB_dataset(Word_df,[])
pp_df = make_AB_dataset(PP_df,[])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(1338018, 40)  before filtering out non-pairs
(1012893, 40)  after filtering out non-pairs
Feature Matrix should have  31526  rows and  168  columns
Final Shape: (31526, 168)
(2203599, 40)  before filtering out non-pairs
(1694175, 40)  after filtering out non-pairs
Feature Matrix should have  34222  rows and  285  columns
Final Shape: (34222, 285)
(613317, 40)  before filtering out non-pairs
(370950, 40)  after filtering out non-pairs
Feature Matrix should have  24430  rows and  68  columns
Final Shape: (24430, 68)


In [358]:
excel_flights = list(excel_df.columns)
word_flights = list(word_df.columns)
common_flights = list(set(excel_flights).intersection(word_flights))
pp_flights = list(pp_df.columns)
common_flights = list(set(common_flights).intersection(pp_flights))

In [359]:
common_flights.remove('NPS')

In [360]:
common_flights

['wac-minimizeintelligentplaceholderwork-',
 'wac-oauthtestappmsatokenretrieval-',
 'xls-sheettabbarcontroltouchscroll-',
 'wac-mergemissingeopfix-',
 'wac-worduseaadforfileupload-',
 'firstrelease-',
 'wac-wordkeepmaxlastknownheight-',
 'wac-worduseoauthfullyfordlp-',
 'wac-wordcloneatmentionpropertiesinhyperlink-',
 'canary-user-',
 'xls-overridebrowsershortcuts-',
 'docowneridtestaa-',
 'wac-wordclearformattingtrackchanges-',
 'xls-removescrolltofirstandlastsheettabnavbuttons-',
 'wac-wordeditorcashdashisactivecoauth-',
 'wac-wordsdxpreloadpackagelist-',
 'wac-wordpreserveverbonredirect-',
 'wac-wordshowpasteoptionsincontextmenu-',
 'canary-',
 'canary-docowner-',
 'wac-wordwordoauthtestappaadtokenretrieval-',
 'wac-box4directprinthidesensitiveparameters-',
 'wac-ocpsisenabled-',
 'wac-worddefinenewbullettablecellclassname-',
 'wac-worddeprioritizerichtextcontentcontrolacetateforcoauth-',
 'wac-wordrendervisibleoutlinegroupsperf-',
 'canary2-',
 'wac-licensingisenabled-',
 'wac-word

In [361]:
dc = ['docowner-canary-','canary-docowner-',
     'canary2','canary-']

In [362]:
X = {'excel':excel_df.iloc[:,excel_df.columns.str.startswith('xls')],
     'word':word_df.iloc[:,word_df.columns.str.startswith('wac')],
     'pp':pp_df.iloc[:,pp_df.columns.str.startswith('pp')],
     'all':pd.concat([excel_df,word_df,pp_df]).loc[:,common_flights]}
y = {'excel':excel_df['NPS'].replace([100,0],1).replace(-100,0)
     ,'word':word_df['NPS'].replace([100,0],1).replace(-100,0),
     'pp':pp_df['NPS'].replace([100,0],1).replace(-100,0),
     'all':pd.concat([excel_df,word_df,pp_df])['NPS'].replace([100,0],1).replace(-100,0)}

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [363]:
print(X['excel'].shape, y['excel'].shape)
print(X['word'].shape, y['word'].shape)
print(X['pp'].shape, y['pp'].shape)
print(X['all'].shape, y['all'].shape)

(31526, 126) (31526,)
(34222, 262) (34222,)
(24430, 20) (24430,)
(90178, 38) (90178,)


In [15]:
model_perfs = pd.DataFrame(columns=['Logistic','RandomForest','GradientBoosting'])
for key in ['excel','word','pp','all']:
########## WHEN PREDICTORS ARE BINARY (1 for Treatment, 0 otherwise) ###############
    logit = LogisticRegression().fit(X[key],y[key])
    rf = RandomForestClassifier(random_state=0).fit(X[key],y[key])
    gb = xgb.XGBClassifier(random_state=0, n_jobs=4).fit(X[key],y[key])
    
    log_acc = round(logit.score(X[key],y[key]),3)
    rf_acc = round(rf.score(X[key],y[key]),3)
    gb_acc = round(gb.score(X[key],y[key]),3)

    model_perfs.loc[key+'Accuracy']=[log_acc,rf_acc,gb_acc]



In [16]:
model_perfs

Unnamed: 0,Logistic,RandomForest,GradientBoosting
excelAccuracy,0.703,0.85,0.703
wordAccuracy,0.698,0.967,0.699
ppAccuracy,0.746,0.748,0.746
allAccuracy,0.713,0.753,0.713


## Excel

In [364]:
X_train, X_test, y_train, y_test = train_test_split(X['excel'], y['excel'], test_size=0.2)

In [37]:
### ONLY RUN IF THERE IS NO OUTPUT ###
rf = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [50, 100, 250, 500],
    #'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10,25,None]}
    #'oob_score': [True,False]}

CV_rfc = GridSearchCV(estimator=rf, n_jobs=-1, param_grid=param_grid, verbose=10, scoring='neg_log_loss',cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  5

{'max_depth': 10, 'max_features': 'auto', 'n_estimators': 250, 'oob_score': True} -0.5931484826288238


In [365]:
print(RandomForestClassifier(random_state=0).fit(X_train,
                                           y_train).score(X_test,y_test))


rfc = RandomForestClassifier(random_state = 0, n_estimators = 250,
                      max_depth=10,oob_score=True)
rfc.fit(X_train,y_train)
print(rfc.score(X_test,y_test))

learners = rfc.feature_importances_.argsort()[::-1]

features = pd.DataFrame(X_train.columns[learners], rfc.feature_importances_[learners])
features = features[features.index>0.01]
features



0.6489058039961941
0.703615604186489


Unnamed: 0,FlightPair
0.029021,xls-ocpsisenabled-
0.026197,xls-licensingisenabled-
0.02575,xls-ideas-
0.024667,xls-namedsheetviewspassivetest-
0.023259,xls-wac-
0.02302,xls-usezonereallocforclonesaveenabled-
0.021008,xls-cardviewshowricherrors-
0.020683,xls-ribbonresourcespreloading-
0.019409,xls-newcopypasteexperienceacrosssessions-
0.019058,xls-nativeopenworkbookmeasurements-


In [39]:
print(LogisticRegression().fit(X_train,y_train).score(X_test,y_test), ' -->...')

for i in [1,2,3,4,5,10,15,20,25,50,75,100]:
    print('Score with Top ',i,' Features:')
    logit = LogisticRegression(solver='liblinear')

    rfe = RFE(logit,i)
    rfe = rfe.fit(X_train,y_train.values.ravel())

    rfe.support_
    rfe.ranking_

    #identified columns Recursive Feature Elimination
    idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                           "columns" : [i for i in X_train.columns],
                           "ranking" : rfe.ranking_,
                          })
    cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


    #separating train and test data
    train_rfe_X = X_train[cols]
    train_rfe_y = y_train.copy()

    test_rfe_X = X_test[cols]
    test_rfe_y = y_test.copy()


    logit_rfe = LogisticRegression(class_weight=None, dual=False, fit_intercept=True,
              intercept_scaling=1, max_iter=100, #n_jobs=4,
              penalty='l2', random_state=None, solver='liblinear', #tol=0.0001,
              verbose=0, warm_start=False)

    logit_rfe.fit(train_rfe_X, train_rfe_y)
    print(logit_rfe.score(test_rfe_X,test_rfe_y))



0.7007611798287345  -->...
Score with Top  1  Features:
0.7004440215667618
Score with Top  2  Features:
0.7004440215667618
Score with Top  3  Features:
0.7004440215667618
Score with Top  4  Features:
0.7004440215667618
Score with Top  5  Features:
0.7004440215667618
Score with Top  10  Features:
0.7004440215667618
Score with Top  15  Features:
0.7004440215667618
Score with Top  20  Features:
0.7004440215667618
Score with Top  25  Features:
0.7004440215667618
Score with Top  50  Features:
0.7006026006977482
Score with Top  75  Features:
0.7004440215667618
Score with Top  100  Features:
0.7007611798287345


In [40]:
print(logit.fit(X_train,y_train).score(X_test,y_test), ' -->...')
logit_rfe.score(test_rfe_X,test_rfe_y)

0.7007611798287345  -->...


0.7007611798287345

In [366]:
logit = LogisticRegression(solver='liblinear')

rfe = RFE(logit,5)
rfe = rfe.fit(X_train,y_train.values.ravel())

rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : [i for i in X_train.columns],
                       "ranking" : rfe.ranking_,
                      })

cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()

In [367]:
cols.extend(features.FlightPair.values.tolist())
cols = list(set(cols))

In [247]:
rank_test = (sm.add_constant(X['excel'].loc[:,cols].T.drop_duplicates().T))
for i in range(rank_test.shape[1]):
    df_to_rank = rank_test.iloc[:,:i+1]
    print(i,np.linalg.matrix_rank(df_to_rank))

0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
10 11
11 12
12 13
13 14
14 15
15 16
16 17
17 18
18 19
19 20
20 21
21 22
22 23
23 24
24 25
25 26
26 27
27 28
28 29
29 30
30 31
31 32
32 33
33 34
34 35
35 36


In [248]:
matrix = (X['excel'].loc[:,cols]).corr()==1
linear_dependents = []
for col in matrix.columns:
    #print(col)
    linear_dependents.append(list(matrix[col].index[matrix[col]==True]))
[x for x in linear_dependents if len(x)>1]

[['xls-licensingisenabled-', 'xls-wac-'],
 ['xls-licensingisenabled-', 'xls-wac-']]

In [249]:
matrix = (X['excel'].loc[:,cols]).corr()==-1
linear_combinations = []
for col in matrix.columns:
    #print(col)
    linear_combinations.append(list(matrix[col].index[matrix[col]==True]))
[x for x in linear_combinations if len(x)>1]

[]

In [252]:
X['excel'].loc[:,cols].head(3)

FlightPair,xls-editablelegacycomments-,xls-usenewviewportcalculationsforprefetch-,xls-mergedecoupled-,xls-licensingisenabled-,xls-gridcanvasrendering-,xls-tabletextcontrastacccheck-,xls-workbookcachedigestreset-,xls-xlowatermarkroundtrip-,xls-overridebrowsershortcuts-,xls-openworkbookdecoupled-,...,xls-newcopypasteexperienceacrosssessions-,xls-adaptivecards-,xls-newcommentbuttoninpane-,xls-sharedcommentsfluentui-,xls-helptabwhatsnewenabled-,xls-canvasboldanditalicrendering-,xls-retaintransactiononfailedcommit-,xls-commitandcanceleditbuttonsenabled-,xls-newcommentdraftingapiisenabled-,xls-bidataplatformforivye2o-
OcvId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
flnps_v2_0000d4e32b3b50fe8ed3f50c0e36feb3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
flnps_v2_00017f19ec855c33b3e3ec4c4294a106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
flnps_v2_0002e4096d8557cd84516b46d5a84fdc,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [253]:
sm.add_constant(X['excel']).head(3)

Unnamed: 0_level_0,const,xls-abortablerecalcoperationdecoupled-,xls-abortablerecalcoperationinterrupt-,xls-abortablerecalcoperationstage3decoupledrecalcabortable-,xls-adaptivecards-,xls-applyautomaticpolicylabel-,xls-applymanualpolicylabel-,xls-bidataplatformforcharte2o-,xls-bidataplatformforivye2o-,xls-cacheformattedcell-,...,xls-updaterevisiondontsendknownblobs-,xls-usecobaltstreamingduringdownload-,xls-usenewviewportcalculationsforprefetch-,xls-useractivityhostnotifier-,xls-usezonereallocforclonesaveenabled-,xls-versionhistory-,xls-viewportimprovements-,xls-wac-,xls-workbookcachedigestreset-,xls-xlowatermarkroundtrip-
OcvId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
flnps_v2_0000d4e32b3b50fe8ed3f50c0e36feb3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
flnps_v2_00017f19ec855c33b3e3ec4c4294a106,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
flnps_v2_0002e4096d8557cd84516b46d5a84fdc,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [368]:
logit = sm.Logit(y['excel'],(sm.add_constant(X['excel'].loc[:,cols].T.drop_duplicates().T)))

flogit = logit.fit()
print(flogit.summary())

coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients['FinalProbability'] = np.exp(coefficients['Coef.'].round(1))*y['excel'].value_counts(normalize=True).loc[1] #- 0.5
coefficients['FinalProbability'] = coefficients['FinalProbability'].mask(coefficients['FinalProbability']>=1,0.99)
coefficients['Lift'] = coefficients['FinalProbability'] - y['excel'].value_counts(normalize=True).loc[1]
excel_coefficients = coefficients.join(flight_durations)
excel_coefficients['Type'] = 'Excel'

  return ptp(axis=axis, out=out, **kwargs)


         Current function value: 0.591837
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                31526
Model:                          Logit   Df Residuals:                    31487
Method:                           MLE   Df Model:                           38
Date:                Fri, 10 Jan 2020   Pseudo R-squ.:                 0.02784
Time:                        11:51:20   Log-Likelihood:                -18658.
converged:                      False   LL-Null:                       -19193.
Covariance Type:            nonrobust   LLR p-value:                1.803e-199
                                                                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------------------------
const                                     



In [274]:
y['excel'].value_counts(normalize=True).loc[0]

0.29756391549831884

## Word

In [369]:
X_train, X_test, y_train, y_test = train_test_split(X['word'], y['word'], test_size=0.2)

In [44]:
rf = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [50, 100, 250, 500],
    #'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10,None]}
    #'oob_score': [True,False]}

CV_rfc = GridSearchCV(estimator=rf, n_jobs=-1, param_grid=param_grid, verbose=10, scoring='neg_log_loss',cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   51.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  8

{'max_depth': 10, 'max_features': 'auto', 'n_estimators': 500, 'oob_score': True} -0.5995307756427593


In [45]:
CV_rfc

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=0,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max

In [370]:
print(RandomForestClassifier(random_state=0).fit(X_train,
                                           y_train).score(X_test,y_test))
rfc = RandomForestClassifier(random_state = 0, n_estimators = 500,
                      max_depth=10,oob_score=True)

rfc.fit(X_train,y_train)
rfc.score(X_test,y_test)

learners = rfc.feature_importances_.argsort()[::-1]
features = pd.DataFrame(X_train.columns[learners], rfc.feature_importances_[learners])
features = features[features.index>0.01]
features

In [48]:
print(LogisticRegression().fit(X_train,y_train).score(X_test,y_test), ' -->...')

for i in [1,2,3,4,5,10,15,20,25,50,75,100]:
    print('Score with Top ',i,' Features:')
    logit = LogisticRegression(solver='liblinear')

    rfe = RFE(logit,i)
    rfe = rfe.fit(X_train,y_train.values.ravel())

    rfe.support_
    rfe.ranking_

    #identified columns Recursive Feature Elimination
    idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                           "columns" : [i for i in X_train.columns],
                           "ranking" : rfe.ranking_,
                          })
    cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


    #separating train and test data
    train_rfe_X = X_train[cols]
    train_rfe_y = y_train.copy()

    test_rfe_X = X_test[cols]
    test_rfe_y = y_test.copy()


    logit_rfe = LogisticRegression(class_weight=None, dual=False, fit_intercept=True,
              intercept_scaling=1, max_iter=100, #n_jobs=4,
              penalty='l2', random_state=None, solver='liblinear', #tol=0.0001,
              verbose=0, warm_start=False)

    logit_rfe.fit(train_rfe_X, train_rfe_y)
    print(logit_rfe.score(test_rfe_X,test_rfe_y))



0.695690284879474  -->...
Score with Top  1  Features:
0.6961285609934259
Score with Top  2  Features:
0.6961285609934259
Score with Top  3  Features:
0.6961285609934259
Score with Top  4  Features:
0.6961285609934259
Score with Top  5  Features:
0.6961285609934259
Score with Top  10  Features:
0.6955441928414902
Score with Top  15  Features:
0.6955441928414902
Score with Top  20  Features:
0.6953981008035062
Score with Top  25  Features:
0.6952520087655223
Score with Top  50  Features:
0.695690284879474
Score with Top  75  Features:
0.693644996347699
Score with Top  100  Features:
0.6948137326515705


In [371]:
logit = LogisticRegression(solver='liblinear')

rfe = RFE(logit,5)
rfe = rfe.fit(X_train,y_train.values.ravel())

rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : [i for i in X_train.columns],
                       "ranking" : rfe.ranking_,
                      })

In [382]:
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()

In [383]:
cols.extend(features.FlightPair.values.tolist())

In [384]:
cols

['wac-wordkeepmaxlastknownheight-',
 'wac-wordmergedparagraphwordidmappingimprovements-',
 'wac-wordmultitouchgestureword-',
 'wac-wordreactmultilineribbon-',
 'wac-wordwordcountforlistmarkerfix-',
 'wac-wordaugloopoperations-',
 'wac-box4augloopnodeeventsfiringinterval-500ms-',
 'wac-wordkeepmaxlastknownheight-',
 'wac-box4augloopannotationsattaching-',
 'wac-box4augloopannotations-',
 'wac-wordaugloopserversideseeding-',
 'wac-wordstatefulaugloop-',
 'wac-wordideas-',
 'wac-wordinteractiveperfinertialscrollvisibility-',
 'wac-ocpsisenabled-',
 'wacdash-',
 'wac-wordreplicationintervalrealtimecoauthexperiment-',
 'wac-wordprefetchideasresources-',
 'wac-wordupdatecommanduiperfimprovement-',
 'wac-wordcomboprefetchideasassets-',
 'wac-wordprefetchofficejsbetaresources-']

In [145]:
matrix = X['word'].loc[:,cols].corr()==1
linear_dependents = []
for col in matrix.columns:
    print(col)
    linear_dependents.append(list(matrix[col].index[matrix[col]==True]))

wac-wordreactmultilineribbon-
wac-wordstatefulaugloop-
wac-wordreplicationintervalrealtimecoauthexperiment-
wac-box4augloopnodeeventsfiringinterval-500ms-
wac-wordprefetchideasresources-
wac-wordprefetchofficejsbetaresources-
wac-wordkeepmaxlastknownheight-
wac-wordsetfindreplacepanelocationtoright-
wac-wordideas-
wac-wordinteractiveperfinertialscrollvisibility-
wac-wordmixedjustificationparagraphmultiselectjustify-
wac-box4augloopannotations-
wac-ocpsisenabled-
wac-wordaugloopoperations-
wacdash-
wac-wordaugloopserversideseeding-
wac-wordmergedparagraphwordidmappingimprovements-
wac-wordviewerdullscript-
wac-box4augloopannotationsattaching-


In [146]:
linear_dependents

[['wac-wordreactmultilineribbon-'],
 ['wac-wordstatefulaugloop-',
  'wac-box4augloopannotations-',
  'wac-wordaugloopoperations-',
  'wac-wordaugloopserversideseeding-',
  'wac-box4augloopannotationsattaching-'],
 ['wac-wordreplicationintervalrealtimecoauthexperiment-'],
 ['wac-box4augloopnodeeventsfiringinterval-500ms-'],
 ['wac-wordprefetchideasresources-', 'wac-wordprefetchofficejsbetaresources-'],
 ['wac-wordprefetchideasresources-', 'wac-wordprefetchofficejsbetaresources-'],
 ['wac-wordkeepmaxlastknownheight-'],
 ['wac-wordsetfindreplacepanelocationtoright-'],
 ['wac-wordideas-'],
 ['wac-wordinteractiveperfinertialscrollvisibility-'],
 ['wac-wordmixedjustificationparagraphmultiselectjustify-'],
 ['wac-wordstatefulaugloop-',
  'wac-box4augloopannotations-',
  'wac-wordaugloopoperations-',
  'wac-wordaugloopserversideseeding-',
  'wac-box4augloopannotationsattaching-'],
 ['wac-ocpsisenabled-'],
 ['wac-wordstatefulaugloop-',
  'wac-box4augloopannotations-',
  'wac-wordaugloopoperatio

In [385]:
cols = list(set(cols))

logit = sm.Logit(y['word'],sm.add_constant(X['word'].loc[:,cols].T.drop_duplicates().T))
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients['FinalProbability'] = np.exp(coefficients['Coef.'].round(1))*y['word'].value_counts(normalize=True).loc[1] #- 0.5
coefficients['FinalProbability'] = coefficients['FinalProbability'].mask(coefficients['FinalProbability']>=1,0.99)
coefficients['Lift'] = coefficients['FinalProbability'] - y['word'].value_counts(normalize=True).loc[1]
word_coefficients = coefficients.join(flight_durations)
word_coefficients['Type'] = 'Word'

Optimization terminated successfully.
         Current function value: 0.601910
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                34222
Model:                          Logit   Df Residuals:                    34207
Method:                           MLE   Df Model:                           14
Date:                Fri, 10 Jan 2020   Pseudo R-squ.:                 0.01745
Time:                        12:01:15   Log-Likelihood:                -20599.
converged:                       True   LL-Null:                       -20964.
Covariance Type:            nonrobust   LLR p-value:                4.894e-147
                                                           coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------------------
const                   

In [None]:
y['word'].value_counts(normalize=True)

## PowerPoint

In [388]:
print('PowerPoint AllUp MODEL')
logit = sm.Logit(y['pp'],sm.add_constant(X['pp']))

PowerPoint AllUp MODEL


In [389]:
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients['FinalProbability'] = np.exp(coefficients['Coef.'].round(1))*y['pp'].value_counts(normalize=True).loc[1] #- 0.5
coefficients['FinalProbability'] = coefficients['FinalProbability'].mask(coefficients['FinalProbability']>=1,0.99)
coefficients['Lift'] = coefficients['FinalProbability'] - y['pp'].value_counts(normalize=True).loc[1]
pp_coefficients = coefficients.join(flight_durations)
pp_coefficients['Type'] = 'PowerPoint'

Optimization terminated successfully.
         Current function value: 0.561971
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                24430
Model:                          Logit   Df Residuals:                    24409
Method:                           MLE   Df Model:                           20
Date:                Fri, 10 Jan 2020   Pseudo R-squ.:                0.008460
Time:                        12:01:44   Log-Likelihood:                -13729.
converged:                       True   LL-Null:                       -13846.
Covariance Type:            nonrobust   LLR p-value:                 1.668e-38
                                                 coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------
const                                       

In [178]:
y['pp'].value_counts(normalize=True)

1    0.745927
0    0.254073
Name: NPS, dtype: float64

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X['pp'], y['pp'], test_size=0.1)
rf = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [50, 100, 250, 500],
    #'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10,None]}
    #'oob_score': [True,False]}

CV_rfc = GridSearchCV(estimator=rf, n_jobs=-1, param_grid=param_grid, verbose=10, scoring='neg_log_loss',cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   48.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.5min finished


{'max_depth': 5, 'n_estimators': 500} -0.560280774844516


## Overall

Flights that appear in each of the three apps

In [390]:
logit = sm.Logit(y['all'],X['all'])
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients['FinalProbability'] = np.exp(coefficients['Coef.'].round(1))*y['all'].value_counts(normalize=True).loc[1] #- 0.5
coefficients['FinalProbability'] = coefficients['FinalProbability'].mask(coefficients['FinalProbability']>=1,0.99)
coefficients['Lift'] = coefficients['FinalProbability'] - y['all'].value_counts(normalize=True).loc[1]
all_coefficients = coefficients.join(flight_durations)
all_coefficients['Type'] = 'All'

Optimization terminated successfully.
         Current function value: 0.606848
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                90178
Model:                          Logit   Df Residuals:                    90141
Method:                           MLE   Df Model:                           36
Date:                Fri, 10 Jan 2020   Pseudo R-squ.:                -0.01161
Time:                        12:01:54   Log-Likelihood:                -54724.
converged:                       True   LL-Null:                       -54097.
Covariance Type:            nonrobust   LLR p-value:                     1.000
                                                                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------------------------
wac-minimi

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [177]:
y['all'].value_counts(normalize=True)

1    0.712513
0    0.287487
Name: NPS, dtype: float64

## Scorecards

const                                                                NaN
xls-bidataplatformforcharte2o-                                     461.0
xls-slrcachegalleries-                                           22346.0
xls-tabletextcontrastacccheck-                                    5644.0
xls-slrcachecomboboxmenus-                                       26518.0
xls-ideas-                                                       21947.0
xls-freemiumupsellheaderui-                                       1882.0
xls-canvasclientsideestimation-                                   3517.0
xls-getrevisiondontsendknownblobs-                               10419.0
xls-ocpsisenabled-                                               12282.0
xls-linkedentityblankcellconvertteachingcalloutvideowithtext-     2502.0
xls-sharedcommentsfluentui-                                       8505.0
dtype: float64

In [391]:
# excel_probability = y['excel'].value_counts(normalize=True)
# word_probability = y['word'].value_acounts(normalize=True)
# pp_probability = y['all'].value_counts(normalize=True)
# all_probability = y['all'].value_counts(normalize=True)

scorecard = pd.concat([excel_coefficients, word_coefficients,pp_coefficients,all_coefficients])

In [392]:
excel_coefficients

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FinalProbability,Lift,FlightStart,FlightEnd,Type
const,1.043194,0.045144,23.108056,3.8423450000000003e-118,0.954713,1.131675,2.838268,2.597925,3.100845,0.99,0.287564,NaT,NaT,Excel
xls-bidataplatformforcharte2o-,0.327353,0.114404,2.861365,0.00421821,0.103124,0.551581,1.387291,1.108629,1.735996,0.94819,0.245753,2019-10-14 20:55:10,2019-11-29 23:35:07,Excel
xls-slrcachegalleries-,0.169635,0.046318,3.662376,0.0002498867,0.078853,0.260418,1.184873,1.082045,1.297472,0.857957,0.155521,2019-05-31 18:09:09,2019-11-29 23:49:35,Excel
xls-tabletextcontrastacccheck-,-0.441253,0.072813,-6.060077,1.360567e-09,-0.583964,-0.298542,0.64323,0.557683,0.741899,0.470857,-0.231579,2019-05-01 02:10:10,2019-07-15 18:10:07,Excel
xls-slrcachecomboboxmenus-,0.201834,0.054031,3.735517,0.00018733,0.095935,0.307733,1.223645,1.100687,1.360337,0.857957,0.155521,2019-05-01 16:00:05,2019-11-29 23:49:35,Excel
xls-ideas-,-0.23013,0.03932,-5.852825,4.832923e-09,-0.307195,-0.153065,0.79443,0.735507,0.858074,0.575106,-0.12733,2019-05-01 09:20:20,2019-11-29 23:49:35,Excel
xls-freemiumupsellheaderui-,0.457695,0.064927,7.049376,1.797217e-12,0.330441,0.58495,1.580427,1.391581,1.794901,0.99,0.287564,2019-05-01 01:34:07,2019-11-29 23:49:35,Excel
xls-canvasclientsideestimation-,-0.112535,0.044807,-2.511518,0.01202132,-0.200356,-0.024714,0.893566,0.81844,0.975589,0.63559,-0.066846,2019-09-12 11:14:09,2019-11-11 13:09:14,Excel
xls-getrevisiondontsendknownblobs-,-0.096755,0.04073,-2.375508,0.01752482,-0.176584,-0.016925,0.907779,0.838128,0.983217,0.63559,-0.066846,2019-09-15 08:25:25,2019-11-29 23:49:35,Excel
xls-ocpsisenabled-,-0.292035,0.157896,-1.849541,0.06437977,-0.601504,0.017435,0.746743,0.547987,1.017588,0.520377,-0.182059,2019-10-07 19:09:17,2019-11-29 23:49:35,Excel


In [397]:
X['excel'].sum()[excel_coefficients.index]

const                                                                NaN
xls-bidataplatformforcharte2o-                                     461.0
xls-slrcachegalleries-                                           22346.0
xls-tabletextcontrastacccheck-                                    5644.0
xls-slrcachecomboboxmenus-                                       26518.0
xls-ideas-                                                       21947.0
xls-freemiumupsellheaderui-                                       1882.0
xls-canvasclientsideestimation-                                   3517.0
xls-getrevisiondontsendknownblobs-                               10419.0
xls-ocpsisenabled-                                               12282.0
xls-linkedentityblankcellconvertteachingcalloutvideowithtext-     2502.0
xls-sharedcommentsfluentui-                                       8505.0
dtype: float64

In [393]:
word_coefficients

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FinalProbability,Lift,FlightStart,FlightEnd,Type
const,0.997427,0.031611,31.553071,1.6273060000000001e-218,0.935471,1.059384,2.711298,2.548413,2.884593,0.99,0.292057,NaT,NaT,Word
wac-wordreactmultilineribbon-,0.862211,0.275109,3.134067,0.001724012,0.323007,1.401415,2.368391,1.381275,4.060941,0.99,0.292057,2019-11-20 21:45:28,2019-11-29 22:45:10,Word
wac-wordprefetchideasresources-,-0.199471,0.108374,-1.840583,0.06568266,-0.411881,0.012938,0.819164,0.662403,1.013022,0.571427,-0.126516,2019-10-03 22:55:09,2019-11-29 23:55:09,Word
wac-wordmultitouchgestureword-,0.744261,0.248369,2.996594,0.00273014,0.257467,1.231056,2.104886,1.293649,3.424843,0.99,0.292057,2019-11-20 02:45:10,2019-11-29 22:45:10,Word
wac-wordideas-,0.541939,0.107603,5.036483,4.74164e-07,0.331042,0.752836,1.719337,1.392418,2.123013,0.99,0.292057,2019-07-14 01:45:06,2019-11-29 23:55:09,Word
wac-wordupdatecommanduiperfimprovement-,0.19004,0.03471,5.475019,4.374637e-08,0.122009,0.258072,1.209298,1.129764,1.294431,0.852469,0.154526,2019-05-16 00:19:19,2019-09-27 22:24:20,Word
wac-ocpsisenabled-,-0.125275,0.061912,-2.023428,0.04302903,-0.246621,-0.003929,0.882254,0.781437,0.996078,0.631525,-0.066418,2019-09-17 18:55:16,2019-11-29 23:55:09,Word
wac-wordmergedparagraphwordidmappingimprovements-,0.483389,0.118505,4.079066,4.521693e-05,0.251124,0.715654,1.62156,1.285469,2.045524,0.99,0.292057,2019-05-16 00:39:21,2019-06-12 02:39:11,Word
wac-wordwordcountforlistmarkerfix-,0.36678,0.181476,2.021088,0.04327069,0.011093,0.722467,1.44308,1.011154,2.059507,0.99,0.292057,2019-11-14 10:10:02,2019-11-29 22:35:06,Word


In [398]:
X['word'].sum()[word_coefficients.index]

const                                                    NaN
wac-wordreactmultilineribbon-                          104.0
wac-wordprefetchideasresources-                       7032.0
wac-wordmultitouchgestureword-                         117.0
wac-wordideas-                                        7531.0
wac-wordupdatecommanduiperfimprovement-              11572.0
wac-ocpsisenabled-                                   18065.0
wac-wordmergedparagraphwordidmappingimprovements-      501.0
wac-wordwordcountforlistmarkerfix-                     178.0
dtype: float64

In [394]:
pp_coefficients

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FinalProbability,Lift,FlightStart,FlightEnd,Type
const,1.069978,0.043462,24.618493,8.007448999999999e-134,0.984793,1.155162,2.915314,2.677257,3.174538,0.99,0.244073,NaT,NaT,PowerPoint
ppt-designerdashboardtest-,-0.098038,0.055601,-1.763244,0.07785929,-0.207014,0.010938,0.906614,0.813008,1.010998,0.674943,-0.070984,2019-05-01 00:04:11,2019-08-08 21:10:18,PowerPoint
ppt-formatpainterapplyremapped-,0.107568,0.049904,2.155492,0.03112339,0.009758,0.205378,1.113566,1.009805,1.227989,0.824377,0.07845,2019-07-04 02:04:08,2019-09-11 02:49:10,PowerPoint
ppt-freemiumupsellheaderui-,0.175385,0.079512,2.205754,0.02740126,0.019543,0.331226,1.191705,1.019736,1.392675,0.911077,0.16515,2019-05-01 01:34:07,2019-11-29 23:49:35,PowerPoint
ppt-helptabwhatsnewenabled-,0.230437,0.065924,3.495521,0.0004731374,0.101229,0.359645,1.259151,1.10653,1.432821,0.911077,0.16515,2019-05-31 04:59:17,2019-11-26 14:50:37,PowerPoint
ppt-pasteslidekeepsourceformattingdefault-,-0.141894,0.055127,-2.573968,0.01005394,-0.24994,-0.033848,0.867713,0.778847,0.966718,0.674943,-0.070984,2019-09-17 17:55:11,2019-11-29 23:19:24,PowerPoint
ppt-pptsharedcommentsfluentui-,-0.107516,0.041603,-2.584299,0.009757728,-0.189057,-0.025974,0.898062,0.827739,0.97436,0.674943,-0.070984,2019-08-21 14:34:14,2019-11-14 10:26:02,PowerPoint
ppt-slideeditforaugmentationloop-,-0.129785,0.078598,-1.651248,0.09868805,-0.283834,0.024265,0.878284,0.752892,1.024561,0.674943,-0.070984,2019-11-14 12:10:05,2019-11-29 23:14:06,PowerPoint
ppt-wopitokenrefresh-,0.23031,0.059765,3.853586,0.0001164005,0.113173,0.347448,1.258991,1.119825,1.415451,0.911077,0.16515,2019-05-01 00:04:11,2019-11-29 23:19:24,PowerPoint


In [399]:
X['pp'].sum()[pp_coefficients.index]

const                                            NaN
ppt-designerdashboardtest-                    2993.0
ppt-formatpainterapplyremapped-               3948.0
ppt-freemiumupsellheaderui-                   1349.0
ppt-helptabwhatsnewenabled-                   6577.0
ppt-pasteslidekeepsourceformattingdefault-    7919.0
ppt-pptsharedcommentsfluentui-                7356.0
ppt-slideeditforaugmentationloop-             1932.0
ppt-wopitokenrefresh-                         2722.0
dtype: float64

In [131]:
all_coefficients

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FinalProbability,FlightStart,FlightEnd,Type
wac-minimizeintelligentplaceholderwork-,0.189765,0.071485,2.654632,0.007939498,0.049658,0.329872,1.208966,1.050912,1.39079,0.6,2019-09-03 23:49:07,2019-11-29 23:49:35,All
wac-mergemissingeopfix-,0.37864,0.104666,3.617608,0.000297338,0.173499,0.583781,1.460297,1.189459,1.792804,0.75,2019-09-17 00:04:10,2019-10-01 17:50:08,All
wac-worduseaadforfileupload-,0.396845,0.032771,12.109807,9.371411e-34,0.332616,0.461074,1.487125,1.394612,1.585776,0.75,2019-07-10 18:59:06,2019-09-10 20:10:08,All
firstrelease-,0.299517,0.024576,12.187616,3.618516e-34,0.25135,0.347685,1.349208,1.28576,1.415786,0.65,2019-05-01 00:04:11,2019-11-29 23:49:35,All
wac-wordkeepmaxlastknownheight-,0.264643,0.041748,6.339033,2.312111e-10,0.182818,0.346468,1.302966,1.200596,1.414065,0.65,2019-09-19 18:34:06,2019-11-29 23:55:09,All
xls-overridebrowsershortcuts-,0.29976,0.026566,11.283557,1.582175e-29,0.247691,0.351828,1.349535,1.281064,1.421664,0.65,2019-08-25 10:09:31,2019-11-29 23:49:35,All
docowneridtestaa-,0.192869,0.062046,3.1085,0.001880395,0.071262,0.314476,1.212724,1.073862,1.369542,0.6,2019-05-01 00:04:11,2019-07-15 18:10:07,All
wac-wordclearformattingtrackchanges-,-0.103735,0.051618,-2.009685,0.04446456,-0.204904,-0.002566,0.901464,0.814725,0.997437,0.45,2019-06-28 15:49:09,2019-11-29 23:04:09,All
wac-wordeditorcashdashisactivecoauth-,-0.246928,0.040389,-6.113753,9.731512e-10,-0.326089,-0.167767,0.781197,0.721741,0.845551,0.4,2019-09-24 18:20:12,2019-11-29 23:55:09,All
wac-wordsdxpreloadpackagelist-,0.156556,0.024586,6.367778,1.917857e-10,0.108369,0.204744,1.169477,1.114459,1.22721,0.6,2019-09-09 18:09:56,2019-11-11 13:09:14,All


In [322]:
scorecard.drop('const')

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FinalProbability,DetractorLift,FlightStart,FlightEnd,Type
xls-slrcachegalleries-,0.245994,0.042502,5.78788,7.128035e-09,0.162692,0.329295,1.278892,1.176675,1.389988,0.99,0.692436,2019-05-31 18:09:09,2019-11-29 23:49:35,Excel
xls-reconnectsessiononuserinteration-,-0.696395,0.210108,-3.31446,0.0009182046,-1.1082,-0.284591,0.498379,0.330153,0.752322,0.4906,0.193036,2019-08-04 12:29:06,2019-08-15 10:15:38,Excel
xls-tabletextcontrastacccheck-,-0.451085,0.073028,-6.17686,6.538915e-10,-0.594218,-0.307952,0.636937,0.551994,0.73495,0.542197,0.244633,2019-05-01 02:10:10,2019-07-15 18:10:07,Excel
xls-ideas-,-0.193799,0.038094,-5.087436,3.629375e-07,-0.268462,-0.119137,0.823823,0.764555,0.887686,0.662241,0.364677,2019-05-01 09:20:20,2019-11-29 23:49:35,Excel
xls-freemiumupsellheaderui-,0.438097,0.064732,6.76781,1.307467e-11,0.311224,0.56497,1.549755,1.365094,1.759395,0.99,0.692436,2019-05-01 01:34:07,2019-11-29 23:49:35,Excel
xls-chartfloatingobjectcontrolfallback-,0.177263,0.088836,1.995391,0.04600024,0.003147,0.35138,1.193946,1.003152,1.421027,0.987947,0.690383,2019-06-04 12:25:32,2019-11-29 23:49:35,Excel
xls-canvasclientsideestimation-,-0.122344,0.045088,-2.71344,0.006658859,-0.210716,-0.033973,0.884844,0.810004,0.966598,0.731889,0.434325,2019-09-12 11:14:09,2019-11-11 13:09:14,Excel
xls-getrevisiondontsendknownblobs-,-0.107401,0.040918,-2.62476,0.008671017,-0.187599,-0.027202,0.898166,0.828947,0.973164,0.731889,0.434325,2019-09-15 08:25:25,2019-11-29 23:49:35,Excel
xls-ocpsisenabled-,-0.296517,0.158104,-1.875457,0.06072982,-0.606395,0.013361,0.743403,0.545313,1.01345,0.59922,0.301656,2019-10-07 19:09:17,2019-11-29 23:49:35,Excel
xls-earlysessionstartforview-,-0.098699,0.055179,-1.788689,0.07366499,-0.206848,0.009451,0.906016,0.813143,1.009496,0.731889,0.434325,2019-07-07 12:25:22,2019-11-29 23:44:18,Excel
