Objective: Expand NPS Predictive Model based on Flight Control/Treatment pairs to newer datasets and also to more apps than just Excel.

In [1]:
import math
import pandas as pd
import numpy as np
import datetime
import sklearn.tree
from graphviz import Source
from IPython.display import SVG
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFE
import os

from sklearn.linear_model import LogisticRegressionCV, LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats

def transform_rating(rating):
    '''Input: Column of Data with NPS Field (on a scale of 1 to 5...)
    Output: Column with either Promoter/Detractor/Passive label, or the corresponding weights, based on datatype'''
    
    if rating == 5:
        return 100
    elif rating <= 3:
        return -100
    elif rating == 4:
        return 0
    else:
        return np.nan

In [2]:
Word_df = pd.read_csv('Word_updated_data.tsv', sep='\t')

Excel_df = pd.read_csv('Excel_updated_data.tsv', sep='\t')
PP_df = pd.read_csv('PowerPoint_updated_data.tsv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
def make_AB_dataset(df,prefixes):
    '''df: initial input dataframe of flight data
    prefixes: flights have prefixes - this arg is a list for which prefixes we want to filter on'''
    df = df[df['AudienceGroup']=='Production']
    df['Date'] = pd.to_datetime(df['Date'])
    flights = df.FlightId.astype(str).unique()
    controls = [f for f in flights if f.endswith('-c') or f.__contains__('control')]
    treatments = [f for f in flights if f.endswith('-t') or f.__contains__('treatment')]
    neither = [f for f in flights if (f not in controls) and (f not in treatments)]
    union = {'controls':[c.rstrip('control') for c in controls],'treatments':[t.rstrip('treatments') for t in treatments]}
    no_control = list(set(union['treatments'])-set(union['controls']))
    control_treatment_pairs = list(set(union['treatments'])-set(no_control))
    df['FlightPair']=df['FlightId'].astype(str).map(lambda x: x.rstrip('control'))
    df['FlightPair']=df['FlightPair'].map(lambda x: x.rstrip('treatment'))
    df['FlightPair'].replace('docowner-canary','canary-docowner',inplace=True)
    
    
    ab_df = df[df.FlightId.notnull()]
    ab_df.drop_duplicates(keep='last', inplace=True)
    print(ab_df.shape, ' before filtering out non-pairs')
    ab_df = ab_df[ab_df['FlightPair'].isin(control_treatment_pairs)]
    print(ab_df.shape, ' after filtering out non-pairs')
    ab_df.loc[ab_df.FlightId.str.endswith('-c'),'Group'] = 'Control'
    ab_df.loc[ab_df.FlightId.str.endswith('control'),'Group'] = 'Control'
    ab_df.loc[ab_df.FlightId.str.endswith('-t'),'Group'] = 'Treatment'
    ab_df.loc[ab_df.FlightId.str.endswith('treatment'),'Group'] = 'Treatment'
    ab_df.loc[ab_df.FlightId.str.endswith('-c'),'Flight'] = 0
    ab_df.loc[ab_df.FlightId.str.endswith('control'),'Flight'] = 0
    ab_df.loc[ab_df.FlightId.str.endswith('-t'),'Flight'] = 1
    ab_df.loc[ab_df.FlightId.str.endswith('treatment'),'Flight'] = 1
    ab_df = ab_df[ab_df.Flight.notnull()]
    ab_df['NPS'] = ab_df['Rating'].apply(transform_rating)
    ab_df = ab_df.sort_values(by='Date')
    value_key = ab_df.sort_values(by='Date').groupby(['OcvId'])['NPS'].last().to_dict()
    exp_df = ab_df.groupby(['OcvId','FlightPair'])['Flight'].last().unstack()
    print('Feature Matrix should have ',ab_df.OcvId.nunique(), ' rows and ',ab_df.FlightPair.nunique(),' columns')
    print('Final Shape:',exp_df.shape)
    if prefixes: #i.e. if the input list is empty:
        for p in prefixes:
            exp_df = exp_df.iloc[:,exp_df.columns.str.startswith(p)]
    exp_df['NPS'] = exp_df.index.map(value_key)
    return exp_df.fillna(0)

#excel_df = make_AB_dataset(Excel_df,['xls'])

In [3]:
def get_flight_durations():
    ''' no inputs/arguments, just make sure you have all of the Tabular Flight data files you are using.'''
    df = pd.concat([Excel_df[Excel_df['AudienceGroup']=='Production'][Excel_df.FlightId.notnull()],
                    Word_df[Word_df['AudienceGroup']=='Production'][Word_df.FlightId.notnull()],
                    PP_df[PP_df['AudienceGroup']=='Production'][PP_df.FlightId.notnull()]])
    #df = df[df['AudienceGroup']=='Production']
    df['Date'] = pd.to_datetime(df['Date'])
    flights = df.FlightId.astype(str).unique()
    controls = [f for f in flights if f.endswith('-c') or f.__contains__('control')]
    treatments = [f for f in flights if f.endswith('-t') or f.__contains__('treatment')]
    neither = [f for f in flights if (f not in controls) and (f not in treatments)]
    union = {'controls':[c.rstrip('control') for c in controls],'treatments':[t.rstrip('treatments') for t in treatments]}
    no_control = list(set(union['treatments'])-set(union['controls']))
    control_treatment_pairs = list(set(union['treatments'])-set(no_control))
    df['FlightPair']=df['FlightId'].astype(str).map(lambda x: x.rstrip('control'))
    df['FlightPair']=df['FlightPair'].map(lambda x: x.rstrip('treatment'))
    
    print('Flight Pairs Assigned')
    
    ab_df = df[df.FlightPair.notnull()]
    #ab_df.drop_duplicates(keep='last', inplace=True)
    ab_df = ab_df[ab_df['FlightPair'].isin(control_treatment_pairs)]
    flight_starts = ab_df.sort_values(by='Date').groupby(['FlightPair'])['Date'].first()
    flight_ends = ab_df.sort_values(by='Date').groupby(['FlightPair'])['Date'].last()
    flight_durations = pd.concat([flight_starts,flight_ends],axis=1)
    flight_durations.columns = ['FlightStart','FlightEnd']
    return flight_durations
    
flight_durations = get_flight_durations()

Flight Pairs Assigned


In [None]:
flight_durations.to_csv('FlightsSeptember.csv')

In [10]:
excel_df = make_AB_dataset(Excel_df,[])
word_df = make_AB_dataset(Word_df,[])
pp_df = make_AB_dataset(PP_df,[])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(559722, 40)  before filtering out non-pairs
(367873, 40)  after filtering out non-pairs
Feature Matrix should have  16624  rows and  119  columns
Final Shape: (16624, 119)
(980393, 40)  before filtering out non-pairs
(637576, 40)  after filtering out non-pairs
Feature Matrix should have  17466  rows and  193  columns
Final Shape: (17466, 193)
(275253, 40)  before filtering out non-pairs
(140480, 40)  after filtering out non-pairs
Feature Matrix should have  12963  rows and  48  columns
Final Shape: (12963, 48)


In [66]:
excel_flights = list(excel_df.columns)
word_flights = list(word_df.columns)
common_flights = list(set(excel_flights).intersection(word_flights))
pp_flights = list(pp_df.columns)
common_flights = list(set(common_flights).intersection(pp_flights))

In [69]:
common_flights.remove('NPS')

In [70]:
common_flights

['docowner-canary-',
 'wac-wordclearformattingtrackchanges-',
 'wac-box4directprinthidesensitiveparameters-',
 'xls-overridebrowsershortcuts-',
 'wac-wordeditorcashdashisactivecoauth-',
 'wac-wordshowpasteoptionsincontextmenu-',
 'canary-docowner-',
 'canary2-',
 'wac-worduseaadforfileupload-',
 'wac-worddeprioritizerichtextcontentcontrolacetateforcoauth-',
 'wacdash-',
 'wac-wordsdxpreloadpackagelist-',
 'firstrelease-',
 'wac-wordpreserveverbonredirect-',
 'wac-wordwordoauthtestappmsatokenretrieval-',
 'xls-removescrolltofirstandlastsheettabnavbuttons-',
 'docowneridtestaa-',
 'wac-owaunifiedapp-',
 'useridtestaa-',
 'canary-',
 'wac-mergemissingeopfix-',
 'xls-customsortforallrangetypes-',
 'wac-minimizeintelligentplaceholderwork-',
 'wac-wordwordoauthtestappaadtokenretrieval-',
 'wac-wordkeepmaxlastknownheight-']

In [1]:
dc = ['docowner-canary-','canary-docowner-',
     'canary2','canary-']

NameError: name 'common_flights' is not defined

In [82]:
X = {'excel':excel_df.iloc[:,excel_df.columns.str.startswith('xls')],
     'word':word_df.iloc[:,word_df.columns.str.startswith('wac')],
     'pp':pp_df.iloc[:,pp_df.columns.str.startswith('pp')],
     'all':pd.concat([excel_df,word_df,pp_df]).loc[:,common_flights]}
y = {'excel':excel_df['NPS'].replace([100,0],1).replace(-100,0)
     ,'word':word_df['NPS'].replace([100,0],1).replace(-100,0),
     'pp':pp_df['NPS'].replace([100,0],1).replace(-100,0),
     'all':pd.concat([excel_df,word_df,pp_df])['NPS'].replace([100,0],1).replace(-100,0)}

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [84]:
print(X['excel'].shape, y['excel'].shape)
print(X['word'].shape, y['word'].shape)
print(X['pp'].shape, y['pp'].shape)
print(X['all'].shape, y['all'].shape)

(16624, 90) (16624,)
(17466, 177) (17466,)
(12963, 14) (12963,)
(47053, 25) (47053,)


In [106]:
model_perfs = pd.DataFrame(columns=['Logistic','RandomForest','GradientBoosting'])
for key in ['excel','word','pp','all']:
################################################ WHEN PREDICTORS ARE BINARY (1 for Treatment, 0 otherwise)
    logit = LogisticRegression().fit(X[key],y[key])
    rf = RandomForestClassifier(random_state=0).fit(X[key],y[key])
    gb = xgb.XGBClassifier(random_state=0, n_jobs=4).fit(X[key],y[key])
    
    log_acc = round(logit.score(X[key],y[key]),3)
    rf_acc = round(rf.score(X[key],y[key]),3)
    gb_acc = round(gb.score(X[key],y[key]),3)
    
    log_acc = round(logit.score(X[key],y[key]),3)
    rf_acc = round(rf.score(X[key],y[key]),3)
    gb_acc = round(gb.score(X[key],y[key]),3)


    model_perfs.loc[key+'Accuracy']=[log_acc,rf_acc,gb_acc]



In [107]:
model_perfs

Unnamed: 0,Logistic,RandomForest,GradientBoosting
excelAccuracy,0.743,0.836,0.743
wordAccuracy,0.75,0.962,0.75
ppAccuracy,0.77,0.771,0.77
allAccuracy,0.753,0.762,0.753


## Excel

In [218]:
X_train, X_test, y_train, y_test = train_test_split(X['excel'], y['excel'], test_size=0.1)

In [112]:
### ONLY RUN IF THERE IS NO OUTPUT ###
rf = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [50, 100, 250, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10,None],
    'oob_score': [True,False]}

CV_rfc = GridSearchCV(estimator=rf, n_jobs=-1, param_grid=param_grid, verbose=10, scoring='neg_log_loss',cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  3

{'max_depth': 5, 'max_features': 'auto', 'n_estimators': 100, 'oob_score': True} -0.559152758912611


In [219]:
print(RandomForestClassifier(random_state=0).fit(X_train,
                                           y_train).score(X_test,y_test))


rfc = RandomForestClassifier(random_state = 0, n_estimators = 100,
                      max_depth=5,oob_score=True)
rfc.fit(X_train,y_train)
print(rfc.score(X_test,y_test))

learners = rfc.feature_importances_.argsort()[::-1]

features = pd.DataFrame(X_train.columns[learners], rfc.feature_importances_[learners])
features = features[features.index>0.025]
features



0.7017438364401684
0.7432351172579675


Unnamed: 0,FlightPair
0.106023,xls-tabletextcontrastacccheck-
0.069156,xls-newcommentbuttoninpane-
0.049805,xls-slrcachecomboboxmenus-
0.045594,xls-slrcachegalleries-
0.044594,xls-namedsheetviewspassivetest-
0.03908,xls-ideas-
0.03694,xls-satoriforthirdpartiesenabled-
0.035142,xls-listfieldlookupoptimized-
0.030356,xls-workbookcachedigestreset-
0.029938,xls-helptabwhatsnewenabled-


In [221]:
print(LogisticRegression().fit(X_train,y_train).score(X_test,y_test), ' -->...')

for i in [1,2,3,4,5,10,15,20,25,50,75,100]:
    print('Score with Top ',i,' Features:')
    logit = LogisticRegression(solver='liblinear')

    rfe = RFE(logit,i)
    rfe = rfe.fit(X_train,y_train.values.ravel())

    rfe.support_
    rfe.ranking_

    #identified columns Recursive Feature Elimination
    idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                           "columns" : [i for i in X_train.columns],
                           "ranking" : rfe.ranking_,
                          })
    cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


    #separating train and test data
    train_rfe_X = X_train[cols]
    train_rfe_y = y_train.copy()

    test_rfe_X = X_test[cols]
    test_rfe_y = y_test.copy()


    logit_rfe = LogisticRegression(class_weight=None, dual=False, fit_intercept=True,
              intercept_scaling=1, max_iter=100, #n_jobs=4,
              penalty='l2', random_state=None, solver='liblinear', #tol=0.0001,
              verbose=0, warm_start=False)

    logit_rfe.fit(train_rfe_X, train_rfe_y)
    print(logit_rfe.score(test_rfe_X,test_rfe_y))



0.7414311485267588  -->...
Score with Top  1  Features:
0.7432351172579675
Score with Top  2  Features:
0.7432351172579675
Score with Top  3  Features:
0.7432351172579675
Score with Top  4  Features:
0.7432351172579675
Score with Top  5  Features:
0.7432351172579675
Score with Top  10  Features:
0.7432351172579675
Score with Top  15  Features:
0.7432351172579675
Score with Top  20  Features:
0.7432351172579675
Score with Top  25  Features:
0.7432351172579675
Score with Top  50  Features:
0.7432351172579675
Score with Top  75  Features:
0.7420324714371618
Score with Top  100  Features:
0.7414311485267588


In [222]:
print(logit.fit(X_train,y_train).score(X_test,y_test), ' -->...')
logit_rfe.score(test_rfe_X,test_rfe_y)

0.7414311485267588  -->...


0.7414311485267588

In [225]:
logit = LogisticRegression(solver='liblinear')

rfe = RFE(logit,5)
rfe = rfe.fit(X_train,y_train.values.ravel())

rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : [i for i in X_train.columns],
                       "ranking" : rfe.ranking_,
                      })

cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


In [226]:
cols.extend(features.FlightPair.values.tolist())
cols = list(set(cols))

logit = sm.Logit(y['excel'],X['excel'].loc[:,cols])
flogit = logit.fit()
print(flogit.summary())

coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients.join(flight_durations)

         Current function value: 0.569723
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                16624
Model:                          Logit   Df Residuals:                    16609
Method:                           MLE   Df Model:                           14
Date:                Thu, 17 Oct 2019   Pseudo R-squ.:               0.0008208
Time:                        15:17:15   Log-Likelihood:                -9471.1
converged:                      False   LL-Null:                       -9478.9
                                        LLR p-value:                    0.3409
                                                        coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------------
xls-slrcachegalleries-                                0.2894      



Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FlightStart,FlightEnd
xls-slrcachegalleries-,0.289394,0.054687,5.29178,1.211313e-07,0.182209,0.39658,1.335618,1.199865,1.486731,2019-05-31 18:09:09,2019-09-29 23:55:07
xls-tabletextcontrastacccheck-,0.297222,0.070213,4.233168,2.304221e-05,0.159608,0.434836,1.346114,1.173051,1.54471,2019-05-01 02:10:10,2019-07-15 18:10:07
xls-reconnectsessiononuserinteration-,-0.692824,0.209844,-3.301622,0.0009612758,-1.10411,-0.281538,0.500161,0.331506,0.754622,2019-08-04 12:29:06,2019-08-15 10:15:38
xls-freemiumupsellheaderui-,0.587166,0.124295,4.723966,2.312895e-06,0.343552,0.83078,1.798883,1.409947,2.295108,2019-05-01 01:34:07,2019-09-29 23:55:04
xls-slrcachecomboboxmenus-,0.347177,0.06481,5.356834,8.469274e-08,0.220151,0.474202,1.415067,1.246265,1.606731,2019-05-01 16:00:05,2019-09-29 23:55:07
xls-namedsheetviewspassivetest-,0.177923,0.057159,3.112784,0.001853317,0.065894,0.289953,1.194734,1.068113,1.336365,2019-05-02 14:35:04,2019-09-29 23:55:07
xls-chartfloatingobjectcontrolfallback-,0.153802,0.070488,2.181979,0.02911112,0.015649,0.291956,1.16626,1.015772,1.339044,2019-06-04 12:25:32,2019-09-29 23:55:07
xls-listfieldlookupoptimized-,0.216439,0.065221,3.318559,0.0009048324,0.088609,0.344269,1.241647,1.092653,1.410959,2019-05-22 22:49:04,2019-09-29 23:55:07
xls-ideas-,0.145126,0.057737,2.513564,0.0119518,0.031963,0.258289,1.156185,1.03248,1.294713,2019-05-01 09:20:20,2019-09-29 23:55:07
xls-helptabwhatsnewenabled-,0.217469,0.070019,3.10585,0.001897328,0.080234,0.354704,1.242927,1.083541,1.425758,2019-05-21 19:29:36,2019-09-29 23:55:07


## Word

In [228]:
X_train, X_test, y_train, y_test = train_test_split(X['word'], y['word'], test_size=0.1)

In [201]:
rf = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [50, 100, 250, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10,None],
    'oob_score': [True,False]}

CV_rfc = GridSearchCV(estimator=rf, n_jobs=-1, param_grid=param_grid, verbose=10, scoring='neg_log_loss',cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_score_)

print(RandomForestClassifier(random_state=0).fit(X_train,
                                           y_train).score(X_test,y_test))
rfc = RandomForestClassifier(random_state = 0, n_estimators = 500,
                      max_depth=5,oob_score=True)

rfc.fit(X_train,y_train)
rfc.score(X_test,y_test)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   59.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  2

{'max_depth': 5, 'max_features': 'auto', 'n_estimators': 100, 'oob_score': True} -0.5600213224138181




0.6994848311390955


0.7584430452203778

In [235]:
learners = rfc.feature_importances_.argsort()[::-1]
features = pd.DataFrame(X_train.columns[learners], rfc.feature_importances_[learners])
features = features[features.index>0.025]
features

Unnamed: 0,FlightPair
0.106023,wac-wordideasexperience-
0.069156,wac-wordcopypasteparagraphcharacterstyleid-
0.049805,wac-wordhometabownoverflow-
0.045594,wac-wordideas-
0.044594,wac-wordconditionalinsideborderfix-
0.03908,wac-wordappnamerebrand-
0.03694,wac-wordeditortableofcontentcreateanddelete-
0.035142,wac-wordbrowsershowserrorwhenwaitingonlongboot-
0.030356,wac-wordinteractiveperfoverlaycontrolrenderopt...
0.029938,wac-wordallowcontenteditabledragoutsidewindow-


In [229]:
print(LogisticRegression().fit(X_train,y_train).score(X_test,y_test), ' -->...')

for i in [1,2,3,4,5,10,15,20,25,50,75,100]:
    print('Score with Top ',i,' Features:')
    logit = LogisticRegression(solver='liblinear')

    rfe = RFE(logit,i)
    rfe = rfe.fit(X_train,y_train.values.ravel())

    rfe.support_
    rfe.ranking_

    #identified columns Recursive Feature Elimination
    idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                           "columns" : [i for i in X_train.columns],
                           "ranking" : rfe.ranking_,
                          })
    cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


    #separating train and test data
    train_rfe_X = X_train[cols]
    train_rfe_y = y_train.copy()

    test_rfe_X = X_test[cols]
    test_rfe_y = y_test.copy()


    logit_rfe = LogisticRegression(class_weight=None, dual=False, fit_intercept=True,
              intercept_scaling=1, max_iter=100, #n_jobs=4,
              penalty='l2', random_state=None, solver='liblinear', #tol=0.0001,
              verbose=0, warm_start=False)

    logit_rfe.fit(train_rfe_X, train_rfe_y)
    print(logit_rfe.score(test_rfe_X,test_rfe_y))



0.7561534058385804  -->...
Score with Top  1  Features:
0.7561534058385804
Score with Top  2  Features:
0.7561534058385804
Score with Top  3  Features:
0.7561534058385804
Score with Top  4  Features:
0.7561534058385804
Score with Top  5  Features:
0.7561534058385804
Score with Top  10  Features:
0.7561534058385804
Score with Top  15  Features:
0.7561534058385804
Score with Top  20  Features:
0.7561534058385804
Score with Top  25  Features:
0.7561534058385804
Score with Top  50  Features:
0.7555809959931311
Score with Top  75  Features:
0.7555809959931311
Score with Top  100  Features:
0.7561534058385804


In [251]:
logit = LogisticRegression(solver='liblinear')

rfe = RFE(logit,10)
rfe = rfe.fit(X_train,y_train.values.ravel())

rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : [i for i in X_train.columns],
                       "ranking" : rfe.ranking_,
                      })

In [252]:
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()

In [253]:
cols.extend(features.FlightPair.values.tolist())

In [254]:
cols

['wac-wordclearformattingtrackchanges-',
 'wac-wordcopypasteparagraphcharacterstyleid-',
 'wac-wordeditorinlinetabs-',
 'wac-wordeditorinlinetabsfeaturecomposition-',
 'wac-wordeditorserviceverificationloopab-',
 'wac-wordhandlemixedlistparagraphmultiselect-',
 'wac-wordinteractiveperflimitproofingresume-',
 'wac-wordinteractiveperfscrollimprovement-',
 'wac-wordlogtasksdetailsfromcpumeter-',
 'wac-wordmergedparagraphwordidmappingimprovements-',
 'wac-wordideasexperience-',
 'wac-wordcopypasteparagraphcharacterstyleid-',
 'wac-wordhometabownoverflow-',
 'wac-wordideas-',
 'wac-wordconditionalinsideborderfix-',
 'wac-wordappnamerebrand-',
 'wac-wordeditortableofcontentcreateanddelete-',
 'wac-wordbrowsershowserrorwhenwaitingonlongboot-',
 'wac-wordinteractiveperfoverlaycontrolrenderoptimization-',
 'wac-wordallowcontenteditabledragoutsidewindow-',
 'wac-fetchkeepaliveonshutdown-']

In [255]:
cols = list(set(cols))

logit = sm.Logit(y['word'],X['word'].loc[:,cols])
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients.join(flight_durations)

         Current function value: 0.604487
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                17466
Model:                          Logit   Df Residuals:                    17446
Method:                           MLE   Df Model:                           19
Date:                Thu, 17 Oct 2019   Pseudo R-squ.:                -0.07430
Time:                        15:48:12   Log-Likelihood:                -10558.
converged:                      False   LL-Null:                       -9827.8
                                        LLR p-value:                     1.000
                                                               coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------------------
wac-wordclearformattingtrackchanges-                



Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],Odds Ratio,O.R.LB,O.R.UB,FlightStart,FlightEnd
wac-wordhandlemixedlistparagraphmultiselect-,0.71657,0.051891,13.809032,2.248326e-43,0.614865,0.818275,2.047398,1.849406,2.266587,2019-06-04 16:49:49,2019-09-15 10:19:26
wac-wordinteractiveperfoverlaycontrolrenderoptimization-,0.767058,0.069394,11.053717,2.103228e-28,0.631049,0.903067,2.153421,1.879581,2.467158,2019-06-03 22:44:21,2019-07-08 19:14:13
wac-wordmergedparagraphwordidmappingimprovements-,0.887977,0.142771,6.2196,4.98425e-10,0.608152,1.167803,2.430209,1.837033,3.214922,2019-05-16 00:39:21,2019-06-12 02:39:11
wac-wordcopypasteparagraphcharacterstyleid-,1.810919,1.032954,1.753146,0.07957701,-0.213634,3.835472,6.116067,0.807644,46.315296,2019-09-27 20:14:10,2019-09-29 22:29:05
wac-wordconditionalinsideborderfix-,0.222581,0.099852,2.229111,0.02580652,0.026875,0.418286,1.249296,1.027239,1.519356,2019-05-16 00:19:19,2019-07-11 00:00:07
wac-wordbrowsershowserrorwhenwaitingonlongboot-,0.532331,0.162742,3.271017,0.001071614,0.213363,0.851298,1.702896,1.237834,2.342686,2019-05-16 07:20:28,2019-06-11 22:44:05
wac-fetchkeepaliveonshutdown-,0.809075,0.037544,21.549834,5.313769e-103,0.735489,0.88266,2.245829,2.086502,2.417322,2019-07-12 19:30:22,2019-09-17 19:50:21
wac-wordallowcontenteditabledragoutsidewindow-,0.221656,0.079655,2.782713,0.005390648,0.065536,0.377777,1.248142,1.067731,1.459038,2019-05-25 00:39:04,2019-08-08 21:10:18
wac-wordeditortableofcontentcreateanddelete-,0.614802,0.206169,2.982027,0.002863464,0.210718,1.018887,1.849291,1.234564,2.770109,2019-05-16 00:19:19,2019-05-20 21:14:13
wac-wordeditorserviceverificationloopab-,-0.290557,0.115649,-2.512409,0.01199101,-0.517225,-0.06389,0.747847,0.596172,0.938109,2019-05-16 00:19:19,2019-09-18 08:25:50


## PowerPoint

In [209]:
print('PowerPoint AllUp MODEL')
logit = sm.Logit(y['pp'],X['pp'])
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients.join(flight_durations)

PowerPoint AllUp MODEL
Optimization terminated successfully.
         Current function value: 0.555416
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                12963
Model:                          Logit   Df Residuals:                    12949
Method:                           MLE   Df Model:                           13
Date:                Thu, 17 Oct 2019   Pseudo R-squ.:                -0.03091
Time:                        14:56:02   Log-Likelihood:                -7199.9
converged:                       True   LL-Null:                       -6984.0
                                        LLR p-value:                     1.000
                                                 coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------
ppt-designerdashboard

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],FlightStart,FlightEnd
ppt-formatpainterapplyremapped-,0.530613,0.047638,11.138549,8.143528e-29,0.437245,0.623981,2019-07-04 02:04:08,2019-09-11 02:49:10
ppt-helptabwhatsnewenabled-,0.475593,0.096992,4.90341,9.418688e-07,0.285492,0.665695,2019-05-31 04:59:17,2019-09-29 22:44:04
ppt-newcommentdraftingapiisenabled-,0.102655,0.049557,2.07144,0.03831765,0.005524,0.199785,2019-05-03 18:29:05,2019-08-22 20:34:08
ppt-newideas-,0.647612,0.036129,17.924898,7.538835e-72,0.5768,0.718424,2019-05-31 02:04:12,2019-09-29 23:54:06
ppt-pptnewcommentbuttoninpane-,0.778448,0.081709,9.527111,1.617205e-21,0.618302,0.938594,2019-05-01 00:04:11,2019-07-18 07:46:00
ppt-pptsharedcommentsfluentui-,0.508543,0.071174,7.145039,8.997051e-13,0.369044,0.648042,2019-08-21 14:34:14,2019-09-29 23:54:06
ppt-textformatpainter-,-0.188533,0.088142,-2.138963,0.0324387,-0.361288,-0.015777,2019-05-06 08:05:42,2019-07-11 09:54:30
ppt-wopitokenrefresh-,0.49084,0.081444,6.026732,1.673086e-09,0.331213,0.650467,2019-05-01 00:04:11,2019-09-29 22:44:04


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
rf = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [50, 100, 250, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,10,None],
    'oob_score': [True,False]}

CV_rfc = GridSearchCV(estimator=rf, n_jobs=-1, param_grid=param_grid, verbose=10, scoring='neg_log_loss',cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_score_)

## Overall

Flights that appear in each of the three apps

In [210]:
print('SATURATED MODEL')
logit = sm.Logit(y['all'],X['all'])
flogit = logit.fit()
print(flogit.summary())
coefficients = flogit.summary2().tables[1]
coefficients = coefficients[coefficients['P>|z|']<0.1]
coefficients['Odds Ratio']=np.exp(coefficients['Coef.'])
coefficients['O.R.LB']=np.exp(coefficients['[0.025'])
coefficients['O.R.UB']=np.exp(coefficients['0.975]'])
coefficients.join(flight_durations)

SATURATED MODEL
Optimization terminated successfully.
         Current function value: 0.572658
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    NPS   No. Observations:                47053
Model:                          Logit   Df Residuals:                    47028
Method:                           MLE   Df Model:                           24
Date:                Thu, 17 Oct 2019   Pseudo R-squ.:                -0.02428
Time:                        14:56:09   Log-Likelihood:                -26945.
converged:                       True   LL-Null:                       -26307.
                                        LLR p-value:                     1.000
                                                                  coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975],FlightStart,FlightEnd
docowner-canary-,0.314681,0.021259,14.80225,1.41664e-49,0.273014,0.356348,2019-05-01 00:04:11,2019-09-17 17:10:09
xls-overridebrowsershortcuts-,0.400569,0.042653,9.391334,5.9244929999999996e-21,0.31697,0.484167,2019-08-25 10:09:31,2019-09-29 23:55:04
wac-wordeditorcashdashisactivecoauth-,0.514753,0.14882,3.458908,0.0005423706,0.223072,0.806434,2019-09-24 18:20:12,2019-09-29 23:50:05
wac-wordshowpasteoptionsincontextmenu-,0.250356,0.060892,4.111488,3.93118e-05,0.13101,0.369702,2019-07-08 18:14:05,2019-09-29 23:50:05
canary-docowner-,0.391644,0.043537,8.995638,2.348633e-19,0.306313,0.476975,2019-09-18 18:19:11,2019-09-29 23:55:07
canary2-,0.320066,0.019397,16.500886,3.615509e-61,0.282049,0.358083,2019-05-01 00:04:11,2019-09-29 23:55:07
wac-worduseaadforfileupload-,0.312258,0.033318,9.37212,7.109002e-21,0.246956,0.37756,2019-07-10 18:59:06,2019-09-10 20:10:08
wacdash-,0.294835,0.020089,14.676704,9.090234e-49,0.255462,0.334208,2019-05-01 00:04:11,2019-09-29 23:55:07
wac-wordsdxpreloadpackagelist-,0.33212,0.05304,6.261674,3.808669e-10,0.228163,0.436077,2019-09-09 18:09:56,2019-09-29 23:55:07
firstrelease-,0.308134,0.035166,8.762366,1.911936e-18,0.239211,0.377058,2019-05-01 00:04:11,2019-09-29 23:55:07
