### LOADING LIBRARIES AND DATA

In [4]:
import pandas as pd
import datetime
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

In [5]:
search_terms = pd.read_csv(r'C:/Users/morrison/Desktop/Search Term Final Project/Search Term Final Project Data.csv')

In [6]:
search_terms.head()

Unnamed: 0,Search Term,Date,Hour,Minute,User ID,Total Unique Searches,Search Depth,Time after Search
0,Advisors,20180416,6,26,Anonymous-User,2,0,244.5
1,fiduciary asset sizing,20180601,16,37,c5efcf93-2c56-e711-8107-5065f38afab1,2,0,360.5
2,intermediary distribution,20180405,13,12,U001BHX,2,0,119.5
3,investor segmentation,20180611,4,32,U001COK,2,3,140.0
4,- The Next Generation of Planning,20180530,11,1,Anonymous-User,1,1,33.0


In [7]:
search_terms.dtypes

Search Term               object
Date                       int64
Hour                       int64
Minute                     int64
User ID                   object
Total Unique Searches      int64
Search Depth               int64
Time after Search        float64
dtype: object

### PREPROCESSING

In [8]:
#create binary user ID variable
#users with a weird looking string are existing clients in our system
#users with 'Anonymous-User' are not in our system
#existing users == 1 | unknown/new users == 0
search_terms['UserID'] = search_terms['User ID']
search_terms['terms'] = search_terms['Search Term']
search_terms['user_not'] = search_terms.UserID.map(lambda x: 0 if x == 'Anonymous-User' else 1)
search_terms.head()

Unnamed: 0,Search Term,Date,Hour,Minute,User ID,Total Unique Searches,Search Depth,Time after Search,UserID,terms,user_not
0,Advisors,20180416,6,26,Anonymous-User,2,0,244.5,Anonymous-User,Advisors,0
1,fiduciary asset sizing,20180601,16,37,c5efcf93-2c56-e711-8107-5065f38afab1,2,0,360.5,c5efcf93-2c56-e711-8107-5065f38afab1,fiduciary asset sizing,1
2,intermediary distribution,20180405,13,12,U001BHX,2,0,119.5,U001BHX,intermediary distribution,1
3,investor segmentation,20180611,4,32,U001COK,2,3,140.0,U001COK,investor segmentation,1
4,- The Next Generation of Planning,20180530,11,1,Anonymous-User,1,1,33.0,Anonymous-User,- The Next Generation of Planning,0


### EDA

In [9]:
search_terms.groupby(by='user_not').describe()

Unnamed: 0_level_0,Date,Date,Date,Date,Date,Date,Date,Date,Hour,Hour,...,Time after Search,Time after Search,Total Unique Searches,Total Unique Searches,Total Unique Searches,Total Unique Searches,Total Unique Searches,Total Unique Searches,Total Unique Searches,Total Unique Searches
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
user_not,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1934.0,20180520.0,83.338857,20180401.0,20180423.0,20180515.0,20180611.0,20180629.0,1934.0,12.1091,...,63.0,2557.0,1934.0,0.945708,0.228922,0.0,1.0,1.0,1.0,2.0
1,3569.0,20180520.0,81.615259,20180401.0,20180425.0,20180518.0,20180608.0,20180630.0,3569.0,12.516391,...,148.0,3293.0,3569.0,0.922387,0.270723,0.0,1.0,1.0,1.0,2.0


### CREATING TERM VECTORS

In [10]:
#defining features and target variables
X = search_terms.terms
y = search_terms.user_not

#split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
#count vector

nb = MultinomialNB()

cnt_top_terms = []
cnt_ngrams = []
for k in range(1,10):
    print('ngrams: ' + str(k))
    cnt_vect = CountVectorizer(stop_words='english', max_features=1000, ngram_range=(1,k), min_df=2)
    X_train_cnt = cnt_vect.fit_transform(X_train)
    X_test_cnt = cnt_vect.transform(X_test)
    
    #count vector
    nb.fit(X_train_cnt, y_train)
    y_pred_class_cnt = nb.predict(X_test_cnt)

    #calculating accuracy.
    print('count vector accuracy: ' + str(metrics.accuracy_score(y_test, y_pred_class_cnt)))
    cnt_ngrams.append(metrics.accuracy_score(y_test, y_pred_class_cnt))

    
 
    
#creates a list of the top terms
cnt_top_terms.append(tfidf_tf.sum)
tfidf_top_terms.sort()

ngrams: 1
count vector accuracy: 0.6736918604651163
ngrams: 2
count vector accuracy: 0.6729651162790697
ngrams: 3
count vector accuracy: 0.6729651162790697
ngrams: 4
count vector accuracy: 0.6722383720930233
ngrams: 5
count vector accuracy: 0.6722383720930233
ngrams: 6
count vector accuracy: 0.6744186046511628
ngrams: 7
count vector accuracy: 0.6722383720930233
ngrams: 8
count vector accuracy: 0.6715116279069767
ngrams: 9
count vector accuracy: 0.6722383720930233


In [20]:
#tdif vector
tfidf_top_terms = []
tfidf_ngrams = []
for k in range(1,10):
    print('ngrams: ' + str(k))
    tfidf_vect = TfidfVectorizer(stop_words='english', max_features=1000, ngram_range=(1,k), min_df=2)
    X_train_tfidf = tfidf_vect.fit_transform(X_train)
    X_test_tfidf = tfidf_vect.transform(X_test)
    
    #count vector
    nb.fit(X_train_tfidf, y_train)
    y_pred_class_tfidf = nb.predict(X_test_tfidf)

    #calculating accuracy.
    print('tf-idf vector accuracy: ' + str(metrics.accuracy_score(y_test, y_pred_class_tfidf)))
    tfidf_ngrams.append(metrics.accuracy_score(y_test, y_pred_class_tfidf))
    


ngrams: 1
tf-idf vector accuracy: 0.6751453488372093
ngrams: 2
tf-idf vector accuracy: 0.6896802325581395
ngrams: 3
tf-idf vector accuracy: 0.6853197674418605
ngrams: 4
tf-idf vector accuracy: 0.6831395348837209
ngrams: 5
tf-idf vector accuracy: 0.6816860465116279
ngrams: 6
tf-idf vector accuracy: 0.6838662790697675
ngrams: 7
tf-idf vector accuracy: 0.6838662790697675
ngrams: 8
tf-idf vector accuracy: 0.6838662790697675
ngrams: 9
tf-idf vector accuracy: 0.6831395348837209


AttributeError: lower not found

In [134]:
import scipy as sp


feature_cols = ['terms', 'Search Depth', 'Time after Search']
X2 = search_terms[feature_cols]
y2 = search_terms.user_not

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=1)

# Use CountVectorizer with text column only.

for k in range(1,10):
    print('ngrams: ' + str(k))
    vect = TfidfVectorizer(ngram_range = (1,k))
    X_train_extra = vect.fit_transform(X_train.terms)
    X_test_extra = vect.transform(X_test.terms)

    # Cast other feature columns to float and convert to a sparse matrix.
    extra = sp.sparse.csr_matrix(X_train.drop('terms', axis=1).astype(float))


    # Combine sparse matrices.
    X_train_dtm_extra = sp.sparse.hstack((X_train_extra, extra))


    # Repeat for testing set.
    extra = sp.sparse.csr_matrix(X_test.drop('terms', axis=1).astype(float))
    X_test_dtm_extra = sp.sparse.hstack((X_test_extra, extra))

    # Use logistic regression with all features.
    logreg = LogisticRegression(C=1e9)
    logreg.fit(X_train_dtm_extra, y_train)
    y_pred_class = logreg.predict(X_test_dtm_extra)
    print('tfidf + features accuracy: ' + str(metrics.accuracy_score(y_test, y_pred_class)))

ngrams: 1
tfidf + features accuracy: 0.7020348837209303
ngrams: 2
tfidf + features accuracy: 0.6947674418604651
ngrams: 3
tfidf + features accuracy: 0.6875
ngrams: 4
tfidf + features accuracy: 0.690406976744186
ngrams: 5
tfidf + features accuracy: 0.6882267441860465
ngrams: 6
tfidf + features accuracy: 0.688953488372093
ngrams: 7
tfidf + features accuracy: 0.690406976744186
ngrams: 8
tfidf + features accuracy: 0.6867732558139535
ngrams: 9
tfidf + features accuracy: 0.690406976744186


In [11]:
vect = TfidfVectorizer(ngram_range = (1,k))
X_train_extra = vect.fit_transform(X_train.terms)
X_test_extra = vect.transform(X_test.terms)

# Cast other feature columns to float and convert to a sparse matrix.
extra = sp.sparse.csr_matrix(X_train.drop('terms', axis=1).astype(float))


# Combine sparse matrices.
X_train_dtm_extra = sp.sparse.hstack((X_train_extra, extra))


# Repeat for testing set.
extra = sp.sparse.csr_matrix(X_test.drop('terms', axis=1).astype(float))
X_test_dtm_extra = sp.sparse.hstack((X_test_extra, extra))

# Use logistic regression with all features.
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm_extra, y_train)
y_pred_class = logreg.predict(X_test_dtm_extra)
featuresplus = metrics.accuracy_score(y_test, y_pred_class)


#creates a list of the top terms
featuresplus_top_terms = []
featuresplus = pd.DataFrame(vect.fit_transform(X_train).toarray(), columns=vect.get_feature_names())
featuresplus_top_terms.append(cnt_tf.sum)
featuresplus_top_terms.sort()

NameError: name 'k' is not defined

### BASELINE MODEL

I am going to use a simple probability based upon the percentage of users compared to non-users as the baseline by which to measure the accuracy of subsequent models

In [13]:
baseline_model = search_terms['user_not'].sum() / search_terms.user_not.count()
print(baseline_model)

0.6485553334544794


### COMPARING PREDICTIONS

In [14]:
#using Naive Bayes to predict users vs nonusers
nb = MultinomialNB()

#count vector
cnt_vect = CountVectorizer(stop_words='english', max_features=1000, ngram_range=(1,6), min_df=2)
X_train_cnt = cnt_vect.fit_transform(X_train)
X_test_cnt = cnt_vect.transform(X_test)

nb.fit(X_train_cnt, y_train)
y_pred_class_cnt = nb.predict(X_test_cnt)

#calculating accuracy.
print('count vector accuracy: ' + str(metrics.accuracy_score(y_test, y_pred_class_cnt)))

#creates a list of the top terms
cnt_top_terms = []
cnt_tf = pd.DataFrame(cnt_vect.fit_transform(X_train).toarray(), columns=cnt_vect.get_feature_names())
cnt_top_terms.append(cnt_tf.sum)
cnt_top_terms.sort()


#tfidf vector
tfidf_vect = TfidfVectorizer(stop_words='english', max_features=1000, ngram_range=(1,2), min_df=2)
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)
    
nb.fit(X_train_tfidf, y_train)
y_pred_class_tfidf = nb.predict(X_test_tfidf)

#calculating accuracy.
print('Tf-IDF accuracy: '+ str(metrics.accuracy_score(y_test, y_pred_class_tfidf)))

#creates a list of the top terms
tfidf_top_terms = []
tf_idf = pd.DataFrame(tfidf_vect.fit_transform(X_train).toarray(), columns=tfidf_vect.get_feature_names())
tfidf_top_terms.append(tf_idf.sum)
tfidf_top_terms.sort()

print('-------------------------------------------')
print('baseline model : ' + str(baseline_model))

count vector accuracy: 0.6744186046511628
Tf-IDF accuracy: 0.6896802325581395
-------------------------------------------
baseline model : 0.6485553334544794


In [25]:
cnt_top_terms = cnt_tf.sum()
cnt_top_terms.sort_values(ascending = False)


management                                         183
asset                                              170
advisor                                            118
distribution                                       117
managed                                            115
retail                                             108
retirement                                         106
wealth                                             102
institutional                                       98
asset management                                    91
european                                            84
ria                                                 83
financial                                           80
product                                             79
fund                                                75
asia                                                75
accounts                                            75
markets                                             74
2017      

In [43]:
#Top terms for users vs nonusers for COUNT vector
cnt_merged_terms = pd.merge(search_terms, cnt_tf, left_index=True, right_index=True)
cnt_merged_terms_SORT = cnt_merged_terms.groupby(by='user_not').sum()
#cnt_merged_terms_SORT

cnt_merged_terms_SORT_USERS = cnt_merged_terms_SORT.iloc[[0]]
cnt_merged_terms_SORT_USERS

cnt_merged_terms_SORT_NONUSERS = cnt_merged_terms_SORT.iloc[[1]]
cnt_merged_terms_SORT_NONUSERS

Unnamed: 0_level_0,Date,Hour,Minute,Total Unique Searches,Search Depth,Time after Search,20,20 digital,2011,2015,...,worth,worth markets,worth markets 2016,worth ultra,worth ultra high,worth ultra high net,worth ultra high net worth,worth ultra high net worth markets,year,year review
user_not,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,53922345848,33328,81972,2675,2773,555480.0,2,2,3,8,...,23,5,2,5,5,5,5,5,3,3


In [50]:
ligma1 = cnt_merged_terms_SORT_NONUSERS.sum()
ligma1.sort_values(ascending = False)

Date                                5.392235e+10
Time after Search                   5.554800e+05
Minute                              8.197200e+04
Hour                                3.332800e+04
Search Depth                        2.773000e+03
Total Unique Searches               2.675000e+03
management                          1.350000e+02
asset                               1.170000e+02
advisor                             9.400000e+01
managed                             7.700000e+01
wealth                              7.600000e+01
retirement                          7.200000e+01
distribution                        6.900000e+01
retail                              6.900000e+01
asset management                    6.900000e+01
institutional                       6.100000e+01
european                            5.300000e+01
accounts                            5.000000e+01
investor                            4.900000e+01
asia                                4.900000e+01
markets             

In [49]:
ligma = cnt_merged_terms_SORT_USERS.sum()
ligma.sort_values(ascending = False)

Date                                          2.936265e+10
Time after Search                             1.776105e+05
Minute                                        4.496200e+04
Hour                                          1.748700e+04
Search Depth                                  1.938000e+03
Total Unique Searches                         1.456000e+03
asset                                         5.300000e+01
management                                    4.800000e+01
distribution                                  4.800000e+01
retail                                        3.900000e+01
managed                                       3.800000e+01
ria                                           3.800000e+01
institutional                                 3.700000e+01
retirement                                    3.400000e+01
product                                       3.400000e+01
financial                                     3.300000e+01
european                                      3.100000e+

In [63]:
top_terms_tfidf = tf_idf.sum()
top_terms_tfidf.sort_values(ascending = False)

management                   72.827455
ria                          66.116057
asset                        65.060808
retirement                   63.329714
esg                          62.366235
advisor                      59.543192
institutional                59.291170
etf                          57.952796
distribution                 56.797682
managed                      55.131930
wealth                       53.338771
retail                       46.949740
product                      43.743210
asia                         39.837028
accounts                     39.277341
asset management             39.066848
ocio                         37.948158
robo                         37.728502
insurance                    36.781920
european                     36.332395
financial                    35.650657
managed accounts             34.901497
fund                         34.852919
global                       33.517520
account                      33.511952
assets                   

In [51]:
#Top terms for users vs nonusers for tfidf vector
tfidf_merged_terms = pd.merge(search_terms, tf_idf, left_index=True, right_index=True)
tfidf_merged_terms_SORT = tfidf_merged_terms.groupby(by='user_not').sum()
#cnt_merged_terms_SORT

tfidf_merged_terms_SORT_USERS = cnt_merged_terms_SORT.iloc[[0]]
tfidf_merged_terms_SORT_USERS

tfidf_merged_terms_SORT_NONUSERS = cnt_merged_terms_SORT.iloc[[1]]
tfidf_merged_terms_SORT_NONUSERS

Unnamed: 0_level_0,Date,Hour,Minute,Total Unique Searches,Search Depth,Time after Search,20,20 digital,2011,2015,...,worth,worth markets,worth markets 2016,worth ultra,worth ultra high,worth ultra high net,worth ultra high net worth,worth ultra high net worth markets,year,year review
user_not,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,53922345848,33328,81972,2675,2773,555480.0,2,2,3,8,...,23,5,2,5,5,5,5,5,3,3


In [57]:
ligma2 = tfidf_merged_terms_SORT_USERS.sum()
ligma2.sort_values(ascending = False)

Date                                          2.936265e+10
Time after Search                             1.776105e+05
Minute                                        4.496200e+04
Hour                                          1.748700e+04
Search Depth                                  1.938000e+03
Total Unique Searches                         1.456000e+03
asset                                         5.300000e+01
management                                    4.800000e+01
distribution                                  4.800000e+01
retail                                        3.900000e+01
managed                                       3.800000e+01
ria                                           3.800000e+01
institutional                                 3.700000e+01
retirement                                    3.400000e+01
product                                       3.400000e+01
financial                                     3.300000e+01
european                                      3.100000e+

In [55]:
ligma3 = tfidf_merged_terms_SORT_NONUSERS.sum()
ligma3.sort_values(ascending = False)

Date                                5.392235e+10
Time after Search                   5.554800e+05
Minute                              8.197200e+04
Hour                                3.332800e+04
Search Depth                        2.773000e+03
Total Unique Searches               2.675000e+03
management                          1.350000e+02
asset                               1.170000e+02
advisor                             9.400000e+01
managed                             7.700000e+01
wealth                              7.600000e+01
retirement                          7.200000e+01
distribution                        6.900000e+01
retail                              6.900000e+01
asset management                    6.900000e+01
institutional                       6.100000e+01
european                            5.300000e+01
accounts                            5.000000e+01
investor                            4.900000e+01
asia                                4.900000e+01
markets             

### APPLICATION
The relevant application is to be able to feed a search query into the model and get back a certain percentage/probability that the searcher is a user or non-user

In [104]:
def user_query_identifier(query):
    nb = MultinomialNB()
    tfidf_vect = TfidfVectorizer(stop_words='english', max_features=1000, ngram_range=(1,5), min_df=2)
    
    X_train_tfidf_app = tfidf_vect.fit_transform(X_train)
    X_test_tfidf_app = tfidf_vect.transform(X_test)
    
    nb.fit(X_train_tfidf_app, y_train)
    nb.predict(query)
    print()

In [105]:
user_query_identifier('advisors')

ValueError: invalid literal for int() with base 10: 'advisors'

### ADDING FEATURES

In [134]:
import scipy as sp


feature_cols = ['terms', 'Search Depth', 'Time after Search']
X2 = search_terms[feature_cols]
y2 = search_terms.user_not

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X2, y2, random_state=1)

# Use CountVectorizer with text column only.

for k in range(1,10):
    print('ngrams: ' + str(k))
    vect = TfidfVectorizer(ngram_range = (1,k))
    X_train_extra = vect.fit_transform(X_train.terms)
    X_test_extra = vect.transform(X_test.terms)

    # Cast other feature columns to float and convert to a sparse matrix.
    extra = sp.sparse.csr_matrix(X_train.drop('terms', axis=1).astype(float))


    # Combine sparse matrices.
    X_train_dtm_extra = sp.sparse.hstack((X_train_extra, extra))


    # Repeat for testing set.
    extra = sp.sparse.csr_matrix(X_test.drop('terms', axis=1).astype(float))
    X_test_dtm_extra = sp.sparse.hstack((X_test_extra, extra))

    # Use logistic regression with all features.
    logreg = LogisticRegression(C=1e9)
    logreg.fit(X_train_dtm_extra, y_train)
    y_pred_class = logreg.predict(X_test_dtm_extra)
    print('tfidf + features accuracy: ' + str(metrics.accuracy_score(y_test, y_pred_class)))

ngrams: 1
tfidf + features accuracy: 0.7020348837209303
ngrams: 2
tfidf + features accuracy: 0.6947674418604651
ngrams: 3
tfidf + features accuracy: 0.6875
ngrams: 4
tfidf + features accuracy: 0.690406976744186
ngrams: 5
tfidf + features accuracy: 0.6882267441860465
ngrams: 6
tfidf + features accuracy: 0.688953488372093
ngrams: 7
tfidf + features accuracy: 0.690406976744186
ngrams: 8
tfidf + features accuracy: 0.6867732558139535
ngrams: 9
tfidf + features accuracy: 0.690406976744186


(4127, 2)

(4127, 1339)

(1376, 1339)

0.686046511627907


tfidf + features accuracy: 0.7020348837209303
