In [1]:
# Import everything I am going to need up front
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [15]:
open("C:/_Project/Diageo_Search/Search_term_report_last30.csv")

<_io.TextIOWrapper name='C:/_Project/Diageo_Search/Search_term_report_last30.csv' mode='r' encoding='cp1252'>

In [2]:
# Get csv file for search term report
search_term = pd.read_csv('C:/_Project/Diageo_Search/Search_term_report_last30.csv', sep = "\t")

In [16]:
search_term.to_csv("C:/_Project/Diageo_Search/Search_term_report_last30_new.csv", index = False)

In [3]:
# Take a look at first 5 rows
search_term.head(5)

Unnamed: 0,Match type,Search term,Added/Excluded,Campaign,Ad group,Keyword,Clicks,Impressions,CTR,Avg. CPC,Cost,Avg. position
0,exact (close variant),pineapple cocktails with vodka,,C: Pineapple,Pineapple,[pineapple vodka cocktail],0,1,0.00%,0.0,0.0,1.0
1,exact (close variant),white rushin,,R: White Russian,White Russian,[white russian],2,9,22.22%,0.42,0.84,1.1
2,exact (close variant),thanks giving cocotial,,L: Thanksgiving,Thanksgiving Cocktail - Exact,[thanksgiving cocktail],1,1,100.00%,1.57,1.57,2.0
3,exact (close variant),alcoholic thanksgiving drink,,L: Thanksgiving,Thanksgiving-Drink,[thanksgiving alcoholic drink],2,12,16.67%,3.13,6.26,1.3
4,exact,spiked punch for thanksgiving,Added,L: Thanksgiving,Thanksgiving-Punch,[spiked punch for thanksgiving],1,1,100.00%,0.38,0.38,1.0


In [222]:
search_term.dtypes

Match type         object
Search term        object
Added/Excluded     object
Campaign           object
Ad group           object
Keyword            object
Clicks              int64
Impressions         int64
CTR                object
Avg. CPC          float64
Cost              float64
Avg. position     float64
Cluster             int64
dtype: object

In [221]:
search_term['Cost'] = search_term['Cost'].str.replace(',','').astype(np.float64)

In [4]:
# Create a list of search term
search_term_list = list(search_term['Search term'])

In [5]:
# Take a look at first 10 rows of search term list
search_term_list[:10]

['pineapple cocktails with vodka',
 'white rushin',
 'thanks giving cocotial',
 'alcoholic thanksgiving drink',
 'spiked punch for thanksgiving',
 'spiked punch for thanksgiving',
 'what drinks can you make with blueberry vodka',
 'smirnoff peppermint bark martini',
 'white russisns',
 'how to make a dirt martini']

In [6]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

In [7]:
# Print stopwords
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [8]:
# Load nltk's SnowballStemmer as variable called 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [9]:
# Here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed
def tokenize_and_stem(text):
    # First tokenize by sentence, then by word to ensure that punctiuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # Filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # First tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # Filter out any tokens not containing letters (e.g., numeric tokens, raw punctiation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [10]:
# Not super pythonic, no, not at all
# Use extend so it's a big flat list of vacab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in search_term_list:
    allwords_stemmed = tokenize_and_stem(i) # For each item in 'search_term_list', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) # Extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [11]:
# Create a pandas DataFrame with the stemmed vocabulary as the index and the tokenized words as the column
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [12]:
# Look up a stem and return a fuill token
vocab_frame.head(10)

Unnamed: 0,words
pineappl,pineapple
cocktail,cocktails
with,with
vodka,vodka
white,white
rushin,rushin
thank,thanks
give,giving
cocoti,cocotial
alcohol,alcoholic


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Define count vectorizer parameters
count_vectorizer = CountVectorizer(max_df = 0.7,
                                   min_df = 3,
                                   max_features = None,
                                   stop_words = 'english',
                                   strip_accents = 'unicode',
                                   analyzer = 'word',
                                   tokenizer = tokenize_only,
                                   ngram_range = (1,3)
                                   )

# Define tfidf vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df = 0.7,
                                   min_df = 3,
                                   max_features = None,
                                   stop_words = 'english',
                                   strip_accents = 'unicode',
                                   analyzer = 'word',
                                   use_idf = True,
                                   tokenizer = tokenize_only,
                                   ngram_range = (1,3)
                                  )

%time tfidf_matrix = tfidf_vectorizer.fit_transform(search_term_list) # Fit the tfidf vectorizer to search term list
%time count_matrix = count_vectorizer.fit_transform(search_term_list) # Fit the count vectorizer to search term list

tfidf_matrix.shape
count_matrix.shape

Wall time: 1.42 s
Wall time: 1.37 s


(8571, 2051)

In [17]:
# Define a list of the feature used in the count matrix
count = count_vectorizer.get_feature_names()

In [18]:
count_df = pd.DataFrame(count_matrix.toarray(), columns=count)

In [19]:
count_df.head(5)

Unnamed: 0,'s,'s good,'s good mix,'s smirnoff,'s white,'s white russian,100proof,4th,4th july,50ml,...,wins,winter,winter cocktails,wirh,wjite,world,woth,www,www smirnoff,youtube
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
count_df.sum(0).sort_values(ascending=False)

smirnoff                     3066
vodka                        2664
drinks                        687
drink                         644
thanksgiving                  557
recipe                        546
peppermint                    527
recipes                       523
martini                       458
make                          447
white                         411
caramel                       401
mix                           344
russian                       287
apple                         284
cocktail                      283
watermelon                    277
caramel vodka                 263
cocktails                     251
raspberry                     244
vanilla                       238
white russian                 220
strawberry                    216
smirnoff peppermint           206
mule                          201
peppermint vodka              199
punch                         183
vanilla vodka                 181
good                          179
green         

In [21]:
# Define a list of the features used in the tf-idf matrix
terms = tfidf_vectorizer.get_feature_names()

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [23]:
print(dist)

[[  0.00000000e+00   1.00000000e+00   1.00000000e+00 ...,   9.66851472e-01
    1.00000000e+00   8.31346093e-01]
 [  1.00000000e+00   0.00000000e+00   1.00000000e+00 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 [  1.00000000e+00   1.00000000e+00  -2.22044605e-16 ...,   1.00000000e+00
    1.00000000e+00   1.00000000e+00]
 ..., 
 [  9.66851472e-01   1.00000000e+00   1.00000000e+00 ...,  -2.22044605e-16
    1.00000000e+00   1.00000000e+00]
 [  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,   1.00000000e+00
   -2.22044605e-16   1.00000000e+00]
 [  8.31346093e-01   1.00000000e+00   1.00000000e+00 ...,   1.00000000e+00
    1.00000000e+00  -2.22044605e-16]]


In [32]:
# K-means clustering
from sklearn.cluster import KMeans

num_clusters = 12

km = KMeans(n_clusters = num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 4.17 s


In [57]:
#from sklearn.externals import joblib
#km = joblib.load('doc_cluster.pkl')
#clusters = km.labels_.tolist()

In [188]:
from sklearn.externals import joblib

# Uncomment the below to save your model
# Since I've already run my model I am loading from the pickle

joblib.dump(km, 'doc_cluster.pkl')

#km = joblib.load('doc_cluster.pkl')
#clusters = km.labels_.tolist()

['doc_cluster.pkl']

In [33]:
from __future__ import print_function

print("Top terms per cluster:")
print()
# sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :15]: # replace 6 with n words per cluster
        print(' %s' % terms[ind], end=',')
        print()
    print() # add whitespace
    print() # add whitespace
    
    print("Cluster %d Search term:" % i, end='')
    for term in search_term.ix[i:15]['Search term']:
        print(' %s,' % term, end='')
        print()
    print() #add whitespace
    print() #add whitespace

print()
print()

Top terms per cluster:

Cluster 0 words: recipes,
 vodka recipes,
 vodka,
 drink recipes,
 smirnoff,
 drink,
 smirnoff recipes,
 martini recipes,
 cocktail recipes,
 caramel,
 caramel vodka,
 recipes smirnoff,
 martini,
 cocktail,
 vodka drink recipes,


Cluster 0 Search term: pineapple cocktails with vodka,
 white rushin,
 thanks giving cocotial,
 alcoholic thanksgiving drink,
 spiked punch for thanksgiving,
 spiked punch for thanksgiving,
 what drinks can you make with blueberry vodka,
 smirnoff peppermint bark martini,
 white russisns,
 how to make a dirt martini,
 smirnfoff limited edition,
 boueberry vodka,
 how much alcohol smirnoff watermelon ice,
 drinks with vadka,
 what mixes with smirnoff kissed carmel,
 what takes good with smirnoff,


Cluster 1 words: vodka,
 drink,
 vodka drink,
 caramel vodka,
 caramel,
 drink vodka,
 make,
 recipe,
 mix,
 drink caramel,
 drink caramel vodka,
 drink recipe,
 pineapple,
 vanilla vodka,
 watermelon,


Cluster 1 Search term: white rushin,
 

In [34]:
search_term['Cluster'] = clusters

In [35]:
search_term['Cluster'].value_counts()

6     4745
11     567
5      548
3      481
2      457
1      411
0      410
10     328
4      226
7      186
9      148
8       64
Name: Cluster, dtype: int64

In [36]:
grouped_impressions = search_term['Impressions'].groupby(search_term['Cluster'])

In [37]:
grouped_impressions.sum() # sum impressions per cluster

Cluster
0      18321
1      27009
2      68639
3      14586
4      29314
5     113382
6     237466
7     150616
8        433
9      44870
10    117939
11     51611
Name: Impressions, dtype: int64

In [38]:
grouped_impressions.mean() # average impressions per cluster

Cluster
0      44.685366
1      65.715328
2     150.194748
3      30.324324
4     129.707965
5     206.901460
6      50.045522
7     809.763441
8       6.765625
9     303.175676
10    359.570122
11     91.024691
Name: Impressions, dtype: float64

In [39]:
grouped_clicks = search_term['Clicks'].groupby(search_term['Cluster']) # group by cluster for aggregation purposes

In [40]:
grouped_clicks.sum() # sum clicks per cluster

Cluster
0      3461
1      2192
2      5110
3      4363
4      4718
5     14364
6     34600
7     31159
8       122
9      7953
10    22303
11     8777
Name: Clicks, dtype: int64

In [41]:
grouped_clicks.mean() # average clicks per cluster

Cluster
0       8.441463
1       5.333333
2      11.181619
3       9.070686
4      20.876106
5      26.211679
6       7.291886
7     167.521505
8       1.906250
9      53.736486
10     67.996951
11     15.479718
Name: Clicks, dtype: float64

In [42]:
grouped_clicks.mean()/grouped_impressions.mean()

Cluster
0     0.188909
1     0.081158
2     0.074447
3     0.299122
4     0.160947
5     0.126687
6     0.145705
7     0.206877
8     0.281755
9     0.177245
10    0.189106
11    0.170061
dtype: float64

In [192]:
grouped_cost = search_term['Cost'].groupby(search_term['Cluster']) # group by cluster for aggregation purposes

In [210]:
float(search_term['Cost'].values())

TypeError: 'numpy.ndarray' object is not callable

In [223]:
grouped_cost = search_term['Cost'].groupby(search_term['Cluster']) # group by cluster for aggregation purposes

In [224]:
grouped_cost.sum() # sum clicks per cluster

Cluster
0      4585.19
1      2834.11
2      3801.46
3      1538.24
4      7414.04
5     18830.25
6     42002.25
7     13940.93
8        87.20
9      4595.42
10     9315.83
11    16190.19
Name: Cost, dtype: float64

In [225]:
grouped_cost.mean()

Cluster
0     11.183390
1      6.895645
2      8.318293
3      3.198004
4     32.805487
5     34.361770
6      8.851897
7     74.951237
8      1.362500
9     31.050135
10    28.401921
11    28.554127
Name: Cost, dtype: float64

In [47]:
search_term.shape

(8571, 13)

In [46]:
count_df.shape

(8571, 2051)

In [48]:
search_term.head()

Unnamed: 0,Match type,Search term,Added/Excluded,Campaign,Ad group,Keyword,Clicks,Impressions,CTR,Avg. CPC,Cost,Avg. position,Cluster
0,exact (close variant),pineapple cocktails with vodka,,C: Pineapple,Pineapple,[pineapple vodka cocktail],0,1,0.00%,0.0,0.0,1.0,6
1,exact (close variant),white rushin,,R: White Russian,White Russian,[white russian],2,9,22.22%,0.42,0.84,1.1,6
2,exact (close variant),thanks giving cocotial,,L: Thanksgiving,Thanksgiving Cocktail - Exact,[thanksgiving cocktail],1,1,100.00%,1.57,1.57,2.0,6
3,exact (close variant),alcoholic thanksgiving drink,,L: Thanksgiving,Thanksgiving-Drink,[thanksgiving alcoholic drink],2,12,16.67%,3.13,6.26,1.3,5
4,exact,spiked punch for thanksgiving,Added,L: Thanksgiving,Thanksgiving-Punch,[spiked punch for thanksgiving],1,1,100.00%,0.38,0.38,1.0,5


In [211]:
search_term.to_csv('search_term_new.csv')

In [49]:
count_df.head()

Unnamed: 0,'s,'s good,'s good mix,'s smirnoff,'s white,'s white russian,100proof,4th,4th july,50ml,...,wins,winter,winter cocktails,wirh,wjite,world,woth,www,www smirnoff,youtube
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
df_final = search_term.join(count_df, lsuffix = '_search_term', rsuffix = '_count_df')

In [51]:
df_final.head()

Unnamed: 0,Match type,Search term,Added/Excluded,Campaign,Ad group,Keyword,Clicks,Impressions,CTR,Avg. CPC,...,wins,winter,winter cocktails,wirh,wjite,world,woth,www,www smirnoff,youtube
0,exact (close variant),pineapple cocktails with vodka,,C: Pineapple,Pineapple,[pineapple vodka cocktail],0,1,0.00%,0.0,...,0,0,0,0,0,0,0,0,0,0
1,exact (close variant),white rushin,,R: White Russian,White Russian,[white russian],2,9,22.22%,0.42,...,0,0,0,0,0,0,0,0,0,0
2,exact (close variant),thanks giving cocotial,,L: Thanksgiving,Thanksgiving Cocktail - Exact,[thanksgiving cocktail],1,1,100.00%,1.57,...,0,0,0,0,0,0,0,0,0,0
3,exact (close variant),alcoholic thanksgiving drink,,L: Thanksgiving,Thanksgiving-Drink,[thanksgiving alcoholic drink],2,12,16.67%,3.13,...,0,0,0,0,0,0,0,0,0,0
4,exact,spiked punch for thanksgiving,Added,L: Thanksgiving,Thanksgiving-Punch,[spiked punch for thanksgiving],1,1,100.00%,0.38,...,0,0,0,0,0,0,0,0,0,0


In [253]:
count_df.multiply(df_final['Impressions'],axis=0).sum(0).sort_values(ascending = False).sum()

3130596

In [71]:
df_final['Cluster'].head()

0    6
1    6
2    6
3    5
4    5
Name: Cluster, dtype: int64

In [69]:
df_wordcloud = count_df.multiply(df_final['Impressions'],axis=0).join(df_final['Cluster'].to_frame())

In [70]:
df_wordcloud.head()

Unnamed: 0,'s,'s good,'s good mix,'s smirnoff,'s white,'s white russian,100proof,4th,4th july,50ml,...,winter,winter cocktails,wirh,wjite,world,woth,www,www smirnoff,youtube,Cluster
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


In [73]:
df_wordcloud = df_wordcloud.set_index('Cluster')

In [132]:
cluster0 = df_wordcloud[df_wordcloud.index ==0].sum(0).sort_values(ascending=False).dropna()
cluster1 = df_wordcloud[df_wordcloud.index ==1].sum(0).sort_values(ascending=False).dropna()
cluster2 = df_wordcloud[df_wordcloud.index ==2].sum(0).sort_values(ascending=False).dropna()
cluster3 = df_wordcloud[df_wordcloud.index ==3].sum(0).sort_values(ascending=False).dropna()
cluster4 = df_wordcloud[df_wordcloud.index ==4].sum(0).sort_values(ascending=False).dropna()
cluster5 = df_wordcloud[df_wordcloud.index ==5].sum(0).sort_values(ascending=False).dropna()
cluster6 = df_wordcloud[df_wordcloud.index ==6].sum(0).sort_values(ascending=False).dropna()
cluster7 = df_wordcloud[df_wordcloud.index ==7].sum(0).sort_values(ascending=False).dropna()
cluster8 = df_wordcloud[df_wordcloud.index ==8].sum(0).sort_values(ascending=False).dropna()
cluster9 = df_wordcloud[df_wordcloud.index ==9].sum(0).sort_values(ascending=False).dropna()
cluster10 = df_wordcloud[df_wordcloud.index ==10].sum(0).sort_values(ascending=False).dropna()
cluster11 = df_wordcloud[df_wordcloud.index ==11].sum(0).sort_values(ascending=False).dropna()

In [242]:
cluster_all = df_wordcloud.sum(0).sort_values(ascending=False).dropna()

In [142]:
dict(np.log(cluster4+1))

{'martini': 10.285854609398315,
 'recipe': 9.1689974084418004,
 'martini recipe': 9.1545104870155125,
 'dirty martini': 9.146761390333312,
 'dirty': 9.146761390333312,
 'make': 8.4553177876981493,
 'make martini': 8.4551049991028151,
 'pomegranate': 7.9006366130180048,
 'caramel': 6.4723462945009009,
 'caramel martini': 6.4676987261043539,
 'drink': 6.045005314036012,
 'pineapple': 5.5568280616995374,
 'pineapple martini': 5.5529595849216173,
 'drinks': 5.2094861528414214,
 'blueberry': 5.1984970312658261,
 'blueberry martini': 5.1929568508902104,
 'pomegranite': 5.0434251169192468,
 'recipe martini': 4.9344739331306915,
 'chicolate': 4.7706846244656651,
 'making': 4.5849674786705723,
 'pepermint': 3.8286413964890951,
 'pomegrante': 3.8066624897703196,
 'drop': 3.1780538303479458,
 'drop martini': 3.1780538303479458,
 'tecipe': 2.3978952727983707,
 'watermellon': 2.3025850929940459,
 'cocktail': 2.1972245773362196,
 'recepie': 2.1972245773362196,
 'smirnoff': 2.1972245773362196,
 'than

In [87]:
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
from collections import Counter

In [139]:
cluster_term0 = search_term[['Search term', 'Impressions']][search_term['Cluster'] == 0]

In [140]:
cluster_term0.head()

Unnamed: 0,Search term,Impressions
25,drinks made with vanilla smirnoff recipes,1
70,smirnov peppermint vodka recipes,3
132,recipes with smirnoff kissed carmel,1
145,recipes for tamarindo vodka smirnof,1
165,caramel kissed vodka recipes,29


In [149]:
Dict = dict(cluster_term0.values.tolist())

In [155]:
Dict

{'3 olives birthday cake vodka recipes': 1,
 'acai blueberry vodka recipes': 2,
 'alcoholic drink recipes for thanksgiving': 20,
 'alcoholic thanksgiving drink recipes': 6,
 'armel vidka recipes': 1,
 'batch cocktail recipes': 7,
 'batch drink recipes': 1,
 'batched cocktail recipes': 1,
 'berry vodka recipes': 21,
 'best bloody mary recipes': 50,
 'best bloody marys recipes': 1,
 'best martini recipes': 46,
 'best recipes for bloody marys': 2,
 'best thanksgiving martini recipes': 2,
 'beverage recipes with caramel vodka': 1,
 'black cherry vodka recipes': 11,
 'blood mary recipes': 1,
 'blood orange martini recipes': 1,
 'bloody mary mix recipes': 1,
 'bloody mary recipes': 485,
 'bloodymary recipes': 8,
 'blueberry martini recipes': 70,
 'blueberry smirnoff recipes': 18,
 'blueberry vodka drink recipes': 6,
 'blueberry vodka recipes': 72,
 'cake vodka recipes': 107,
 'cale vodka recipes': 1,
 'canberry cocktail recipes': 1,
 'caramel kiss vodka recipes': 14,
 'caramel kissed vodka r

In [248]:
mask = np.array(Image.open("smirnoff_bg.jpg"))

In [249]:
image_colors = ImageColorGenerator(mask)

In [250]:
wc = WordCloud(background_color = "white", mask = mask, color_func = image_colors)

In [245]:
wc = WordCloud(background_color = "white", mask = mask, color_func = image_colors)

In [251]:
wc.generate_from_frequencies(dict(np.log(cluster_all+1.0001)))

<wordcloud.wordcloud.WordCloud at 0x251de484470>

In [252]:
wc.to_file("cluster_new_all.png")

<wordcloud.wordcloud.WordCloud at 0x251de484470>

In [157]:
np.log(1.00001)

9.9999500003988414e-06

In [232]:
search_term.dtypes

Match type         object
Search term        object
Added/Excluded     object
Campaign           object
Ad group           object
Keyword            object
Clicks              int64
Impressions         int64
CTR                object
Avg. CPC          float64
Cost              float64
Avg. position     float64
Cluster            object
dtype: object

In [231]:
search_term['Cluster'] = search_term['Cluster'].astype('str')

In [234]:
search_term['Imp_log'] = np.log(search_term['Impressions']+1)
search_term['Click_log'] = np.log(search_term['Clicks']+1)
search_term['Cost_log'] = np.log(search_term['Cost']+1)

In [235]:
import statsmodels.formula.api as sm

In [241]:
result = sm.ols(formula = "Cost_log ~ Cluster", data = search_term).fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:               Cost_log   R-squared:                       0.034
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     27.05
Date:                Fri, 01 Dec 2017   Prob (F-statistic):           3.15e-56
Time:                        16:56:43   Log-Likelihood:                -12252.
No. Observations:                8571   AIC:                         2.453e+04
Df Residuals:                    8559   BIC:                         2.461e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------
Intercept         0.8814      0.050     17.648