In [99]:
import nltk
import pandas as pd
import pickle

## Words that occur the most in company description and N-grams

### Data Processing Pipeline

In [100]:
# Load the company descriptions; the file uses ";" as its field separator.
company_df = pd.read_csv("data/company_desc_translated.csv", sep=";")

In [101]:
company_df

Unnamed: 0,company_name,score,description,description_len,description_en,category
0,Le Fourgon,49,Le Fourgon vous livre vos boissons consignees ...,342,Le Fourgon delivers your stored drinks to your...,food_beverages_tobacco
1,Comptoir des Vignes,49,Comptoir des Vignes est une enseigne de caves ...,877,Comptoir des Vignes is a brand of cellars spec...,food_beverages_tobacco
2,Shin Sekai,49,Bienvenue sur notre page Trustpilot ! Shin Sek...,391,Welcome to our Trustpilot page! Shin Sekai is ...,food_beverages_tobacco
3,Nutri Naturel,49,"Nutri-Naturel.com, l'epicerie bio en ligne de ...",587,"Nutri-Naturel.com, the leading online organic ...",food_beverages_tobacco
4,Maison Martin - Le Piment Français,49,Maison Martin - Le Piment Francais est la prem...,149,Maison Martin - Le Piment Francais is the firs...,food_beverages_tobacco
...,...,...,...,...,...,...
12991,Ljbautoparts,12,Vente de Pieces detachees Auto de Carrosserie ...,201,"Sale of auto body spare parts online: fender, ...",vehicles_transportation
12992,Aéroports de Paris,12,"Aeroports de Paris, avec ses trois plates-form...",478,"Aeroports de Paris, with its three platforms, ...",vehicles_transportation
12993,Online SAS,17,"Hebergement mutualise avec trafic illimite, no...",83,"Shared hosting with unlimited traffic, domain ...",vehicles_transportation
12994,shopequitation,12,Specialiste on-line de vente de materiel pour ...,269,Online specialist in the sale of horse riding ...,vehicles_transportation


In [102]:
company_df.isna().sum()

company_name          1
score                 0
description        1523
description_len       0
description_en     1528
category              0
dtype: int64

In [103]:
# Drop every row that has a missing value in any column. ignore_index=True
# renumbers the remaining rows 0..n-1, so row labels keep matching list
# positions in `corpus` and in the pipes derived from it.
company_df = company_df.dropna(ignore_index=True)

In [104]:
company_df

Unnamed: 0,company_name,score,description,description_len,description_en,category
0,Le Fourgon,49,Le Fourgon vous livre vos boissons consignees ...,342,Le Fourgon delivers your stored drinks to your...,food_beverages_tobacco
1,Comptoir des Vignes,49,Comptoir des Vignes est une enseigne de caves ...,877,Comptoir des Vignes is a brand of cellars spec...,food_beverages_tobacco
2,Shin Sekai,49,Bienvenue sur notre page Trustpilot ! Shin Sek...,391,Welcome to our Trustpilot page! Shin Sekai is ...,food_beverages_tobacco
3,Nutri Naturel,49,"Nutri-Naturel.com, l'epicerie bio en ligne de ...",587,"Nutri-Naturel.com, the leading online organic ...",food_beverages_tobacco
4,Maison Martin - Le Piment Français,49,Maison Martin - Le Piment Francais est la prem...,149,Maison Martin - Le Piment Francais is the firs...,food_beverages_tobacco
...,...,...,...,...,...,...
11463,Ljbautoparts,12,Vente de Pieces detachees Auto de Carrosserie ...,201,"Sale of auto body spare parts online: fender, ...",vehicles_transportation
11464,Aéroports de Paris,12,"Aeroports de Paris, avec ses trois plates-form...",478,"Aeroports de Paris, with its three platforms, ...",vehicles_transportation
11465,Online SAS,17,"Hebergement mutualise avec trafic illimite, no...",83,"Shared hosting with unlimited traffic, domain ...",vehicles_transportation
11466,shopequitation,12,Specialiste on-line de vente de materiel pour ...,269,Online specialist in the sale of horse riding ...,vehicles_transportation


In [105]:
# Collect the English descriptions into a plain Python list, one string per company.
corpus = company_df["description_en"].tolist()
corpus

['Le Fourgon delivers your stored drinks to your home: the order is placed on lefourgon.com: beers, juices, sodas, water, milk, wines, soups, spirits, & co. we deliver to your home free of charge in the chosen niche and on the next visit we collect your empty bottles which we return washed to the producer for reuse zerodechet',
 'Comptoir des Vignes is a brand of cellars specializing in wines, champagnes, spirits, specialty beers, teas, coffees and delicatessens. Our cellars are differentiated by an original and modern presentation of the products, but also on the basis of advice adapted to the new trends and consumption habits of our customers. Each Comptoir des Vignes cellar offers a clear and warm setting which allows you to discover the wines in all simplicity and indulgence with: Highlighting the pairings Provision of recipe cards Regular events in store Organization of tasting evenings With our 50 cellars in France , our mission is to satisfy all consumers whatever their needs an

11468 docs remain after dropping rows with missing values (the raw file has 12995 rows).

In [106]:
len(corpus)

11468

#### English Tokenizer

In [107]:
# Load NLTK's pre-trained English Punkt model; it is used below to split each
# description into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [108]:
text = corpus[3]

In [109]:
text

'Nutri-Naturel.com, the leading online organic grocery store since 2011! Created by 2 brothers passionate about nutrition and health, it offers you a large choice of organic, ethical, vegan, gluten-free superfoods and food supplements. Fast, quality service. Payment methodSecure payment (CB, Visa, Mastercard, Paypal, Check). DeliveryAt home or at a relay point. Free shipping for purchases over 60 (subject to conditions). Fast delivery within 48 hours. A question or a problem? Our customer service will answer you. Contact us at https://www.nutri-naturel.com'

In [110]:
tokenizer.tokenize(text)

['Nutri-Naturel.com, the leading online organic grocery store since 2011!',
 'Created by 2 brothers passionate about nutrition and health, it offers you a large choice of organic, ethical, vegan, gluten-free superfoods and food supplements.',
 'Fast, quality service.',
 'Payment methodSecure payment (CB, Visa, Mastercard, Paypal, Check).',
 'DeliveryAt home or at a relay point.',
 'Free shipping for purchases over 60 (subject to conditions).',
 'Fast delivery within 48 hours.',
 'A question or a problem?',
 'Our customer service will answer you.',
 'Contact us at https://www.nutri-naturel.com']

In [111]:
# Pipe 1: lower-case every description and split it into sentences.
# Result: one list of sentence strings per company.
pipe1 = [tokenizer.tokenize(description.lower()) for description in corpus]

In [112]:
pipe1

[['le fourgon delivers your stored drinks to your home: the order is placed on lefourgon.com: beers, juices, sodas, water, milk, wines, soups, spirits, & co. we deliver to your home free of charge in the chosen niche and on the next visit we collect your empty bottles which we return washed to the producer for reuse zerodechet'],
 ['comptoir des vignes is a brand of cellars specializing in wines, champagnes, spirits, specialty beers, teas, coffees and delicatessens.',
  'our cellars are differentiated by an original and modern presentation of the products, but also on the basis of advice adapted to the new trends and consumption habits of our customers.',
  'each comptoir des vignes cellar offers a clear and warm setting which allows you to discover the wines in all simplicity and indulgence with: highlighting the pairings provision of recipe cards regular events in store organization of tasting evenings with our 50 cellars in france , our mission is to satisfy all consumers whatever t

#### Tokenize

In [113]:
import spacy

In [114]:
! python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     --------------------------------------- 0.0/42.8 MB 640.0 kB/s eta 0:01:07
     ---------------------------------------- 0.1/42.8 MB 1.4 MB/s eta 0:00:30
     - -------------------------------------- 1.3/42.8 MB 10.6 MB/s eta 0:00:04
     --- ------------------------------------ 4.2/42.8 MB 24.6 MB/s eta 0:00:02
     --------- ------------------------------ 9.6/42.8 MB 44.0 MB/s eta 0:00:01
     ------------- ------------------------ 15.2/42.8 MB 129.5 MB/s eta 0:00:01
     ------------------ ------------------- 20.8/42.8 MB 131.2 MB/s eta 0:00:01
     ----------------------- -------------- 26.4/42.8 MB 131.2 MB/s eta 0:00:01
     ---------------------------- --------- 32.0/42.8 MB 131.2 MB/s eta 0:00:01
     ----------------------------

In [115]:
# Load spaCy's medium English pipeline (downloaded in the cell above); it is
# used below for word-level tokenisation.
nlp = spacy.load("en_core_web_md")

In [116]:
doc = nlp(pipe1[0][0])

In [117]:
test = [token.text for token in doc]
test

['le',
 'fourgon',
 'delivers',
 'your',
 'stored',
 'drinks',
 'to',
 'your',
 'home',
 ':',
 'the',
 'order',
 'is',
 'placed',
 'on',
 'lefourgon.com',
 ':',
 'beers',
 ',',
 'juices',
 ',',
 'sodas',
 ',',
 'water',
 ',',
 'milk',
 ',',
 'wines',
 ',',
 'soups',
 ',',
 'spirits',
 ',',
 '&',
 'co.',
 'we',
 'deliver',
 'to',
 'your',
 'home',
 'free',
 'of',
 'charge',
 'in',
 'the',
 'chosen',
 'niche',
 'and',
 'on',
 'the',
 'next',
 'visit',
 'we',
 'collect',
 'your',
 'empty',
 'bottles',
 'which',
 'we',
 'return',
 'washed',
 'to',
 'the',
 'producer',
 'for',
 'reuse',
 'zerodechet']

In [118]:
# Pipe 2: split every sentence of every company into word tokens with spaCy.
# Result: pipe2[company][sentence] is a list of token strings.
#
# Only the token texts are needed, so call `nlp.make_doc`, which runs the
# tokenizer alone, instead of `nlp(...)`, which would also run the tagger,
# parser and NER on every sentence. The tokens are the same, the run is much
# faster. `nlp` is the pipeline already loaded above, so it is not reloaded.
pipe2 = [
    [[token.text for token in nlp.make_doc(sentence)] for sentence in sentences]
    for sentences in pipe1
]

In [119]:
pipe2[0][0]

['le',
 'fourgon',
 'delivers',
 'your',
 'stored',
 'drinks',
 'to',
 'your',
 'home',
 ':',
 'the',
 'order',
 'is',
 'placed',
 'on',
 'lefourgon.com',
 ':',
 'beers',
 ',',
 'juices',
 ',',
 'sodas',
 ',',
 'water',
 ',',
 'milk',
 ',',
 'wines',
 ',',
 'soups',
 ',',
 'spirits',
 ',',
 '&',
 'co.',
 'we',
 'deliver',
 'to',
 'your',
 'home',
 'free',
 'of',
 'charge',
 'in',
 'the',
 'chosen',
 'niche',
 'and',
 'on',
 'the',
 'next',
 'visit',
 'we',
 'collect',
 'your',
 'empty',
 'bottles',
 'which',
 'we',
 'return',
 'washed',
 'to',
 'the',
 'producer',
 'for',
 'reuse',
 'zerodechet']

In [120]:
len(pipe2[0][0])

67

In [121]:
# with open("pipes/pipe2", "wb") as fp:
#     pickle.dump(pipe2, fp)

#### Pipe to remove punctuation

In [122]:
# from nltk.tokenize import RegexpTokenizer

In [123]:
# tokenizer_regex = RegexpTokenizer("[\w]+")

In [124]:
# test = tokenizer_regex.tokenize(tokenized_corpus[0][0])
# test

#### Remove punctuation from list

In [125]:
import string 

In [126]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [127]:
# Drop tokens made up solely of punctuation characters.
# `token not in string.punctuation` would be a *substring* test against the
# punctuation string, so tokens such as "..." or "--" would slip through.
# Checking each character against a set removes those too.
punct_chars = set(string.punctuation)
test = [token for token in test if not all(ch in punct_chars for ch in token)]

In [128]:
test

['le',
 'fourgon',
 'delivers',
 'your',
 'stored',
 'drinks',
 'to',
 'your',
 'home',
 'the',
 'order',
 'is',
 'placed',
 'on',
 'lefourgon.com',
 'beers',
 'juices',
 'sodas',
 'water',
 'milk',
 'wines',
 'soups',
 'spirits',
 'co.',
 'we',
 'deliver',
 'to',
 'your',
 'home',
 'free',
 'of',
 'charge',
 'in',
 'the',
 'chosen',
 'niche',
 'and',
 'on',
 'the',
 'next',
 'visit',
 'we',
 'collect',
 'your',
 'empty',
 'bottles',
 'which',
 'we',
 'return',
 'washed',
 'to',
 'the',
 'producer',
 'for',
 'reuse',
 'zerodechet']

In [129]:
# Pipe 3: remove punctuation-only tokens from every sentence of every company.
# Result keeps the pipe2 nesting: pipe3[company][sentence] -> list of tokens.
#
# A token is dropped when *all* of its characters are punctuation. Testing
# `token in string.punctuation` instead would be a substring test against the
# punctuation string, which lets multi-character tokens such as "..." or "--"
# through.
punct_chars = set(string.punctuation)
pipe3 = [
    [
        [token for token in sentence if not all(ch in punct_chars for ch in token)]
        for sentence in company
    ]
    for company in pipe2
]

In [130]:
pipe3[0][0]

['le',
 'fourgon',
 'delivers',
 'your',
 'stored',
 'drinks',
 'to',
 'your',
 'home',
 'the',
 'order',
 'is',
 'placed',
 'on',
 'lefourgon.com',
 'beers',
 'juices',
 'sodas',
 'water',
 'milk',
 'wines',
 'soups',
 'spirits',
 'co.',
 'we',
 'deliver',
 'to',
 'your',
 'home',
 'free',
 'of',
 'charge',
 'in',
 'the',
 'chosen',
 'niche',
 'and',
 'on',
 'the',
 'next',
 'visit',
 'we',
 'collect',
 'your',
 'empty',
 'bottles',
 'which',
 'we',
 'return',
 'washed',
 'to',
 'the',
 'producer',
 'for',
 'reuse',
 'zerodechet']

In [131]:
len(pipe3[0][0])

56

In [132]:
# with open("pipes/pipe3", "wb") as fp:
#     pickle.dump(pipe3, fp)

#### Remove Stop Words

In [133]:
from nltk.corpus import stopwords

In [134]:
stops = set(stopwords.words("english"))

In [135]:
[word for word in test if word not in stops]

['le',
 'fourgon',
 'delivers',
 'stored',
 'drinks',
 'home',
 'order',
 'placed',
 'lefourgon.com',
 'beers',
 'juices',
 'sodas',
 'water',
 'milk',
 'wines',
 'soups',
 'spirits',
 'co.',
 'deliver',
 'home',
 'free',
 'charge',
 'chosen',
 'niche',
 'next',
 'visit',
 'collect',
 'empty',
 'bottles',
 'return',
 'washed',
 'producer',
 'reuse',
 'zerodechet']

In [136]:
# Pipe 4: filter NLTK's English stop words out of every sentence.
# Result keeps the nesting: pipe4[company][sentence] -> list of tokens.
stops = set(stopwords.words("english"))
pipe4 = [
    [[word for word in sentence if word not in stops] for sentence in company]
    for company in pipe3
]

In [137]:
len(pipe4[0][0])

34

In [138]:
# with open("pipes/pipe4", "wb") as fp:
#     pickle.dump(pipe4, fp)

In [139]:
# Reuse the cached stop-word-filtered tokens when the cache file exists.
# Otherwise write the `pipe4` computed above to the cache, so that a fresh
# "Restart & Run All" works without a pre-existing pickle.
# Delete "pipes/pipe4" to force a refresh after changing an earlier pipe.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that this notebook produced.
try:
    with open("pipes/pipe4", "rb") as fp:   # Unpickling
        pipe4 = pickle.load(fp)
except FileNotFoundError:
    # No cache yet (the "pipes" directory itself must already exist).
    with open("pipes/pipe4", "wb") as fp:
        pickle.dump(pipe4, fp)

### Concat

In [140]:
# Pipe 5: flatten each company's sentences back into a single
# space-separated string of the remaining tokens.
pipe5 = [
    " ".join(token for sentence in company for token in sentence)
    for company in pipe4
]

In [141]:
pipe5[0]

'le fourgon delivers stored drinks home order placed lefourgon.com beers juices sodas water milk wines soups spirits co. deliver home free charge chosen niche next visit collect empty bottles return washed producer reuse zerodechet'

In [142]:
pipe5[1]

'comptoir des vignes brand cellars specializing wines champagnes spirits specialty beers teas coffees delicatessens cellars differentiated original modern presentation products also basis advice adapted new trends consumption habits customers comptoir des vignes cellar offers clear warm setting allows discover wines simplicity indulgence highlighting pairings provision recipe cards regular events store organization tasting evenings 50 cellars france mission satisfy consumers whatever needs desires wide range products services good value money also thanks passion wine merchants'

### Lemmatize words

In [143]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own (as a one-element list) with
    ``nltk.pos_tag``; only the first letter of the resulting tag is used.
    Tags starting with J, N, V or R map to adjective, noun, verb and adverb
    respectively; anything else falls back to noun.
    """
    penn_tag = nltk.pos_tag([word])[0][1]
    initial = penn_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to noun.
    return wordnet.NOUN

In [144]:
from nltk.stem.wordnet import WordNetLemmatizer

In [145]:
# Pipe 6: re-tokenise each flattened description and lemmatise every word,
# using the POS guessed by get_wordnet_pos. Result: one list of lemmas per
# company.
#
# get_wordnet_pos tags each word in isolation, so the lemma depends only on
# the word itself. Caching it per distinct word avoids running the POS tagger
# once per occurrence across the whole corpus; the output is unchanged.
lemma = WordNetLemmatizer()
lemma_by_word = {}
pipe6 = []
for company in pipe5:
    lemmas = []
    for w in nltk.word_tokenize(company):
        if w not in lemma_by_word:
            lemma_by_word[w] = lemma.lemmatize(w, get_wordnet_pos(w))
        lemmas.append(lemma_by_word[w])
    pipe6.append(lemmas)

In [146]:
pipe6[0]

['le',
 'fourgon',
 'delivers',
 'store',
 'drink',
 'home',
 'order',
 'place',
 'lefourgon.com',
 'beer',
 'juice',
 'soda',
 'water',
 'milk',
 'wine',
 'soup',
 'spirit',
 'co.',
 'deliver',
 'home',
 'free',
 'charge',
 'chosen',
 'niche',
 'next',
 'visit',
 'collect',
 'empty',
 'bottle',
 'return',
 'wash',
 'producer',
 'reuse',
 'zerodechet']

#### Remove words of 3 characters or fewer

In [147]:
# Pipe 7: keep only the lemmas that are longer than 3 characters.
pipe7 = [[lemma_ for lemma_ in company if len(lemma_) > 3] for company in pipe6]

In [148]:
pipe7[0]

['fourgon',
 'delivers',
 'store',
 'drink',
 'home',
 'order',
 'place',
 'lefourgon.com',
 'beer',
 'juice',
 'soda',
 'water',
 'milk',
 'wine',
 'soup',
 'spirit',
 'deliver',
 'home',
 'free',
 'charge',
 'chosen',
 'niche',
 'next',
 'visit',
 'collect',
 'empty',
 'bottle',
 'return',
 'wash',
 'producer',
 'reuse',
 'zerodechet']

In [149]:
# with open("pipes/topic_modeling_pipe", "wb") as fp:
#     pickle.dump(pipe7, fp)

#### Most Frequent Words

In [150]:
# Reuse the cached, fully pre-processed token lists when the cache file
# exists. Otherwise write the `pipe7` computed above to the cache, so that a
# fresh "Restart & Run All" works without a pre-existing pickle.
# Delete "pipes/topic_modeling_pipe" to force a refresh after changing an
# earlier pipe.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that this notebook produced.
try:
    with open('pipes/topic_modeling_pipe', 'rb') as fp:
        pipe7 = pickle.load(fp)
except FileNotFoundError:
    # No cache yet (the "pipes" directory itself must already exist).
    with open('pipes/topic_modeling_pipe', 'wb') as fp:
        pickle.dump(pipe7, fp)

In [151]:
from collections import defaultdict  # For word frequency

We use the names of the companies to attribute their 10 most frequent words.

In [152]:
# Build {company name -> {word -> count}} from the cleaned token lists.
#
# Names and token lists are paired by position. `pipe7` may come from a
# pickle, so check that it still lines up with `company_df`; a length mismatch
# would silently attribute words to the wrong company.
company_names = company_df["company_name"].tolist()
if len(company_names) != len(pipe7):
    raise ValueError(
        f"company_df has {len(company_names)} rows but pipe7 has {len(pipe7)} entries; "
        "regenerate the cached pipe so they match"
    )

company_freq_dict = dict()
for company_name, words in zip(company_names, pipe7):
    freq_dict = defaultdict(int)

    for word in words:  # Count word frequency in this company's description
        freq_dict[word] += 1

    # The company name is the key and its frequency dictionary the value.
    # Companies that share a name overwrite each other; the last one wins.
    company_freq_dict[company_name] = freq_dict

In [155]:
# we want to print the dictionary of "Le Fourgon" description
company_freq_dict["Le Fourgon"]

defaultdict(int,
            {'fourgon': 1,
             'delivers': 1,
             'store': 1,
             'drink': 1,
             'home': 2,
             'order': 1,
             'place': 1,
             'lefourgon.com': 1,
             'beer': 1,
             'juice': 1,
             'soda': 1,
             'water': 1,
             'milk': 1,
             'wine': 1,
             'soup': 1,
             'spirit': 1,
             'deliver': 1,
             'free': 1,
             'charge': 1,
             'chosen': 1,
             'niche': 1,
             'next': 1,
             'visit': 1,
             'collect': 1,
             'empty': 1,
             'bottle': 1,
             'return': 1,
             'wash': 1,
             'producer': 1,
             'reuse': 1,
             'zerodechet': 1})

In [156]:
# now we diplay the 10 most frequent words for each company description
for name, freqs in company_freq_dict.items():
    # Rank this company's words by descending count; ties keep insertion order.
    ranked_words = sorted(freqs, key=freqs.get, reverse=True)
    print(name)
    print(ranked_words[:10])

Le Fourgon
['home', 'fourgon', 'delivers', 'store', 'drink', 'order', 'place', 'lefourgon.com', 'beer', 'juice']
Comptoir des Vignes
['cellar', 'wine', 'comptoir', 'vignes', 'product', 'also', 'brand', 'specialize', 'champagne', 'spirit']
Shin Sekai
['store', 'product', 'welcome', 'trustpilot', 'page', 'shin', 'sekai', 'online', 'figurine', 'goodie']
Nutri Naturel
['organic', 'free', 'fast', 'service', 'payment', 'nutri-naturel.com', 'lead', 'online', 'grocery', 'store']
Maison Martin - Le Piment Français
['france', 'maison', 'martin', 'piment', 'francais', 'first', 'brand', 'artisanal', 'sauce', 'cooked']
Belleville Brulerie - Paris
['coffee', 'belleville', 'brulerie', 'paris', 'quality', 'roaster', 'specialize', 'fresh', 'filter', 'bodum']
Spiruline des îles d'or
['spirulina', 'base', 'product']
La Tournée
['container', 'deposit', 'tournee', 'pass', 'neighborhood', 'several', 'time', 'week', 'electric', 'vehicle']
Greendogs Cbd 
['sale', 'premium', 'product', 'flower', 'liquid', 'res