In [67]:
import nltk
import pandas as pd
import pickle

## Most frequent words and N-grams in company descriptions

### Data Processing Pipeline

In [16]:
# Load the company descriptions; the CSV file is semicolon-delimited.
company_df = pd.read_csv("data/company_desc_translated.csv", sep=";")

In [17]:
company_df

Unnamed: 0,company_name,score,description,description_len,description_en,category
0,Le Fourgon,49,Le Fourgon vous livre vos boissons consignees ...,342,Le Fourgon delivers your stored drinks to your...,food_beverages_tobacco
1,Comptoir des Vignes,49,Comptoir des Vignes est une enseigne de caves ...,877,Comptoir des Vignes is a brand of cellars spec...,food_beverages_tobacco
2,Shin Sekai,49,Bienvenue sur notre page Trustpilot ! Shin Sek...,391,Welcome to our Trustpilot page! Shin Sekai is ...,food_beverages_tobacco
3,Nutri Naturel,49,"Nutri-Naturel.com, l'epicerie bio en ligne de ...",587,"Nutri-Naturel.com, the leading online organic ...",food_beverages_tobacco
4,Maison Martin - Le Piment Français,49,Maison Martin - Le Piment Francais est la prem...,149,Maison Martin - Le Piment Francais is the firs...,food_beverages_tobacco
...,...,...,...,...,...,...
12991,Ljbautoparts,12,Vente de Pieces detachees Auto de Carrosserie ...,201,"Sale of auto body spare parts online: fender, ...",vehicles_transportation
12992,Aéroports de Paris,12,"Aeroports de Paris, avec ses trois plates-form...",478,"Aeroports de Paris, with its three platforms, ...",vehicles_transportation
12993,Online SAS,17,"Hebergement mutualise avec trafic illimite, no...",83,"Shared hosting with unlimited traffic, domain ...",vehicles_transportation
12994,shopequitation,12,Specialiste on-line de vente de materiel pour ...,269,Online specialist in the sale of horse riding ...,vehicles_transportation


In [18]:
company_df.isna().sum()

company_name          0
score                 0
description        1523
description_len       0
description_en     1528
category              0
dtype: int64

In [19]:
# Discard every row that has a missing value in any column, then renumber
# the remaining rows 0..n-1 so positional and label indexing agree.
company_df = company_df.dropna().reset_index(drop=True)

In [20]:
company_df

Unnamed: 0,company_name,score,description,description_len,description_en,category
0,Le Fourgon,49,Le Fourgon vous livre vos boissons consignees ...,342,Le Fourgon delivers your stored drinks to your...,food_beverages_tobacco
1,Comptoir des Vignes,49,Comptoir des Vignes est une enseigne de caves ...,877,Comptoir des Vignes is a brand of cellars spec...,food_beverages_tobacco
2,Shin Sekai,49,Bienvenue sur notre page Trustpilot ! Shin Sek...,391,Welcome to our Trustpilot page! Shin Sekai is ...,food_beverages_tobacco
3,Nutri Naturel,49,"Nutri-Naturel.com, l'epicerie bio en ligne de ...",587,"Nutri-Naturel.com, the leading online organic ...",food_beverages_tobacco
4,Maison Martin - Le Piment Français,49,Maison Martin - Le Piment Francais est la prem...,149,Maison Martin - Le Piment Francais is the firs...,food_beverages_tobacco
...,...,...,...,...,...,...
11463,Ljbautoparts,12,Vente de Pieces detachees Auto de Carrosserie ...,201,"Sale of auto body spare parts online: fender, ...",vehicles_transportation
11464,Aéroports de Paris,12,"Aeroports de Paris, avec ses trois plates-form...",478,"Aeroports de Paris, with its three platforms, ...",vehicles_transportation
11465,Online SAS,17,"Hebergement mutualise avec trafic illimite, no...",83,"Shared hosting with unlimited traffic, domain ...",vehicles_transportation
11466,shopequitation,12,Specialiste on-line de vente de materiel pour ...,269,Online specialist in the sale of horse riding ...,vehicles_transportation


In [21]:
# Pull the English descriptions out of the frame as a plain Python list.
corpus = company_df["description_en"].tolist()
corpus

['Le Fourgon delivers your stored drinks to your home: the order is placed on lefourgon.com: beers, juices, sodas, water, milk, wines, soups, spirits, & co. we deliver to your home free of charge in the chosen niche and on the next visit we collect your empty bottles which we return washed to the producer for reuse zerodechet',
 'Comptoir des Vignes is a brand of cellars specializing in wines, champagnes, spirits, specialty beers, teas, coffees and delicatessens. Our cellars are differentiated by an original and modern presentation of the products, but also on the basis of advice adapted to the new trends and consumption habits of our customers. Each Comptoir des Vignes cellar offers a clear and warm setting which allows you to discover the wines in all simplicity and indulgence with: Highlighting the pairings Provision of recipe cards Regular events in store Organization of tasting evenings With our 50 cellars in France , our mission is to satisfy all consumers whatever their needs an

12996 Docs

In [22]:
len(corpus)

11468

#### English Tokenizer

In [23]:
# Load NLTK's pre-trained English Punkt model; its `tokenize` method splits a
# text into a list of sentences.
# NOTE(review): this needs the pickled "punkt" resource to be installed
# (e.g. via nltk.download("punkt")) — confirm it is available for the NLTK
# version in use.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [30]:
text = corpus[3]

In [32]:
text

'Nutri-Naturel.com, the leading online organic grocery store since 2011! Created by 2 brothers passionate about nutrition and health, it offers you a large choice of organic, ethical, vegan, gluten-free superfoods and food supplements. Fast, quality service. Payment methodSecure payment (CB, Visa, Mastercard, Paypal, Check). DeliveryAt home or at a relay point. Free shipping for purchases over 60 (subject to conditions). Fast delivery within 48 hours. A question or a problem? Our customer service will answer you. Contact us at https://www.nutri-naturel.com'

In [31]:
tokenizer.tokenize(text)

['Nutri-Naturel.com, the leading online organic grocery store since 2011!',
 'Created by 2 brothers passionate about nutrition and health, it offers you a large choice of organic, ethical, vegan, gluten-free superfoods and food supplements.',
 'Fast, quality service.',
 'Payment methodSecure payment (CB, Visa, Mastercard, Paypal, Check).',
 'DeliveryAt home or at a relay point.',
 'Free shipping for purchases over 60 (subject to conditions).',
 'Fast delivery within 48 hours.',
 'A question or a problem?',
 'Our customer service will answer you.',
 'Contact us at https://www.nutri-naturel.com']

In [91]:
# Stage 1: one list of lowercased sentences per company description.
# Sentences are split on the original-case text and lowercased afterwards:
# Punkt uses the capitalisation of the word following a period as evidence
# for a sentence boundary, so lowercasing first would discard that signal.
pipe1 = [
    [sentence.lower() for sentence in tokenizer.tokenize(desc)]
    for desc in corpus
]

In [92]:
pipe1

[['le fourgon delivers your stored drinks to your home: the order is placed on lefourgon.com: beers, juices, sodas, water, milk, wines, soups, spirits, & co. we deliver to your home free of charge in the chosen niche and on the next visit we collect your empty bottles which we return washed to the producer for reuse zerodechet'],
 ['comptoir des vignes is a brand of cellars specializing in wines, champagnes, spirits, specialty beers, teas, coffees and delicatessens.',
  'our cellars are differentiated by an original and modern presentation of the products, but also on the basis of advice adapted to the new trends and consumption habits of our customers.',
  'each comptoir des vignes cellar offers a clear and warm setting which allows you to discover the wines in all simplicity and indulgence with: highlighting the pairings provision of recipe cards regular events in store organization of tasting evenings with our 50 cellars in france , our mission is to satisfy all consumers whatever t

#### Tokenize

In [43]:
import spacy

In [46]:
# Install the model into the environment the kernel itself runs in (a bare
# `python` on the shell PATH may be a different interpreter), and only when
# it is not installed yet so that re-running the notebook stays cheap.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/42.8 MB 1.7 MB/s eta 0:00:26
      --------------------------------------- 0.6/42.8 MB 6.1 MB/s eta 0:00:07
     - -------------------------------------- 1.2/42.8 MB 9.3 MB/s eta 0:00:05
     - -------------------------------------- 1.5/42.8 MB 8.9 MB/s eta 0:00:05
     - -------------------------------------- 2.0/42.8 MB 9.2 MB/s eta 0:00:05
     -- ------------------------------------- 2.6/42.8 MB 9.6 MB/s eta 0:00:05
     -- ------------------------------------- 3.1/42.8 MB 9.9 MB/s eta 0:00:05
     --- ------------------------------------ 3.7/42.8 MB 10.2 MB/s eta 0:00:04
     --- ------------------------------------ 3.9/42.8 MB 10.0 MB/s eta 0:00:04
     ---- -----------------------------

In [47]:
nlp = spacy.load("en_core_web_md")

In [61]:
doc = nlp(pipe1[0][0])

In [62]:
test = [token.text for token in doc]
test

['Le',
 'Fourgon',
 'delivers',
 'your',
 'stored',
 'drinks',
 'to',
 'your',
 'home',
 ':',
 'the',
 'order',
 'is',
 'placed',
 'on',
 'lefourgon.com',
 ':',
 'beers',
 ',',
 'juices',
 ',',
 'sodas',
 ',',
 'water',
 ',',
 'milk',
 ',',
 'wines',
 ',',
 'soups',
 ',',
 'spirits',
 ',',
 '&',
 'co.',
 'we',
 'deliver',
 'to',
 'your',
 'home',
 'free',
 'of',
 'charge',
 'in',
 'the',
 'chosen',
 'niche',
 'and',
 'on',
 'the',
 'next',
 'visit',
 'we',
 'collect',
 'your',
 'empty',
 'bottles',
 'which',
 'we',
 'return',
 'washed',
 'to',
 'the',
 'producer',
 'for',
 'reuse',
 'zerodechet']

In [93]:
# Stage 2: split every sentence into word tokens, keeping the
# company -> sentence -> token nesting of pipe1.
# Only the token texts are needed, so just the tokenizer is run (batched via
# `pipe`) instead of the full pipeline; the resulting tokens are the same.
# `nlp` is already loaded earlier in the notebook, so it is not reloaded here.
pipe2 = [
    [[token.text for token in doc] for doc in nlp.tokenizer.pipe(sentences)]
    for sentences in pipe1
]

In [94]:
pipe2[0][0]

['le',
 'fourgon',
 'delivers',
 'your',
 'stored',
 'drinks',
 'to',
 'your',
 'home',
 ':',
 'the',
 'order',
 'is',
 'placed',
 'on',
 'lefourgon.com',
 ':',
 'beers',
 ',',
 'juices',
 ',',
 'sodas',
 ',',
 'water',
 ',',
 'milk',
 ',',
 'wines',
 ',',
 'soups',
 ',',
 'spirits',
 ',',
 '&',
 'co.',
 'we',
 'deliver',
 'to',
 'your',
 'home',
 'free',
 'of',
 'charge',
 'in',
 'the',
 'chosen',
 'niche',
 'and',
 'on',
 'the',
 'next',
 'visit',
 'we',
 'collect',
 'your',
 'empty',
 'bottles',
 'which',
 'we',
 'return',
 'washed',
 'to',
 'the',
 'producer',
 'for',
 'reuse',
 'zerodechet']

In [95]:
len(pipe2[0][0])

67

In [96]:
# with open("pipes/pipe2", "wb") as fp:
#     pickle.dump(pipe2, fp)

#### Pipe to remove punctuation

In [35]:
# from nltk.tokenize import RegexpTokenizer

In [36]:
# tokenizer_regex = RegexpTokenizer("[\w]+")

In [41]:
# test = tokenizer_regex.tokenize(tokenized_corpus[0][0])
# test

['Le',
 'Fourgon',
 'delivers',
 'your',
 'stored',
 'drinks',
 'to',
 'your',
 'home',
 'the',
 'order',
 'is',
 'placed',
 'on',
 'lefourgon',
 'com',
 'beers',
 'juices',
 'sodas',
 'water',
 'milk',
 'wines',
 'soups',
 'spirits',
 'co',
 'we',
 'deliver',
 'to',
 'your',
 'home',
 'free',
 'of',
 'charge',
 'in',
 'the',
 'chosen',
 'niche',
 'and',
 'on',
 'the',
 'next',
 'visit',
 'we',
 'collect',
 'your',
 'empty',
 'bottles',
 'which',
 'we',
 'return',
 'washed',
 'to',
 'the',
 'producer',
 'for',
 'reuse',
 'zerodechet']

#### Remove punctuation from list

In [51]:
import string 

In [52]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [53]:
# Keep only tokens that contain at least one non-punctuation character.
# `token.strip(string.punctuation)` is empty exactly when the token consists
# solely of punctuation; the previous `token not in string.punctuation` was a
# substring test, which let multi-character tokens such as "..." or "--" through.
test = [token for token in test if token.strip(string.punctuation)]

In [54]:
test

['Le',
 'Fourgon',
 'delivers',
 'your',
 'stored',
 'drinks',
 'to',
 'your',
 'home',
 'the',
 'order',
 'is',
 'placed',
 'on',
 'lefourgon.com',
 'beers',
 'juices',
 'sodas',
 'water',
 'milk',
 'wines',
 'soups',
 'spirits',
 'co.',
 'we',
 'deliver',
 'to',
 'your',
 'home',
 'free',
 'of',
 'charge',
 'in',
 'the',
 'chosen',
 'niche',
 'and',
 'on',
 'the',
 'next',
 'visit',
 'we',
 'collect',
 'your',
 'empty',
 'bottles',
 'which',
 'we',
 'return',
 'washed',
 'to',
 'the',
 'producer',
 'for',
 'reuse',
 'zerodechet']

In [97]:
# Stage 3: drop tokens made up solely of punctuation, keeping the
# company -> sentence -> token nesting of pipe2.
# `token.strip(string.punctuation)` is empty exactly when the token has no
# non-punctuation character, so multi-character tokens like "..." are removed
# as well (the substring test `token in string.punctuation` let them through).
pipe3 = [
    [
        [token for token in sentence if token.strip(string.punctuation)]
        for sentence in company
    ]
    for company in pipe2
]

In [98]:
pipe3[0][0]

['le',
 'fourgon',
 'delivers',
 'your',
 'stored',
 'drinks',
 'to',
 'your',
 'home',
 'the',
 'order',
 'is',
 'placed',
 'on',
 'lefourgon.com',
 'beers',
 'juices',
 'sodas',
 'water',
 'milk',
 'wines',
 'soups',
 'spirits',
 'co.',
 'we',
 'deliver',
 'to',
 'your',
 'home',
 'free',
 'of',
 'charge',
 'in',
 'the',
 'chosen',
 'niche',
 'and',
 'on',
 'the',
 'next',
 'visit',
 'we',
 'collect',
 'your',
 'empty',
 'bottles',
 'which',
 'we',
 'return',
 'washed',
 'to',
 'the',
 'producer',
 'for',
 'reuse',
 'zerodechet']

In [99]:
len(pipe3[0][0])

56

In [100]:
# with open("pipes/pipe3", "wb") as fp:
#     pickle.dump(pipe3, fp)

#### Remove Stop Words

In [55]:
from nltk.corpus import stopwords

In [56]:
stops = set(stopwords.words("english"))

In [57]:
[word for word in test if word not in stops]

['Le',
 'Fourgon',
 'delivers',
 'stored',
 'drinks',
 'home',
 'order',
 'placed',
 'lefourgon.com',
 'beers',
 'juices',
 'sodas',
 'water',
 'milk',
 'wines',
 'soups',
 'spirits',
 'co.',
 'deliver',
 'home',
 'free',
 'charge',
 'chosen',
 'niche',
 'next',
 'visit',
 'collect',
 'empty',
 'bottles',
 'return',
 'washed',
 'producer',
 'reuse',
 'zerodechet']

In [101]:
# NLTK's English stop-word list, held in a set for fast membership checks.
stops = set(stopwords.words("english"))

# Stage 4: filter stop words out of every sentence, keeping the
# company -> sentence -> token nesting of pipe3.
pipe4 = [
    [[word for word in sentence if word not in stops] for sentence in company]
    for company in pipe3
]

In [102]:
len(pipe4[0][0])

34

In [104]:
# with open("pipes/pipe4", "wb") as fp:
#     pickle.dump(pipe4, fp)

#### Bag of Words for company desc

In [None]:
BOW = []


#### Concatenate the token lists of each company description to count word occurrences

In [None]:
pipe5 = []


#### Most frequent words

In [105]:
from collections import defaultdict  # For word frequency