In [25]:
import pickle
import pandas as pd
import spacy
import re
from time import time
import multiprocessing

# Embedding model training

### Embed company descriptions

#### Import Data

In [None]:
# Load the previously pickled pipeline output.
# NOTE: pickle.load can execute arbitrary code; only unpickle files from a trusted source.
with open("pipes/pipe4", "rb") as pickle_file:
    pipe4 = pickle.load(pickle_file)

In [None]:
pipe4[0]

[['le',
  'fourgon',
  'delivers',
  'stored',
  'drinks',
  'home',
  'order',
  'placed',
  'lefourgon.com',
  'beers',
  'juices',
  'sodas',
  'water',
  'milk',
  'wines',
  'soups',
  'spirits',
  'co.',
  'deliver',
  'home',
  'free',
  'charge',
  'chosen',
  'niche',
  'next',
  'visit',
  'collect',
  'empty',
  'bottles',
  'return',
  'washed',
  'producer',
  'reuse',
  'zerodechet']]

In [3]:
# Read the ';'-separated export and keep only the company name and its English description.
company_df = pd.read_csv(
    "data/company_desc_translated.csv",
    sep=";",
)[["company_name", "description_en"]]

In [4]:
company_df

Unnamed: 0,company_name,description_en
0,Le Fourgon,Le Fourgon delivers your stored drinks to your...
1,Comptoir des Vignes,Comptoir des Vignes is a brand of cellars spec...
2,Shin Sekai,Welcome to our Trustpilot page! Shin Sekai is ...
3,Nutri Naturel,"Nutri-Naturel.com, the leading online organic ..."
4,Maison Martin - Le Piment Français,Maison Martin - Le Piment Francais is the firs...
...,...,...
12991,Ljbautoparts,"Sale of auto body spare parts online: fender, ..."
12992,Aéroports de Paris,"Aeroports de Paris, with its three platforms, ..."
12993,Online SAS,"Shared hosting with unlimited traffic, domain ..."
12994,shopequitation,Online specialist in the sale of horse riding ...


#### Data Cleaning

In [6]:
# English spaCy pipeline; supplies the lemmas and stop-word flags that cleaning() reads.
nlp = spacy.load("en_core_web_sm")

In [7]:
def cleaning(doc):
    """Lemmatize a document, dropping stopwords and very short tokens.

    Args:
        doc: Iterable of token objects exposing ``lemma_``, ``is_stop`` and
            ``text`` (a spaCy ``Doc`` in this notebook).

    Returns:
        The surviving lemmas joined by single spaces when more than two
        tokens remain, otherwise ``None``.
    """
    lemmas = []
    for token in doc:
        # Skip stopwords and tokens whose surface text is 1-2 characters long.
        if token.is_stop or len(token.text) <= 2:
            continue
        lemmas.append(token.lemma_)

    # One- or two-word results are treated as unusable.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [9]:
# Replace every run of characters other than ASCII letters and apostrophes with a
# single space, then lowercase. Non-string values are stringified first via str().
# Built as a list rather than a generator: a generator is exhausted after one pass,
# so re-running the cell that feeds this to nlp.pipe would silently process nothing.
brief_cleaning = [
    re.sub("[^A-Za-z']+", ' ', str(row)).lower()
    for row in company_df['description_en']
]

In [11]:
# Run the pre-cleaned strings through spaCy in batches and reduce each Doc with cleaning().
# The parser and NER components are skipped for this call only: cleaning() reads just
# lemma_, is_stop and text, so their annotations would be computed and then discarded.
txt = [
    cleaning(doc)
    for doc in nlp.pipe(brief_cleaning, batch_size=5000, disable=["parser", "ner"])
]

Time to clean up everything: 1.1 mins


In [12]:
# One row per cleaned description. cleaning() returns None for descriptions with too
# few tokens, so dropna() removes those rows before exact duplicates are dropped.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(11365, 1)

In [13]:
df_clean

Unnamed: 0,clean
0,fourgon deliver store drink home order place l...
1,comptoir des vigne brand cellar specialize win...
2,welcome trustpilot page shin sekai online figu...
3,nutri naturel com lead online organic grocery ...
4,maison martin piment francais brand artisanal ...
...,...
12991,sale auto body spare part online fender bumper...
12992,aeroport paris platform major connection point...
12993,share host unlimited traffic domain dedicated ...
12994,online specialist sale horse ride equipment sa...


In [14]:
df_clean.head(1).values

array([['fourgon deliver store drink home order place lefourgon com beers juice sodas water milk wine soup spirit deliver home free charge choose niche visit collect bottle return wash producer reuse zerodechet']],
      dtype=object)

In [15]:
from gensim.models.phrases import Phrases, Phraser

In [16]:
# Split each cleaned description on whitespace: one list of tokens per row.
sent = list(map(str.split, df_clean['clean']))

In [17]:
# Collect phrase (bigram) statistics over the tokenised descriptions.
phrases = Phrases(
    sent,
    min_count=30,
    progress_per=10000,
)

In [18]:
# Wrap the collected phrase statistics in a Phraser; this object is what gets applied to the corpus.
bigram = Phraser(phrases)

In [19]:
# Apply the bigram model to every tokenised description and materialise the result.
# bigram[sent] on its own is a lazy wrapper that re-runs phrase detection each time it
# is iterated; the corpus is iterated many times below (frequency count, vocabulary
# scan, every training epoch), so the transformation is done once here instead.
sentences = list(bigram[sent])

In [20]:
sentences[0]

['fourgon',
 'deliver',
 'store',
 'drink',
 'home',
 'order',
 'place',
 'lefourgon',
 'com',
 'beers',
 'juice',
 'sodas',
 'water',
 'milk',
 'wine',
 'soup',
 'spirit',
 'deliver_home',
 'free_charge',
 'choose',
 'niche',
 'visit',
 'collect',
 'bottle',
 'return',
 'wash',
 'producer',
 'reuse',
 'zerodechet']

In [22]:
from collections import defaultdict 

In [23]:
# Count how often each token (including merged bigrams such as 'free_charge') occurs.
# The loop variable is deliberately not called `sent`: that name holds the full
# tokenised corpus passed to `bigram[sent]`, and rebinding it here would leave a single
# sentence behind for any earlier cell that is re-run afterwards.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
len(word_freq)

26671

In [24]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['offer',
 'product',
 'service',
 'france',
 'find',
 'quality',
 'good',
 'brand',
 'online',
 'support']

In [30]:
# Number of CPUs reported by the system; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()
cores

16

#### Word2Vec

In [27]:
from gensim.models import FastText, Word2Vec

In [31]:
# Configure the model only; the vocabulary is built and training is run in later cells.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never request zero worker threads
                     # (a single-CPU machine would otherwise give workers=0).
                     workers=max(1, cores - 1))

In [32]:
# Build the model vocabulary from the phrase-merged corpus before training.
w2v_model.build_vocab(sentences, progress_per=10000)

Time to build vocab: 0.01 mins


In [33]:
# Train for 30 epochs; total_examples reuses the sentence count stored on the model.
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

Time to train the model: 0.23 mins


In [35]:
# Sanity check: print the nearest neighbours of "product" in the embedding space.
similar_words = w2v_model.wv.most_similar(positive=["product"])
for neighbour, score in similar_words:
    print(f"{neighbour}: {score}")

high_quality: 0.7049179673194885
competitive_price: 0.6737679839134216
parapharmacy: 0.6604810953140259
food_supplement: 0.6533199548721313
assortment: 0.6437912583351135
organic: 0.6400367617607117
range: 0.6325221657752991
aquarium: 0.6325218081474304
farming: 0.6221839785575867
frozen: 0.6185584664344788


#### FastText

#### GloVe