In [14]:
import pandas as pd
import re

In [5]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [6]:
company_df = pd.read_csv("data/company_desc_translated.csv", sep=";")[["company_name", "description_en", "category"]]

In [7]:
company_df

Unnamed: 0,company_name,description_en,category
0,Le Fourgon,Le Fourgon delivers your stored drinks to your...,food_beverages_tobacco
1,Comptoir des Vignes,Comptoir des Vignes is a brand of cellars spec...,food_beverages_tobacco
2,Shin Sekai,Welcome to our Trustpilot page! Shin Sekai is ...,food_beverages_tobacco
3,Nutri Naturel,"Nutri-Naturel.com, the leading online organic ...",food_beverages_tobacco
4,Maison Martin - Le Piment Français,Maison Martin - Le Piment Francais is the firs...,food_beverages_tobacco
...,...,...,...
12991,Ljbautoparts,"Sale of auto body spare parts online: fender, ...",vehicles_transportation
12992,Aéroports de Paris,"Aeroports de Paris, with its three platforms, ...",vehicles_transportation
12993,Online SAS,"Shared hosting with unlimited traffic, domain ...",vehicles_transportation
12994,shopequitation,Online specialist in the sale of horse riding ...,vehicles_transportation


In [9]:
company_df = company_df.dropna(ignore_index=False)

In [10]:
company_df.value_counts("category")

category
electronics_technology        1183
home_garden                   1108
shopping_fashion              1065
money_insurance               1034
events_entertainment           766
beauty_wellbeing               755
food_beverages_tobacco         740
construction_manufactoring     709
business_services              683
education_training             656
vehicles_transportation        616
hobbies_crafts                 416
travel_vacation                407
media_publishing               323
sports                         211
animals_pets                   200
home_services                  144
health_medical                 133
legal_services_government      132
public_local_services          114
restaurants_bars                65
utilities                        8
Name: count, dtype: int64

In [11]:
import spacy

In [12]:
nlp = spacy.load("en_core_web_sm") 

In [13]:
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop and len(token.text) > 2]

    if len(txt) > 2:
        return ' '.join(txt)

In [15]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in company_df['description_en'])

In [16]:
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

In [17]:
company_df["tokenized_desc"] = txt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  company_df["tokenized_desc"] = txt


In [19]:
company_df

Unnamed: 0,company_name,description_en,category,tokenized_desc
0,Le Fourgon,Le Fourgon delivers your stored drinks to your...,food_beverages_tobacco,fourgon deliver store drink home order place l...
1,Comptoir des Vignes,Comptoir des Vignes is a brand of cellars spec...,food_beverages_tobacco,comptoir des vigne brand cellar specialize win...
2,Shin Sekai,Welcome to our Trustpilot page! Shin Sekai is ...,food_beverages_tobacco,welcome trustpilot page shin sekai online figu...
3,Nutri Naturel,"Nutri-Naturel.com, the leading online organic ...",food_beverages_tobacco,nutri naturel com lead online organic grocery ...
4,Maison Martin - Le Piment Français,Maison Martin - Le Piment Francais is the firs...,food_beverages_tobacco,maison martin piment francais brand artisanal ...
...,...,...,...,...
12991,Ljbautoparts,"Sale of auto body spare parts online: fender, ...",vehicles_transportation,sale auto body spare part online fender bumper...
12992,Aéroports de Paris,"Aeroports de Paris, with its three platforms, ...",vehicles_transportation,aeroport paris platform major connection point...
12993,Online SAS,"Shared hosting with unlimited traffic, domain ...",vehicles_transportation,share host unlimited traffic domain dedicated ...
12994,shopequitation,Online specialist in the sale of horse riding ...,vehicles_transportation,online specialist sale horse ride equipment sa...


In [18]:
company_df.isna().sum()

company_name       0
description_en     0
category           0
tokenized_desc    78
dtype: int64

In [20]:
company_df = company_df.dropna(ignore_index=False)

In [25]:
company_df["tokenized_desc"].str.split()

0        [fourgon, deliver, store, drink, home, order, ...
1        [comptoir, des, vigne, brand, cellar, speciali...
2        [welcome, trustpilot, page, shin, sekai, onlin...
3        [nutri, naturel, com, lead, online, organic, g...
4        [maison, martin, piment, francais, brand, arti...
                               ...                        
12991    [sale, auto, body, spare, part, online, fender...
12992    [aeroport, paris, platform, major, connection,...
12993    [share, host, unlimited, traffic, domain, dedi...
12994    [online, specialist, sale, horse, ride, equipm...
12995    [private, shuttle, company, airport, train, st...
Name: tokenized_desc, Length: 11390, dtype: object

In [21]:
data = ["I love machine learning. Its awesome.",
        "I love coding in python",
        "I love building chatbots",
        "they chat amagingly well"]

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [22]:
tagged_data

[TaggedDocument(words=['i', 'love', 'machine', 'learning', '.', 'its', 'awesome', '.'], tags=['0']),
 TaggedDocument(words=['i', 'love', 'coding', 'in', 'python'], tags=['1']),
 TaggedDocument(words=['i', 'love', 'building', 'chatbots'], tags=['2']),
 TaggedDocument(words=['they', 'chat', 'amagingly', 'well'], tags=['3'])]

In [63]:
len(list(company_df["tokenized_desc"]))

11390

In [34]:
tagged_data = [TaggedDocument(words=word_tokenize(d), tags=[str(i)]) for i, d in enumerate(list(company_df["tokenized_desc"]))]

In [62]:
len(tagged_data)

11390

In [67]:
[n.tags[0] for n in tagged_data]

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

In [55]:
tagged_data.__getitem__(2).words

['welcome',
 'trustpilot',
 'page',
 'shin',
 'sekai',
 'online',
 'figurine',
 'goodie',
 'store',
 'offer',
 'official',
 'product',
 'manga',
 'anime',
 'video',
 'game',
 'film',
 'series',
 'find',
 'license',
 'product',
 'like',
 'naruto',
 'piece',
 'dragon',
 'ball',
 'tokyo',
 'revenger',
 'jujutsu',
 'kaisen',
 'hero',
 'academia',
 'attack',
 'titan',
 'figure',
 'pre',
 'orders',
 'shonen',
 'jump',
 'grocery',
 'store',
 'card']

In [37]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("models/d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [39]:
from gensim.models.doc2vec import Doc2Vec

model= Doc2Vec.load("models/d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("nails and makeup".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.dv.most_similar('1')
print(similar_doc)

V1_infer [ 0.01901679 -0.02326235 -0.01486488 -0.07622939  0.02903481 -0.02660699
 -0.04435295  0.06561533 -0.06553958  0.02299719 -0.01823861 -0.02713016
 -0.00800014  0.02855443 -0.04989432  0.12953009  0.15741493 -0.09153008
 -0.08370349 -0.04304367]
[('369', 0.8170744776725769), ('109', 0.7968077063560486), ('14', 0.767487645149231), ('17', 0.7408545613288879), ('10683', 0.7389125227928162), ('453', 0.7371709942817688), ('155', 0.7367475032806396), ('438', 0.7342226505279541), ('5562', 0.7311685681343079), ('77', 0.7275729179382324)]
[  8.648717    -0.1841928    0.89728725  -2.0544097  -13.923196
  -3.5926192    0.05554559   3.2426772    0.51039183  -2.373517
  -0.5430173   -2.4189804   -3.249773     9.267743    -5.3887153
   1.321316    -0.6006352    0.06045065  -1.9508549   -6.913337  ]


In [57]:
sims = model.dv.most_similar([v1])
sims[0][0]

'2469'

In [51]:
# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.dv['2469'])

[-3.6726642  -0.5523357  -5.6362147  -3.0152144  -1.697127   -0.6686374
 -8.458723    2.2264304  -5.142027    2.4961681   1.6965591  -2.029964
 -0.22389127  0.5296231  -3.954035    8.756813    8.198859   -7.137393
 -3.6503377  -4.5985374 ]


In [59]:
" ".join(tagged_data.__getitem__(int(sims[0][0])).words)

'online sale professional hairdressing beauty product hair face body find accessory utensil beauty care'