In [1]:
ru_text = """
Привет, меня зовут Тушин Кирилл,
я родился 26.09.1997, закончил МФТИ.
Я пытаюсь освоить курс по анализу текста!
Тестовый телефон: 89876543210
Тестовая почта: myemail@email.com
Тестовая сумма денег: 123456$
😎 🔥 ✌️
"""

en_text = """
Hi, my name is Kirill Tushin,
I was born on 26.09.1997, graduated from MIPT.
I'm trying to master a course on text analysis!
Test phone: 89876543210
Test mail: myemail@email.com
Test amount of money: 123456$
😎  🔥  ✌️
"""

In [2]:
def clean_dataset(dataset):
    dataset = [string.strip() for string in dataset]
    dataset = [string for string in dataset if string]
    return dataset


with open('../../data/war_and_peace_ru.txt', 'r') as ru_dataset_file:
    ru_dataset = ru_dataset_file.readlines()
    ru_dataset = clean_dataset(ru_dataset)
    
with open('../../data/the_picture_of_dorian_gray.txt', 'r') as en_dataset_file:
    en_dataset = en_dataset_file.readlines()
    en_dataset = clean_dataset(en_dataset)


full_dataset = ru_dataset + en_dataset

# Word2Vec

In [3]:
import gensim

## Train

In [4]:
%%time

ru_list_of_tokens = [x.split() for x in ru_dataset]

ru_word2vec_model = gensim.models.Word2Vec(
    ru_list_of_tokens,  # Dataset
    vector_size=150,  # Embedding Dim
    window=10,  # Window for neighbors
    min_count=2,  # Threshold to add word into vocabulary
    epochs=10,  # Number of epochs to train model
)

CPU times: user 5.81 s, sys: 132 ms, total: 5.95 s
Wall time: 2.56 s


## Save

In [5]:
ru_word2vec_model.save('ru_word2vec_model')

## Load

In [6]:
ru_word2vec_model = gensim.models.Word2Vec.load('ru_word2vec_model')

## Usage

### Word Embedding

In [7]:
ru_word2vec_model.wv['мама']

array([ 3.02021001e-02, -4.82140528e-03, -4.13925340e-03,  2.70679686e-02,
       -1.66919397e-03,  6.85095508e-03, -3.90300207e-04,  1.26281232e-01,
       -3.41660739e-03, -1.91493668e-02,  4.54541035e-02,  2.42535863e-02,
       -1.18106321e-01,  3.95500734e-02, -6.37374967e-02,  7.38273701e-03,
        7.12082982e-02,  6.91025984e-04, -6.13153772e-03,  7.93697312e-02,
       -5.67051657e-02,  2.69916747e-02,  8.44411850e-02,  5.57372943e-02,
        1.61672309e-02, -6.01726063e-02, -1.41567560e-02, -2.79626828e-02,
       -1.33444369e-02, -1.04471810e-01, -6.00705855e-02,  1.29980799e-02,
       -4.11855849e-03, -1.00049619e-02, -2.36125216e-02,  2.51449738e-02,
        8.32455456e-02, -1.26472441e-02,  3.27564031e-02, -9.10971165e-02,
        5.49898110e-03,  4.48284075e-02, -7.41146803e-02, -4.46553938e-02,
        5.62046580e-02,  3.01241092e-02, -6.19156398e-02, -4.06440385e-02,
       -9.26182605e-03,  8.85270536e-02,  1.31184179e-02,  2.34710239e-02,
       -1.69672184e-02,  

In [8]:
ru_word2vec_model.wv['dbfhsijokpal[;dksmfjnbhdekopfd]']

KeyError: "Key 'dbfhsijokpal[;dksmfjnbhdekopfd]' not present"

### Similarity

In [9]:
ru_word2vec_model.wv.most_similar(
    positive=['мама'],
    negative=['папа'],
)

[('горизонт', 0.938174843788147),
 ('адъютантском', 0.9327242374420166),
 ('орденах,', 0.9326286911964417),
 ('стены.', 0.9288221597671509),
 ('портрет', 0.9256203770637512),
 ('отоманке', 0.9229714274406433),
 ('поражал', 0.9194163084030151),
 ('пятна,', 0.9162595868110657),
 ('главнокомандующих', 0.9152465462684631),
 ('глядят', 0.910675048828125)]

In [10]:
ru_word2vec_model.wv.similarity(
    w1='мама',
    w2='папа',
)

0.76599634

In [11]:
ru_word2vec_model.wv.similarity(
    w1='богатый',
    w2='князь',
)

0.32267654

### Word Match

In [12]:
ru_word2vec_model.wv.doesnt_match(['люди', 'закон', 'крестьяне', 'земля', 'князь'])

'князь'

### Word Movement Distance

In [13]:
ru_word2vec_model.wv.wmdistance(['крестьяне', 'земля'], ['князь', 'богатсвтво'])

1.2986247539520264

In [14]:
ru_word2vec_model.wv.wmdistance(['крестьяне', 'земля'], ['обед', 'ужин'])

0.5155532816905976

In [15]:
ru_word2vec_model.wv.wmdistance(['крестьяне', 'земля'], ['бедность', 'нищета'])

inf

# FastText

## Train

In [16]:
%%time

ru_list_of_tokens = [x.split() for x in ru_dataset]

ru_fasttext_model = gensim.models.FastText(
    ru_list_of_tokens,  # Dataset
    vector_size=150,  # Embedding Dim
    window=10,  # Window for neighbors
    min_count=2,  # Threshold to add word into vocabulary
    epochs=10,  # Number of epochs to train model
)

CPU times: user 42.9 s, sys: 603 ms, total: 43.5 s
Wall time: 18 s


## Save

In [17]:
ru_fasttext_model.save('ru_fasttext_model')

## Load

In [18]:
ru_fasttext_model = gensim.models.Word2Vec.load('ru_fasttext_model')

## Usage

### Word Embedding

In [19]:
ru_fasttext_model.wv['мама']

array([ 0.31411287,  0.23189479, -0.05290234,  0.19873315, -0.04629235,
        0.37327373,  0.4277613 ,  0.23273039, -0.30199024, -0.38448507,
        0.17578235, -0.10291545, -0.05303003,  0.38595143,  0.09630149,
       -0.07306317, -0.19369277, -0.2989907 ,  0.28131506,  0.17944105,
        0.14875956,  0.01910911,  0.44533542,  0.49348515,  0.18903035,
        0.17017512, -0.14187588, -0.13279122,  0.14810525, -0.1644329 ,
       -0.5556809 , -0.33517686,  0.16557693, -0.15214342, -0.32673952,
        0.17141977,  0.27005282, -0.29150188,  0.03150325, -0.22757028,
       -0.08539229,  0.28600773, -0.08683836, -0.39011535,  0.05625081,
        0.14162959,  0.1188933 ,  0.37384233, -0.36189452,  0.00820126,
        0.30949584,  0.27978274, -0.07966378, -0.38477677, -0.02372113,
       -0.09463379, -0.20705141,  0.10458355, -0.19246587,  0.10763201,
       -0.01599042, -0.28068706,  0.00595285, -0.2605104 , -0.06117827,
       -0.05936556, -0.2937391 ,  0.12864803,  0.16759634, -0.10

In [20]:
ru_fasttext_model.wv['dbfhsijokpal[;dksmfjnbhdekopfd]']

array([ 3.85075546e-04,  3.01149837e-03,  3.17327795e-05,  1.94420456e-03,
        6.97697862e-04,  2.02145963e-03,  3.80287832e-03,  2.83198408e-03,
       -1.47738191e-03, -4.02460713e-03, -1.22581038e-03, -7.63160133e-05,
       -1.38173962e-03,  4.10432462e-03,  2.21307483e-03,  4.59163624e-04,
       -2.73794495e-03, -4.64609964e-03,  1.93261309e-03,  6.83820341e-04,
        3.44210188e-04, -1.85020291e-03,  2.68869265e-03,  3.96884000e-03,
        3.01748211e-03,  3.63865797e-03, -2.20789062e-03, -2.29436625e-03,
        2.51563103e-03, -1.72434840e-03, -4.36912570e-03, -3.40993004e-03,
        1.08170370e-03, -1.37592526e-03, -4.18728497e-03,  1.00846600e-03,
        1.63495087e-03, -2.74242461e-03, -1.42862977e-04, -2.79138658e-05,
        1.67191285e-03,  2.49654101e-03, -3.64220061e-04, -3.33231804e-03,
       -9.92840389e-04,  1.12830254e-03,  1.31829269e-03,  5.62148576e-04,
       -3.08948336e-03,  1.09364756e-03,  1.19569537e-03,  1.10883696e-03,
        2.80107102e-08, -

### Similarity

In [21]:
ru_fasttext_model.wv.most_similar(
    positive=['мама'],
    negative=['папа'],
)

[('Сказала', 0.7283618450164795),
 ('сказала', 0.6995818018913269),
 ('указала', 0.6820538640022278),
 ('выказала', 0.6556598544120789),
 ('отказала', 0.6369907259941101),
 ('сказала,', 0.6306861042976379),
 ('сказала…', 0.630251944065094),
 ('сказала:', 0.6161987781524658),
 ('мила', 0.6053548455238342),
 ('думала', 0.6015428900718689)]

In [22]:
ru_fasttext_model.wv.similarity(
    w1='мама',
    w2='папа',
)

0.9872185

In [23]:
ru_fasttext_model.wv.similarity(
    w1='богатый',
    w2='князь',
)

0.21144028

### Word Match

In [24]:
ru_fasttext_model.wv.doesnt_match(['люди', 'закон', 'крестьяне', 'земля', 'князь'])

'князь'

### Word Movement Distance

In [25]:
ru_fasttext_model.wv.wmdistance(['крестьяне', 'земля'], ['князь', 'богатсвтво'])

0.7998454934310913

In [26]:
ru_word2vec_model.wv.wmdistance(['крестьяне', 'земля'], ['обед', 'ужин'])

0.5155532816905976

In [27]:
ru_fasttext_model.wv.wmdistance(['крестьяне', 'земля'], ['бедность', 'нищета'])

0.45154384180575613

# Facebook FastText

In [28]:
import fasttext

## Train

In [29]:
%%time

ru_fasttext_facebook_model = fasttext.train_unsupervised(
    '../../data/war_and_peace_ru.txt',  # Dataset
    dim=150,  # Embedding Dim
    minn=3,  # Min len of subword
    maxn=6,  # Max len of subword
    epoch=10,  # Number of epochs to train model
    lr=0.1,  # Learning rate to train model
)

bad_model = fasttext.train_unsupervised(
    '../../data/war_and_peace_ru.txt',  # Dataset
    dim=150,  # Embedding Dim
    minn=0,  # Min len of subword
    maxn=0,  # Max len of subword
    epoch=10,  # Number of epochs to train model
    lr=0.1,  # Learning rate to train model
)

CPU times: user 1min 24s, sys: 1.47 s, total: 1min 26s
Wall time: 17.3 s


## Save

In [30]:
ru_fasttext_facebook_model.save_model('ru_fasttext_facebook_model')

## Load

In [31]:
ru_fasttext_facebook_model = fasttext.load_model("ru_fasttext_facebook_model")



## Usage

### Word Embedding

In [32]:
ru_fasttext_facebook_model['мама']

array([-0.22573905,  0.09062721,  0.20082895,  0.09735801,  0.14006566,
        0.16550614, -0.2806072 , -0.18519297,  0.3628306 , -0.32520467,
        0.20114017, -0.58898336, -0.18697718, -0.3127598 ,  0.03013002,
       -0.03638166,  0.10704143, -0.66770947,  0.4702613 , -0.3393418 ,
        0.5046367 , -0.5005997 , -0.1508333 ,  0.05985669,  0.01231881,
        0.11452363,  0.01655716,  0.15336166,  0.24988034,  0.18887183,
        0.24877396, -0.12596226, -0.34023768,  0.38373804, -0.20339887,
       -0.2773579 ,  0.04374941, -0.01963592,  0.05784079,  0.39425132,
        0.19168611,  0.13892962, -0.40764856,  0.25078484, -0.03839865,
       -0.01385576,  0.05467789, -0.04954016,  0.29946375,  0.05662804,
       -0.38291392, -0.12723361,  0.29473165, -0.21876808,  0.43953773,
       -0.07155332, -0.49646017, -0.16473792,  0.10973284, -0.00479851,
        0.19497405,  0.36801225, -0.30600533, -0.3598574 , -0.02709599,
        0.10158551,  0.06304318, -0.12041765, -0.3047463 ,  0.16

In [33]:
ru_fasttext_facebook_model['hgvbjknlm;,']

array([ 4.31367988e-03, -7.19424756e-03,  5.30151697e-03, -8.55950173e-03,
       -1.21310661e-02,  1.13120554e-02, -7.33871525e-03,  1.42786456e-02,
       -1.18549131e-02, -9.22034960e-03, -4.79134358e-03, -2.26404183e-02,
       -2.37308815e-03, -6.00447413e-03, -9.40568466e-03, -2.48440308e-03,
        4.70446364e-04, -7.96907675e-03,  8.24849214e-03,  4.03395807e-03,
       -7.85911363e-03, -1.04601877e-02,  1.22022964e-02,  5.90315554e-04,
       -3.58045613e-03, -4.60239453e-03, -8.30739643e-03, -1.64318120e-03,
       -3.64115136e-03,  1.19967363e-03, -6.33411482e-03,  1.90849509e-03,
       -5.89067675e-03,  1.46455815e-04,  1.48940701e-02,  3.27771937e-04,
        5.49447443e-03,  1.36960447e-02, -8.25620897e-04,  7.01746019e-03,
        8.52918369e-04,  8.64018686e-03,  1.65678859e-02,  2.19676625e-02,
        1.32994354e-02, -6.21814514e-03, -1.16556166e-02, -9.80300177e-03,
       -3.77743854e-03, -4.33428539e-03,  5.01327543e-03, -7.36226840e-03,
        1.24999741e-02, -

In [34]:
bad_model['hgvbjknlm;,']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)

### Similarity

In [35]:
bad_model.get_nearest_neighbors('люди')

[(0.46329864859580994, 'молодые'),
 (0.44947105646133423, 'Люди'),
 (0.4274621903896332, 'какому'),
 (0.42561104893684387, 'люди,'),
 (0.38691502809524536, 'многие'),
 (0.382607102394104, 'отыскивать'),
 (0.3809645175933838, 'делаются'),
 (0.37505224347114563, 'зло'),
 (0.3722766935825348, 'обществу,'),
 (0.36668944358825684, 'крыльце')]

In [36]:
ru_fasttext_facebook_model.get_nearest_neighbors('люди')

[(0.7033798098564148, 'Люди'),
 (0.6967329382896423, 'люди,'),
 (0.41316723823547363, 'молодые'),
 (0.40490251779556274, 'людям'),
 (0.39096182584762573, 'члены'),
 (0.3801904618740082, 'любят'),
 (0.3751494884490967, 'нахмурившись'),
 (0.3729250133037567, 'делаются'),
 (0.36756178736686707, 'редко'),
 (0.3623075485229492, 'сидели,')]

### Word Analogies

In [37]:
# wordA - wordB + wordC

ru_fasttext_facebook_model.get_analogies('мама', 'папа',  'муж')

[(0.5121732950210571, 'мужем'),
 (0.46543267369270325, 'мужем,'),
 (0.45562535524368286, 'мужа,'),
 (0.43418294191360474, 'мужу.'),
 (0.4292517602443695, 'жена'),
 (0.42545434832572937, 'мужа.'),
 (0.41467392444610596, 'репутацию'),
 (0.4101126492023468, 'дочери.'),
 (0.39722657203674316, 'ангела'),
 (0.3921177387237549, 'дама')]

### Sentence Vector

In [38]:
# An important thing!
ru_text_with_replaced_back_slash_n = ru_text.replace('\n', ' ')

ru_fasttext_facebook_model.get_sentence_vector(ru_text_with_replaced_back_slash_n)

array([ 6.0718916e-03,  2.0929972e-02, -1.6163753e-02,  5.0170563e-02,
       -5.0120942e-02,  2.2582600e-02,  3.8741976e-03,  1.7326303e-02,
        1.1078587e-02, -8.8962656e-04,  6.7996375e-02,  3.9925281e-02,
        3.1967115e-02, -4.5510503e-03,  6.1684061e-02,  2.1710295e-02,
        5.4149579e-02, -3.6655027e-02,  1.6128417e-02, -2.2948069e-02,
        4.2405922e-02,  1.7332172e-02, -7.0304431e-02,  6.2627224e-03,
        3.8002287e-03, -4.3780845e-02, -7.2021499e-02,  1.5430346e-02,
       -1.0349106e-02,  1.1373614e-02, -2.4150473e-03, -1.6092256e-02,
        1.3575211e-02,  4.6125680e-02,  8.0780834e-03, -5.5410601e-02,
       -4.4158030e-02,  1.3147298e-02,  2.8396728e-02,  5.8240477e-02,
        1.8601881e-02,  2.2354633e-02, -1.3716841e-02, -2.4135681e-02,
        1.9760614e-02,  2.5483822e-02,  2.0223777e-03,  4.1687083e-03,
       -3.4163080e-03,  1.7220153e-02, -2.9991642e-02,  1.1045831e-03,
        1.7747594e-04,  4.3186855e-02, -1.7950542e-02, -4.2334065e-02,
      

## Text Classification

In [39]:
## Task -> Long/Short text?

ru_fasttext_facebook_supervised_model = fasttext.train_supervised(
    '../../data/war_and_peace_ru_supervised.txt',  # Dataset
    dim=150,  # Embedding Dim
    minn=3,  # Min len of subword
    maxn=6,  # Max len of subword
    epoch=10,  # Number of epochs to train model
    lr=0.1,  # Learning rate to train model
)

In [40]:
ru_fasttext_facebook_supervised_model.predict('привет, как дела?')

(('__label__short',), array([0.96955568]))

In [41]:
ru_fasttext_facebook_supervised_model.predict(ru_text_with_replaced_back_slash_n)

(('__label__long',), array([0.70904726]))