In [1]:
import re
import spacy
import gensim
import pandas as pd

In [2]:
# Google drive mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Загрузка данных

In [3]:
text_file = 'drive/MyDrive/Colab Notebooks/Internship/lesson_21/dataset.txt'
# read all lines at once
with open(text_file, 'r', encoding= 'unicode_escape') as f:
    lines = f.readlines()
lines[2100:2103]

["'But it's never cared,' said Skimmer. 'Up until now. And now it wants to rip the top off the country and take what's underneath, mmph, mmhm.'\n",
 'Ah, thought Vimes, our killer clerk does have more than one emotion.\n',
 "'Ankh-Morpork has always tried to get on well with other nations,' said Sybil. 'Well, these days, at least.'\n"]

### Препроцесинг загруженного текста

In [4]:
# Function for preprocessing loaded text
def clean_text(text):
    # remove carriage return to next line (\n)
    text = re.sub(r'\n', '', text)
    # remove the transition to the beginning of the current line (\r)
    text = re.sub(r'\r', '', text)
    # remove tabs (\t)
    text = re.sub(r'\t', '', text)
    # remove multiple spaces
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # returning cleared data
    return text  

In [5]:
# create a list to hold the cleared text
cleaned_text = []
# for each word from all lines
for text in lines:
    # cleaning up the text  
    text = clean_text(text)
    # add the cleared text to the list cleaned_text
    cleaned_text.append(text)
cleaned_text[2100:2103]

["'But it's never cared,' said Skimmer. 'Up until now. And now it wants to rip the top off the country and take what's underneath, mmph, mmhm.'",
 'Ah, thought Vimes, our killer clerk does have more than one emotion.',
 "'Ankh-Morpork has always tried to get on well with other nations,' said Sybil. 'Well, these days, at least.'"]

### Токенизация, лемматизация, укрупненный и уточненный POS-тэггинг, нахождение меток синтаксической зависимости с помощью библиотеки spaCy. Полученные результаты представлены в виде Pandas-датафрейма

In [6]:
nlp = spacy.load('en')
doc = nlp(' '.join(cleaned_text))

In [7]:
result_list = []
for token in doc:
    result_list.append([token.text, token.lemma_, token.pos_, token.tag_, token.dep_])
df = pd.DataFrame(result_list, columns=['Word', 'Lemma', 'POS', 'TAG', 'DEP'])
df

Unnamed: 0,Word,Lemma,POS,TAG,DEP
0,The,the,DET,DT,det
1,Fifth,Fifth,PROPN,NNP,amod
2,Elephant,Elephant,PROPN,NNP,compound
3,A,A,PROPN,NNP,compound
4,Discworld,Discworld,PROPN,NNP,compound
...,...,...,...,...,...
139459,whispered,whisper,VERB,VBD,ROOT
139460,.,.,PUNCT,.,punct
139461,,,SPACE,_SP,
139462,THE,the,DET,DT,det


### Удаление в датафрейме дубликатов строк, сохранение результирующей таблицы в формате XLSX

In [8]:
# remove duplicate lines
df_cleaned = df.drop_duplicates(keep='first', ignore_index=True)
df_cleaned

Unnamed: 0,Word,Lemma,POS,TAG,DEP
0,The,the,DET,DT,det
1,Fifth,Fifth,PROPN,NNP,amod
2,Elephant,Elephant,PROPN,NNP,compound
3,A,A,PROPN,NNP,compound
4,Discworld,Discworld,PROPN,NNP,compound
...,...,...,...,...,...
20203,claw,claw,NOUN,NN,pobj
20204,need,need,VERB,VBP,advcl
20205,sign,sign,NOUN,NN,attr
20206,'s,be,AUX,VBZ,acl


In [9]:
# save as XLSX
df_cleaned.to_excel('drive/MyDrive/Colab Notebooks/Internship/lesson_21/result.xlsx')

### Построение Word2Vec-модели на базе лемм с помощью библиотеки Gensim, демонстрация результатов её работы (similarity, most_similar, most_similar_cosmul и др.)

In [10]:
# create lemma list from token
lemma_list = []
for token in doc:
    lemma_list.append(token.lemma_)

In [11]:
# check 10 lemma
print(lemma_list[2100:2110])

['of', 'course', ',', '-PRON-', 'would', 'have', 'work', 'just', 'as', 'well']


In [12]:
# check the length of the list
print('Length: ', len(lemma_list))

Length:  139464


In [13]:
# building a Word2Vec model based on lemmas
model = gensim.models.Word2Vec(
    [lemma_list],
    negative = 10, # negative sampling how many "noise words" should be drawn
    iter = 100,
    min_count = 1, # ignores all words with total frequency lower than this
    window = 7, # maximum distance between the current and predicted word
    size = 40 # dimension of the word vector
    )

In [14]:
print(model.wv['elephant'])

[ 0.9070434   1.6772268  -1.5735987   2.4732692   0.00444325 -0.28212035
 -1.2173419  -0.9830175  -0.04345713  1.1465868  -0.01455431 -0.41312334
  1.6891828  -0.6349934   0.60966146 -0.6057399  -0.38276538  0.8523825
 -0.38825834  0.6336295   0.73003423  0.9548457   0.3124438   0.6101781
 -1.270658   -0.01822817 -0.63467103  0.3372522   1.6264573  -0.1467778
 -1.2684909  -0.4012239   0.24744898  1.4583752   0.22063929 -1.1871156
  2.3400567  -1.7114911  -0.3455359  -0.7046714 ]


In [15]:
print(model.wv['turtle'])

[ 0.21237469  0.4272008  -0.5214443   0.68625563  0.06511545 -0.08139799
 -0.23999488 -0.22123048  0.15391406  0.26535547 -0.05745091  0.08674093
  0.44982862 -0.18549122  0.07507398 -0.2135851  -0.13127053  0.21315213
 -0.03578937  0.01409248  0.07102496  0.2122642   0.01366168  0.08339494
 -0.2657091   0.07082091 -0.25706205  0.1275276   0.40581453  0.0094082
 -0.20491344 -0.07272362 -0.04639931  0.4023438  -0.19695495 -0.30724508
  0.50835574 -0.46540752 -0.08841686 -0.18962528]


In [16]:
# the result of the similarity method
model.wv.similarity('Mrs', 'lady')

0.71696115

In [17]:
model.wv.similarity('food', 'meal')

0.43646362

In [18]:
model.wv.similarity('dog', 'cat')

0.7763036

In [19]:
model.wv.similarity('machine', 'machine')

1.0

In [20]:
# the result of the most_similar method
print(model.wv.most_similar('wolf', topn=5))  # get similar words

[('exchange', 0.8923070430755615), ('splash', 0.8890494108200073), ('soggy', 0.8868609666824341), ('throw', 0.8729345798492432), ('crest', 0.8716309666633606)]


In [21]:
print(model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5))

[('truth', 0.6963328123092651), ('everyone', 0.6693935990333557), ('air', 0.6674009561538696), ('revolution', 0.6646387577056885), ('outdoor', 0.6598615050315857)]


In [22]:
# the result of the most_similar_cosmul method
print(model.wv.most_similar_cosmul('year', topn=5))

[('ago', 0.9408810138702393), ('fifteen', 0.9123499989509583), ('tree', 0.8791539072990417), ('both', 0.8725258708000183), ('tong', 0.8675180077552795)]


In [23]:
# the result of the n_similarity method
print(model.wv.n_similarity(['go', 'around', 'back',  'broken', 'window'], ['glass', 'break', 'front', 'door',  'open']))

0.8027682


In [24]:
# the result of the similar_by_word method
print(model.wv.similar_by_word('elephant', topn=5))

[('trumpet', 0.9677475094795227), ('giant', 0.9546517729759216), ('turtle', 0.949731707572937), ('scream', 0.9404006004333496), ('atmosphere', 0.9258747100830078)]


In [25]:
# the result of the doesnt_match method
print(model.wv.doesnt_match(['angry', 'elephant', 'scream', 'rock']))

rock


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
