In [32]:
corpus = "Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate goal of NLP is to enable computers to understand, interpret, and respond to human languages in a way that is both meaningful and useful. This involves a range of computational techniques including text analysis, machine translation, and speech recognition."

In [33]:
import re

text = corpus.lower()
text = re.sub(r"[^a-z]"," ",text)
text = re.sub(r"\s+", " ", text)

In [34]:
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(text)
sentences

['natural language processing nlp is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language the ultimate goal of nlp is to enable computers to understand interpret and respond to human languages in a way that is both meaningful and useful this involves a range of computational techniques including text analysis machine translation and speech recognition']

In [35]:
from nltk.tokenize import word_tokenize
words = [word_tokenize(sentence) for sentence in sentences]
print(words)

[['natural', 'language', 'processing', 'nlp', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'natural', 'language', 'the', 'ultimate', 'goal', 'of', 'nlp', 'is', 'to', 'enable', 'computers', 'to', 'understand', 'interpret', 'and', 'respond', 'to', 'human', 'languages', 'in', 'a', 'way', 'that', 'is', 'both', 'meaningful', 'and', 'useful', 'this', 'involves', 'a', 'range', 'of', 'computational', 'techniques', 'including', 'text', 'analysis', 'machine', 'translation', 'and', 'speech', 'recognition']]


In [36]:
from nltk.corpus import stopwords
stopwords = stopwords.words("english")

words_wo_stopwords = [[word for word in sent if word not in stopwords] for sent in words]
print(words_wo_stopwords)

[['natural', 'language', 'processing', 'nlp', 'field', 'artificial', 'intelligence', 'focuses', 'interaction', 'computers', 'humans', 'natural', 'language', 'ultimate', 'goal', 'nlp', 'enable', 'computers', 'understand', 'interpret', 'respond', 'human', 'languages', 'way', 'meaningful', 'useful', 'involves', 'range', 'computational', 'techniques', 'including', 'text', 'analysis', 'machine', 'translation', 'speech', 'recognition']]


In [37]:
from gensim.models import Word2Vec

model = Word2Vec(words_wo_stopwords, min_count=1)

In [44]:
model.wv.key_to_index

{'natural': 0,
 'language': 1,
 'nlp': 2,
 'computers': 3,
 'interpret': 4,
 'processing': 5,
 'field': 6,
 'artificial': 7,
 'intelligence': 8,
 'focuses': 9,
 'interaction': 10,
 'humans': 11,
 'ultimate': 12,
 'goal': 13,
 'enable': 14,
 'understand': 15,
 'recognition': 16,
 'speech': 17,
 'human': 18,
 'languages': 19,
 'way': 20,
 'meaningful': 21,
 'useful': 22,
 'involves': 23,
 'range': 24,
 'computational': 25,
 'techniques': 26,
 'including': 27,
 'text': 28,
 'analysis': 29,
 'machine': 30,
 'translation': 31,
 'respond': 32}

In [43]:
model.wv[31]

array([-0.00219659, -0.00971296,  0.00929305,  0.00203607, -0.0011625 ,
       -0.00550674, -0.00851174, -0.00990463,  0.00894523, -0.00250099,
        0.00459389, -0.004521  ,  0.00996131,  0.0036573 ,  0.00102442,
       -0.00404092,  0.00121339, -0.00265223,  0.00735535,  0.00447684,
        0.00098931,  0.00348565,  0.00371415, -0.0067858 ,  0.00893691,
        0.00173353, -0.00579099,  0.00866208, -0.00129169,  0.00818622,
       -0.00150334,  0.00699166,  0.00273032, -0.00435728, -0.00375262,
        0.00919494,  0.00159305, -0.00600667,  0.00034951, -0.00196076,
        0.00158477, -0.0077145 ,  0.00738523,  0.00130883,  0.00787672,
        0.0044565 , -0.00439701,  0.00375543, -0.00063734, -0.00985269,
        0.0082434 ,  0.0096544 ,  0.0096539 , -0.00379708, -0.008449  ,
        0.00482512, -0.00765732,  0.00853388,  0.0027622 ,  0.00560622,
        0.00611605,  0.00046693, -0.00209293,  0.00077323,  0.0098339 ,
       -0.00712935, -0.00155713, -0.00236311,  0.00487026,  0.00

In [51]:
model.wv.most_similar('text')

[('field', 0.3041238486766815),
 ('natural', 0.19579482078552246),
 ('languages', 0.1889905482530594),
 ('techniques', 0.1672196239233017),
 ('recognition', 0.14186517894268036),
 ('focuses', 0.12706628441810608),
 ('machine', 0.11620384454727173),
 ('including', 0.0636691004037857),
 ('meaningful', 0.051461055874824524),
 ('artificial', 0.046723946928977966)]