In [1]:
import gc
import warnings
warnings.filterwarnings('ignore')
import nltk
# nltk.download('all')
gc.collect()

0

# Text Data Preprocessing

### 1. Tokenization Process. Paragraph to sentences

In [2]:
para='''Here are many examples of short stories for you to read online. Online has become another leg in our life. WE have to take that into account so that we will go along the growth of the science and technology. Computer has revolutionalised our world. The people have started to see another world. What we were has become history. The twentieth century has become remote history. The IT companies and other computer-based companies have outperformed other traditional companies which have been there for a long time. Accuracy has become the most used word among the people. Telecommunication has become very very cheap affair all over the world. All these achievements are possible because of Computer and the Internet. Reading short stories online has become our favorite pastime.'''

In [3]:
sentences=nltk.sent_tokenize(para)
print('Total number of sentences in the above paragraph : ' , len(sentences))
print(sentences)

Total number of sentences in the above paragraph :  12
['Here are many examples of short stories for you to read online.', 'Online has become another leg in our life.', 'WE have to take that into account so that we will go along the growth of the science and technology.', 'Computer has revolutionalised our world.', 'The people have started to see another world.', 'What we were has become history.', 'The twentieth century has become remote history.', 'The IT companies and other computer-based companies have outperformed other traditional companies which have been there for a long time.', 'Accuracy has become the most used word among the people.', 'Telecommunication has become very very cheap affair all over the world.', 'All these achievements are possible because of Computer and the Internet.', 'Reading short stories online has become our favorite pastime.']


### 2. Tokenization Process. Paragraph to Words

In [4]:
words= nltk.word_tokenize(para)
print('Total number of words in the above pragraph : ' , len(words))
print(words)

Total number of words in the above pragraph :  139
['Here', 'are', 'many', 'examples', 'of', 'short', 'stories', 'for', 'you', 'to', 'read', 'online', '.', 'Online', 'has', 'become', 'another', 'leg', 'in', 'our', 'life', '.', 'WE', 'have', 'to', 'take', 'that', 'into', 'account', 'so', 'that', 'we', 'will', 'go', 'along', 'the', 'growth', 'of', 'the', 'science', 'and', 'technology', '.', 'Computer', 'has', 'revolutionalised', 'our', 'world', '.', 'The', 'people', 'have', 'started', 'to', 'see', 'another', 'world', '.', 'What', 'we', 'were', 'has', 'become', 'history', '.', 'The', 'twentieth', 'century', 'has', 'become', 'remote', 'history', '.', 'The', 'IT', 'companies', 'and', 'other', 'computer-based', 'companies', 'have', 'outperformed', 'other', 'traditional', 'companies', 'which', 'have', 'been', 'there', 'for', 'a', 'long', 'time', '.', 'Accuracy', 'has', 'become', 'the', 'most', 'used', 'word', 'among', 'the', 'people', '.', 'Telecommunication', 'has', 'become', 'very', 'very',

### 3. Text Normalization : Convert the sentences into LOWER CASE, except for cases where UPPER case is necessary example : US/us etc
### 4. Stop word: is, a, am, are etc. Not required for processing so it should be removed from text

### 5. Stemming and Lemmatization: Stemming and Lemmatization both generate the root form of the inflected words. The difference is that stem might not be an actual word whereas, lemma is an actual language word. 
- Stemming is the process of reducing a word to its word stem that affixes to suffixes and prefixes or to the roots of words known as a lemma. Stemming follows an algorithm with steps to perform on the words which makes it faster.
- Lemmatization : The process of reducing the different forms of a word to one single form with the use of Dictionary. Slower processing than stemming

The main advantage of lemmatization is that it takes into consideration the context of the word to determine which is the intended meaning the user is looking for. This process allows to decrease noise and speed up the user's task.

# Stemming

In [5]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
para='''Here are many examples of short stories for you to read online. Online has become another leg in our life. WE have to take that into account so that we will go along the growth of the science and technology. Computer has revolutionalised our world. The people have started to see another world. What we were has become history. The twentieth century has become remote history. The IT companies and other computer-based companies have outperformed other traditional companies which have been there for a long time. Accuracy has become the most used word among the people. Telecommunication has become very very cheap affair all over the world. All these achievements are possible because of Computer and the Internet. Reading short stories online has become our favorite pastime.'''
sentences=nltk.sent_tokenize(para.lower())
stemmer=PorterStemmer()
for i in range(len(sentences)):
    words=nltk.word_tokenize(sentences[i])
    words=[stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i]=' '.join(words)
sentences # Stemming has been applied to it.

['mani exampl short stori read onlin .',
 'onlin becom anoth leg life .',
 'take account go along growth scienc technolog .',
 'comput revolutionalis world .',
 'peopl start see anoth world .',
 'becom histori .',
 'twentieth centuri becom remot histori .',
 'compani computer-bas compani outperform tradit compani long time .',
 'accuraci becom use word among peopl .',
 'telecommun becom cheap affair world .',
 'achiev possibl comput internet .',
 'read short stori onlin becom favorit pastim .']

### Main problem with stemming is that its does not provide proper words with meaning.  
### To overcome this, the alternative is lemmetization

# Lemmatization

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
para='''Here are many examples of short stories for you to read online. Online has become another leg in our life. WE have to take that into account so that we will go along the growth of the science and technology. Computer has revolutionalised our world. The people have started to see another world. What we were has become history. The twentieth century has become remote history. The IT companies and other computer-based companies have outperformed other traditional companies which have been there for a long time. Accuracy has become the most used word among the people. Telecommunication has become very very cheap affair all over the world. All these achievements are possible because of Computer and the Internet. Reading short stories online has become our favorite pastime.'''
sentences=nltk.sent_tokenize(para.lower())
for i in range(len(sentences)):
    words=nltk.word_tokenize(sentences[i])
    words=[lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i]=' '.join(words)
sentences

['many example short story read online .',
 'online become another leg life .',
 'take account go along growth science technology .',
 'computer revolutionalised world .',
 'people started see another world .',
 'become history .',
 'twentieth century become remote history .',
 'company computer-based company outperformed traditional company long time .',
 'accuracy become used word among people .',
 'telecommunication become cheap affair world .',
 'achievement possible computer internet .',
 'reading short story online become favorite pastime .']

### Lemmatization overcomes the issue with stemming and provides words with correct context or meanings

# Bag of words Vecotization with stemming

In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
para='''Here are many examples of short stories for you to read online. Online has become another leg in our life. WE have to take that into account so that we will go along the growth of the science and technology. Computer has revolutionalised our world. The people have started to see another world. What we were has become history. The twentieth century has become remote history. The IT companies and other computer-based companies have outperformed other traditional companies which have been there for a long time. Accuracy has become the most used word among the people. Telecommunication has become very very cheap affair all over the world. All these achievements are possible because of Computer and the Internet. Reading short stories online has become our favorite pastime.'''
ps=PorterStemmer()
wordnet=WordNetLemmatizer()
sentences=nltk.sent_tokenize(para)
stem_sentences=[]
for i in range(len(sentences)):
    review=re.sub('[^a-zA-Z]',' ',sentences[i]) # replace everything apart from a-z with spaces
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    stem_sentences.append(review)
print(stem_sentences)
cv=CountVectorizer(max_features=1500)
X=cv.fit_transform(stem_sentences).toarray()
print(X)

['mani exampl short stori read onlin', 'onlin becom anoth leg life', 'take account go along growth scienc technolog', 'comput revolutionalis world', 'peopl start see anoth world', 'becom histori', 'twentieth centuri becom remot histori', 'compani comput base compani outperform tradit compani long time', 'accuraci becom use word among peopl', 'telecommun becom cheap affair world', 'achiev possibl comput internet', 'read short stori onlin becom favorit pastim']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1
  0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0]
 [1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0
  0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0

# Bag of words Vecotization with Lematization

In [9]:
para='''Here are many examples of short stories for you to read online. Online has become another leg in our life. WE have to take that into account so that we will go along the growth of the science and technology. Computer has revolutionalised our world. The people have started to see another world. What we were has become history. The twentieth century has become remote history. The IT companies and other computer-based companies have outperformed other traditional companies which have been there for a long time. Accuracy has become the most used word among the people. Telecommunication has become very very cheap affair all over the world. All these achievements are possible because of Computer and the Internet. Reading short stories online has become our favorite pastime.'''
ps=PorterStemmer()
wordnet=WordNetLemmatizer()
sentences=nltk.sent_tokenize(para)
lem_sentences=[]
for i in range(len(sentences)):
    review=re.sub('[^a-zA-Z]',' ',sentences[i]) # replace everything apart from a-z with spaces
    review=review.lower()
    review=review.split()
    review=[wordnet.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    lem_sentences.append(review)
print(lem_sentences)
cv=CountVectorizer(max_features=1500)
X1=cv.fit_transform(lem_sentences).toarray()
print(X1)

['many example short story read online', 'online become another leg life', 'take account go along growth science technology', 'computer revolutionalised world', 'people started see another world', 'become history', 'twentieth century become remote history', 'company computer based company outperformed traditional company long time', 'accuracy become used word among people', 'telecommunication become cheap affair world', 'achievement possible computer internet', 'reading short story online become favorite pastime']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0
  1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1
  0 0 0 0 0 0 0 0 

# TF-IDF

### To overcome disadvantages of bag of words, NLP has TF-IDF (term frequeny and inverse docuement frequency) alternative algo
- term frequency TF = (number of repetition of words in a sentence)/(total number of words in the sentence) 
- inverse docuement frequency IDF = log((number of sentences/number of sentences containing the words)) 
- TF vectorizes the words and IDF provides them with weight 

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
para='''Here are many examples of short stories for you to read online. Online has become another leg in our life. WE have to take that into account so that we will go along the growth of the science and technology. Computer has revolutionalised our world. The people have started to see another world. What we were has become history. The twentieth century has become remote history. The IT companies and other computer-based companies have outperformed other traditional companies which have been there for a long time. Accuracy has become the most used word among the people. Telecommunication has become very very cheap affair all over the world. All these achievements are possible because of Computer and the Internet. Reading short stories online has become our favorite pastime.'''
# clean the text
sentences=nltk.sent_tokenize(para)
ps=PorterStemmer()
wn=WordNetLemmatizer()
review_stem=[]
review_lem=[]

# TFIDF with Stemming
for i in range(len(sentences)):
    review=re.sub('^a-zA-Z', ' ',sentences[i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    review_stem.append(review)
cv=TfidfVectorizer()
X=cv.fit_transform(review_stem).toarray()
print(X)
# TFIDF with Lemmatization
for i in range(len(sentences)):
    review=re.sub('^a-zA-Z', ' ',sentences[i])
    review=review.lower()
    review=review.split()
    review=[wn.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    review_lem.append(review)
cv=TfidfVectorizer()
X1=cv.fit_transform(review_lem).toarray()
print(X1)

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.43799565 0.         0.         0.
  0.         0.         0.         0.         0.         0.43799565
  0.         0.43799565 0.         0.         0.         0.
  0.         0.37615575 0.         0.         0.         0.
  0.37615575 0.         0.37615575 0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.44097015 0.         0.28947705 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.51346552 0.51346552 0.         0.
  0.44097015 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.37796447 0

# Spam Classifier Model

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, precision_score, recall_score,roc_curve,auc, accuracy_score, fbeta_score,
roc_auc_score, brier_score_loss, plot_precision_recall_curve, average_precision_score)

def eval(actual, pred, prob=None):
    tn, fp, fn, tp = confusion_matrix(actual, pred).ravel()
    print("ACCURACY : "+str(round(accuracy_score(actual, pred), 5)))
    # if the model says positive, what percent of them are not positive
    print("FALSE POSITIVE RATE : " + str(round(fp/(tn+fp),5)))
    # if the model says positive, what percent of them are actually positive
    print("PRECISION : " + str(round(tp/(tp+fp),5)))
    # within the delinquent customers, what percentage did the model catch
    print("RECALL : " + str(round(tp/(tp+fn),5)))
    print("F(0.5) SCORE : " + str(round(fbeta_score(actual, pred, 0.5),5)))
    print("F(1) SCORE : " + str(round(fbeta_score(actual, pred, 1),5)))
    if prob is not None:
        print("AUC : " + str(round(roc_auc_score(actual,prob),5)))



In [13]:
import pandas as pd
messeges=pd.read_csv(r'C:\Users\jaysriva\Documents\Learning\NLP\SpamCollectionDataset\SMSSpamCollection', sep='\t', names=['label','messeges'])
messeges.head()

Unnamed: 0,label,messeges
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 1. TF-IDF with Lemmatization

In [14]:
# Label column
y=pd.get_dummies(messeges.label) # Spam-ham column. selecting only spam column, 0 is ham, 1 is spam the dependent feature
y=y.iloc[:,1].values 

# Data cleaning and Preprocessing
ps=PorterStemmer()
wn=WordNetLemmatizer()
review_lem=[]

for i in range(len(messeges)):
    review=re.sub('^a-zA-Z', ' ', messeges.messeges[i])
    review=review.lower()
    review=review.split()
    review=[wn.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review=' '.join(review)
    review_lem.append(review)

In [15]:
tv=TfidfVectorizer(max_features=5000) # Top frequent 5000 words from review
X=tv.fit_transform(review_lem).toarray()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=369)
spam_detect_model=MultinomialNB().fit(X_train,y_train)
y_pred=spam_detect_model.predict(X_test)
print('Model Performance with Tf-IDF, Lemmatization - ')
eval(y_test,y_pred)

Model Performance with Tf-IDF, Lemmatization - 
ACCURACY : 0.98296
FALSE POSITIVE RATE : 0.0
PRECISION : 1.0
RECALL : 0.87075
F(0.5) SCORE : 0.97117
F(1) SCORE : 0.93091


### 2. Bag of Words with Lemmatization

In [16]:
tv=TfidfVectorizer(max_features=5000) # Top frequent 5000 words from review
X=tv.fit_transform(review_lem).toarray()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=369)
spam_detect_model=MultinomialNB().fit(X_train,y_train)
y_pred=spam_detect_model.predict(X_test)
print('Model Performance with BOW, Lemmatization - ')
eval(y_test,y_pred)

Model Performance with BOW, Lemmatization - 
ACCURACY : 0.98296
FALSE POSITIVE RATE : 0.0
PRECISION : 1.0
RECALL : 0.87075
F(0.5) SCORE : 0.97117
F(1) SCORE : 0.93091


### 3. TF-IDF with Stemming

In [18]:
# Label column
y=pd.get_dummies(messeges.label) # Spam-ham column. selecting only spam column, 0 is ham, 1 is spam the dependent feature
y=y.iloc[:,1].values 

# Data cleaning and Preprocessing
ps=PorterStemmer()
review_stem=[]

for i in range(len(messeges)):
    review=re.sub('^a-zA-Z', ' ', messeges.messeges[i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if word not in stopwords.words('english')]
    review=' '.join(review)
    review_stem.append(review)


### 4. Bag of Words with Stemming

In [19]:
cv=CountVectorizer(max_features=5000) # Top frequent 5000 words from review
X=cv.fit_transform(review_stem).toarray()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=369)
spam_detect_model=MultinomialNB().fit(X_train,y_train)
y_pred=spam_detect_model.predict(X_test)
print('Model Performance with Tf-IDF, Stemming- ')
eval(y_test,y_pred)

Model Performance with Tf-IDF- 
ACCURACY : 0.99103
FALSE POSITIVE RATE : 0.00517
PRECISION : 0.96599
RECALL : 0.96599
F(0.5) SCORE : 0.96599
F(1) SCORE : 0.96599


In [21]:
tv=TfidfVectorizer(max_features=5000) # Top frequent 5000 words from review
X=tv.fit_transform(review_stem).toarray()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=369)
spam_detect_model=MultinomialNB().fit(X_train,y_train)
y_pred=spam_detect_model.predict(X_test)
print('Model Performance with BOW, Stemming- ')
eval(y_test,y_pred)

Model Performance with BOW, Stemming- 
ACCURACY : 0.98206
FALSE POSITIVE RATE : 0.0
PRECISION : 1.0
RECALL : 0.86395
F(0.5) SCORE : 0.96947
F(1) SCORE : 0.92701


### Issue with BOW and TFIDF: 
- 1. no symantic info is stored (symantic info : order, relationship etc)
- 2. Chances of overfitting

###  To overcome above issue word2vec is used in which word is represented as a vector of 32 or more dimentions. and symantic information and relations between different words is also preserved
- Word2Vec can be used to find out the relations between words in a dataset, compute the similarity between them, or use the vector representation of those words as input for other applications such as text classification or clustering.

# Word2vec

In [22]:
! pip install gensim



In [23]:
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import re
para='''Here are many examples of short stories for you to read online. Online has become another leg in our life. WE have to take that into account so that we will go along the growth of the science and technology. Computer has revolutionalised our world. The people have started to see another world. What we were has become history. The twentieth century has become remote history. The IT companies and other computer-based companies have outperformed other traditional companies which have been there for a long time. Accuracy has become the most used word among the people. Telecommunication has become very very cheap affair all over the world. All these achievements are possible because of Computer and the Internet. Reading short stories online has become our favorite pastime.'''
text=re.sub('^a-zA-Z',' ',para)
text=re.sub(r'\[[0-9]*\]',' ',text)
text=re.sub(r'\s+',' ',text)
text=text.lower()
text=re.sub(r'\d',' ',text)
text=re.sub(r'\s+',' ',text)
sentences=nltk.sent_tokenize(text)
sentences=[nltk.word_tokenize(sentence) for sentence in sentences]
for i in range(len(sentences)):
    sentences[i]=[word for word in sentences[i] if word not in set(stopwords.words('english'))]
model=Word2Vec(sentences,min_count=1) # min_count is word count. If word count is 1 (present), do this


In [24]:
words=model.wv.key_to_index
words

{'.': 0,
 'become': 1,
 'world': 2,
 'online': 3,
 'companies': 4,
 'history': 5,
 'short': 6,
 'stories': 7,
 'another': 8,
 'computer': 9,
 'people': 10,
 'growth': 11,
 'revolutionalised': 12,
 'technology': 13,
 'science': 14,
 'pastime': 15,
 'along': 16,
 'go': 17,
 'account': 18,
 'life': 19,
 'leg': 20,
 'read': 21,
 'examples': 22,
 'take': 23,
 'see': 24,
 'started': 25,
 'favorite': 26,
 'reading': 27,
 'internet': 28,
 'possible': 29,
 'achievements': 30,
 'affair': 31,
 'cheap': 32,
 'telecommunication': 33,
 'among': 34,
 'word': 35,
 'used': 36,
 'accuracy': 37,
 'time': 38,
 'long': 39,
 'traditional': 40,
 'outperformed': 41,
 'computer-based': 42,
 'remote': 43,
 'century': 44,
 'twentieth': 45,
 'many': 46}

In [25]:
vector=model.wv['growth']
vector

array([ 9.7702928e-03,  8.1651136e-03,  1.2809705e-03,  5.0975773e-03,
        1.4081288e-03, -6.4551616e-03, -1.4280510e-03,  6.4491653e-03,
       -4.6173073e-03, -3.9930656e-03,  4.9244044e-03,  2.7130984e-03,
       -1.8479753e-03, -2.8769446e-03,  6.0107303e-03, -5.7167388e-03,
       -3.2367038e-03, -6.4878250e-03, -4.2346334e-03, -8.5809948e-03,
       -4.4697905e-03, -8.5112313e-03,  1.4037776e-03, -8.6181974e-03,
       -9.9166557e-03, -8.2016252e-03, -6.7726658e-03,  6.6805840e-03,
        3.7845564e-03,  3.5616636e-04, -2.9579829e-03, -7.4283220e-03,
        5.3341867e-04,  4.9989222e-04,  1.9561767e-04,  8.5259438e-04,
        7.8633073e-04, -6.8161491e-05, -8.0070542e-03, -5.8702733e-03,
       -8.3829118e-03, -1.3120436e-03,  1.8206357e-03,  7.4171280e-03,
       -1.9634271e-03, -2.3252917e-03,  9.4871549e-03,  7.9703328e-05,
       -2.4045228e-03,  8.6048460e-03,  2.6870037e-03, -5.3439736e-03,
        6.5881060e-03,  4.5101522e-03, -7.0544672e-03, -3.2317400e-04,
      

In [26]:
similar=model.wv.most_similar('growth')
similar

[('achievements', 0.18214833736419678),
 ('world', 0.17358534038066864),
 ('history', 0.1671186238527298),
 ('cheap', 0.15626852214336395),
 ('account', 0.13278967142105103),
 ('long', 0.12205980718135834),
 ('accuracy', 0.12147410213947296),
 ('leg', 0.11179191619157791),
 ('stories', 0.11127692461013794),
 ('people', 0.10941852629184723)]

In [27]:
model.wv.most_similar('science')

[('go', 0.24673253297805786),
 ('remote', 0.18985602259635925),
 ('online', 0.1783706247806549),
 ('take', 0.171890988945961),
 ('used', 0.17061734199523926),
 ('life', 0.16160978376865387),
 ('become', 0.1608772575855255),
 ('computer-based', 0.11331520229578018),
 ('reading', 0.10772479325532913),
 ('stories', 0.10574804991483688)]