# Description of notebook

### A: Preprocessing demonstration
* Corpus 1: Jane Austen's pride and prejudice
* Corpus 2: 20 Newsgroups corpus

### B: Preprocessing detailed (step by step)

### C: Word2Vec model

# A: Preprocessing

In [1]:
import os
import pandas as pd
import re 
import string 

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import gensim
# from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

### Corpus 1

Obtaining text

In [2]:
with open('../HW1/Pride and Prejudice - Jane Austen Chapter 1 to 20.txt') as f:
    corpus_1 = f.read()

In [3]:
print(corpus_1[0:500])

Chapter 1

      It is a truth universally acknowledged, that a single man in
      possession of a good fortune, must be in want of a wife.

      However little known the feelings or views of such a man may be
      on his first entering a neighbourhood, this truth is so well
      fixed in the minds of the surrounding families, that he is
      considered as the rightful property of some one or other of their
      daughters.

      “My dear Mr. Bennet,” said his lady to him one day, “have yo


Cleaning

In [4]:
def preprocessing_corpus_1(corpus):
    
    '''Takes string from novel, normalizes and tokenizes into sentences'''
    
    #Removal of white space
    step_1 = re.sub('\s+', ' ', corpus)
    # Removal of chapter tags
    step_2 = re.sub(r'Chapter \d* ', '', step_1)
    # Removal of digits
    step_3 = re.sub(r'\d+', '', step_2)
    # Remove other signs
    step_4 = re.sub(r'["_“”\'\`\-\*\(\)]','',step_3)
    #Tokenize to sentences while punctuation is still in place
    tokens_jane_austen = sent_tokenize(step_4)
    # Convert each token to lowercase
    lower_token = list(map(lambda token: token.lower(), tokens_jane_austen))
    # Remove punctuation from lowercase token
    punct_less_token = list(map(lambda token: 
                            token.translate(str.maketrans('', '', string.punctuation)), lower_token))
    
    return punct_less_token

In [5]:
processed_1 = preprocessing_corpus_1(corpus_1)
processed_1[0:10]

['it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife',
 'however little known the feelings or views of such a man may be on his first entering a neighbourhood this truth is so well fixed in the minds of the surrounding families that he is considered as the rightful property of some one or other of their daughters',
 'my dear mr bennet said his lady to him one day have you heard that netherfield park is let at last',
 'mr bennet replied that he had not',
 'but it is returned she for mrs long has just been here and she told me all about it',
 'mr bennet made no answer',
 'do not you want to know who has taken it',
 'cried his wife impatiently',
 'you want to tell me and i have no objection to hearing it',
 'this was invitation enough']

Writing to text file - including spacing between each

In [6]:
textfile = open("corpus_1_text_file.txt", "w")
for element in processed_1:
    textfile.write(element + "\n" + "\n")
textfile.close()

### Corpus 2:

Obtaining text

In [7]:
files_list = []
path = os.getcwd() + '/20news-bydate-train/'
for root, dirs, files in os.walk(path, topdown = False):
    for name in files:
        files_list.append(os.path.join(root, name))

corpus_2_list = []
for i in files_list:
    with open(i, 'r', encoding="utf8", errors="ignore") as f:
        file = f.read()
        corpus_2_list.append(file)

In [60]:
corpus_2_list[1000]

'From: zowie@daedalus.stanford.edu (Craig "Powderkeg" DeForest)\nSubject: Re: 5W30, 10W40, or 20W50\nArticle-I.D.: daedalus.ZOWIE.93Apr5215616\nOrganization: Stanford Center for Space Science and Astrophysics\nLines: 37\nNNTP-Posting-Host: daedalus.stanford.edu\nIn-reply-to: Brad Thone\'s message of Fri, 02 Apr 93 21:41:53 CST\n\nIn article <foo> Brad Thone <C09615BT@WUVMD> writes:\nWell, there *is* a difference.\n\nI don\'t happen to have my SAE manual handy, but oil viscosity in general\n_decreases_ with temperature.  The SAE numbers are based on a `typical\'\ncurve that oils used to all have, running from (say) the viscosity of a\nroom-temperature 90-weight at 0C, down to (say) that of a room-temperature \n5-weight at 20C, for a typical 40-weight oil.\n\nOils that are designed for operation in `normal\' temperatures just have\na weight specification.  Oils that are designed for operation in exceedingly\ncold temperatures have a `W\' tacked on the end, so in winter in a cold\nplace, 

Cleaning

In [61]:
def preprocessing_corpus_2(corpus):
    
    '''Takes string from news corpus, normalizes and tokenizes into sentences'''
    
    #Removal of white space
    step_1 = re.sub('\s+', ' ', corpus)
    # RFrom/To and subject line of email
    step_2 = re.sub(r'\bFrom: .*? writes: ', '', step_1)
    # Removal of digits
    step_3 = re.sub(r'\d+', '', step_2)
    # Remove other signs
    step_4 = re.sub(r'["_“”\'\`\-\*\(\)]','',step_3)
    #Tokenize to sentences while punctuation is still in place
    tokens_news = sent_tokenize(step_4)
    # Convert each token to lowercase
    lower_token = list(map(lambda token: token.lower(), tokens_news))
    # Remove punctuation from lowercase token
    punct_less_token = list(map(lambda token: 
                            token.translate(str.maketrans('', '', string.punctuation)), lower_token))
    
    return punct_less_token 

In [62]:
preprocessing_corpus_2(corpus_2_list[1000])

['well there is a difference',
 'i dont happen to have my sae manual handy but oil viscosity in general decreases with temperature',
 'the sae numbers are based on a typical curve that oils used to all have running from say the viscosity of a roomtemperature weight at c down to say that of a roomtemperature weight at c for a typical weight oil',
 'oils that are designed for operation in normal temperatures just have a weight specification',
 'oils that are designed for operation in exceedingly cold temperatures have a w tacked on the end so in winter in a cold place youd stick w in your car in the winter and  in it in the summer to approximate the appropriate viscosity throughout the year',
 'modern multiviscosity oils change viscosity much less with temperature',
 'as a result their viscosity graphs cross over several curves',
 'a multivis specification pegs the curve at two temperatures a normal operating temperature and a cold one though i cant remember the numbers',
 'in any event 

Writing to text file - including spacing between each

In [63]:
textfile = open("corpus_2_text_file.txt", "w")
for news_story in corpus_2_list:
    tokenized_story = preprocessing_corpus_2(news_story)
    for sentence in tokenized_story:
        textfile.write(sentence + "\n" + "\n")
textfile.close()

# B: Preprocessing detailed

### Corpus 1

__Book before cleaning__

In [64]:
corpus_1[0:1000]

'Chapter 1\n\n      It is a truth universally acknowledged, that a single man in\n      possession of a good fortune, must be in want of a wife.\n\n      However little known the feelings or views of such a man may be\n      on his first entering a neighbourhood, this truth is so well\n      fixed in the minds of the surrounding families, that he is\n      considered as the rightful property of some one or other of their\n      daughters.\n\n      “My dear Mr. Bennet,” said his lady to him one day, “have you\n      heard that Netherfield Park is let at last?”\n\n      Mr. Bennet replied that he had not.\n\n      “But it is,” returned she; “for Mrs. Long has just been here, and\n      she told me all about it.”\n\n      Mr. Bennet made no answer.\n\n      “Do not you want to know who has taken it?” cried his wife\n      impatiently.\n\n      “_You_ want to tell me, and I have no objection to hearing it.”\n\n      This was invitation enough.\n\n      “Why, my dear, you must know, Mrs. Lo

__Removal of white space__

In [65]:
# Remove white space and new lines
step_1 = re.sub('\s+', ' ', corpus_1)
step_1[0:1000]

'Chapter 1 It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters. “My dear Mr. Bennet,” said his lady to him one day, “have you heard that Netherfield Park is let at last?” Mr. Bennet replied that he had not. “But it is,” returned she; “for Mrs. Long has just been here, and she told me all about it.” Mr. Bennet made no answer. “Do not you want to know who has taken it?” cried his wife impatiently. “_You_ want to tell me, and I have no objection to hearing it.” This was invitation enough. “Why, my dear, you must know, Mrs. Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to s

__Removal of chapter tags__

The word chapter appears 20 times

In [66]:
len(re.findall(r'Chapter ', step_1))

20

All of these 20 times are for new chapters (therefore the word does not appear in the text itself)

In [67]:
re.findall(r'Chapter \d* ', step_1)[18:20]

['Chapter 19 ', 'Chapter 20 ']

In [68]:
# Remove chapter tags
step_2 = re.sub(r'Chapter \d* ', '', step_1)
step_2[0:1000]

'It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters. “My dear Mr. Bennet,” said his lady to him one day, “have you heard that Netherfield Park is let at last?” Mr. Bennet replied that he had not. “But it is,” returned she; “for Mrs. Long has just been here, and she told me all about it.” Mr. Bennet made no answer. “Do not you want to know who has taken it?” cried his wife impatiently. “_You_ want to tell me, and I have no objection to hearing it.” This was invitation enough. “Why, my dear, you must know, Mrs. Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to see the pla

__Removal of digits__

Digits appear twice in the text

In [69]:
re.findall(r'\d+', step_2)

['15', '18']

Here is one example

In [70]:
re.findall(r'Monday, November 18th, by four o’clock, and shall probably trespass on your hospitality ', step_2)

['Monday, November 18th, by four o’clock, and shall probably trespass on your hospitality ']

In [71]:
# Remove all digits
step_3 = re.sub(r'\d+', '', step_2)
step_3[0:1000]

'It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters. “My dear Mr. Bennet,” said his lady to him one day, “have you heard that Netherfield Park is let at last?” Mr. Bennet replied that he had not. “But it is,” returned she; “for Mrs. Long has just been here, and she told me all about it.” Mr. Bennet made no answer. “Do not you want to know who has taken it?” cried his wife impatiently. “_You_ want to tell me, and I have no objection to hearing it.” This was invitation enough. “Why, my dear, you must know, Mrs. Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to see the pla

__Remove futile symbols (non punctuation)__

In [72]:
# Remove other signs
step_4 = re.sub(r'["_“”\'\`\-\*\(\)]','',step_3)
step_4[0:1000]

'It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters. My dear Mr. Bennet, said his lady to him one day, have you heard that Netherfield Park is let at last? Mr. Bennet replied that he had not. But it is, returned she; for Mrs. Long has just been here, and she told me all about it. Mr. Bennet made no answer. Do not you want to know who has taken it? cried his wife impatiently. You want to tell me, and I have no objection to hearing it. This was invitation enough. Why, my dear, you must know, Mrs. Long says that Netherfield is taken by a young man of large fortune from the north of England; that he came down on Monday in a chaise and four to see the place, and was so 

__Tokenize to sentences while punctuation is still in place__

In [73]:
# Tokenize
tokens_jane_austen = sent_tokenize(step_4)
tokens_jane_austen[0:10]

['It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.',
 'However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters.',
 'My dear Mr. Bennet, said his lady to him one day, have you heard that Netherfield Park is let at last?',
 'Mr. Bennet replied that he had not.',
 'But it is, returned she; for Mrs. Long has just been here, and she told me all about it.',
 'Mr. Bennet made no answer.',
 'Do not you want to know who has taken it?',
 'cried his wife impatiently.',
 'You want to tell me, and I have no objection to hearing it.',
 'This was invitation enough.']

__Convert each token to lowercase__

In [74]:
# Lowercase
lower_token = list(map(lambda token: token.lower(), tokens_jane_austen))
lower_token[0:5]

['it is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.',
 'however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters.',
 'my dear mr. bennet, said his lady to him one day, have you heard that netherfield park is let at last?',
 'mr. bennet replied that he had not.',
 'but it is, returned she; for mrs. long has just been here, and she told me all about it.']

__Remove punctuation from lowercase token__

In [75]:
punct_less_token = list(map(lambda token: 
                            token.translate(str.maketrans('', '', string.punctuation)), lower_token))
punct_less_token[0:5]

['it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife',
 'however little known the feelings or views of such a man may be on his first entering a neighbourhood this truth is so well fixed in the minds of the surrounding families that he is considered as the rightful property of some one or other of their daughters',
 'my dear mr bennet said his lady to him one day have you heard that netherfield park is let at last',
 'mr bennet replied that he had not',
 'but it is returned she for mrs long has just been here and she told me all about it']

### Corpus 2

__Sample string before treatment__

In [76]:
sample_string = corpus_2_list[1000]
sample_string

'From: zowie@daedalus.stanford.edu (Craig "Powderkeg" DeForest)\nSubject: Re: 5W30, 10W40, or 20W50\nArticle-I.D.: daedalus.ZOWIE.93Apr5215616\nOrganization: Stanford Center for Space Science and Astrophysics\nLines: 37\nNNTP-Posting-Host: daedalus.stanford.edu\nIn-reply-to: Brad Thone\'s message of Fri, 02 Apr 93 21:41:53 CST\n\nIn article <foo> Brad Thone <C09615BT@WUVMD> writes:\nWell, there *is* a difference.\n\nI don\'t happen to have my SAE manual handy, but oil viscosity in general\n_decreases_ with temperature.  The SAE numbers are based on a `typical\'\ncurve that oils used to all have, running from (say) the viscosity of a\nroom-temperature 90-weight at 0C, down to (say) that of a room-temperature \n5-weight at 20C, for a typical 40-weight oil.\n\nOils that are designed for operation in `normal\' temperatures just have\na weight specification.  Oils that are designed for operation in exceedingly\ncold temperatures have a `W\' tacked on the end, so in winter in a cold\nplace, 

__Removing blank spaces and new lines__

In [77]:
step_1 = re.sub('\s+', ' ', sample_string)
step_1

'From: zowie@daedalus.stanford.edu (Craig "Powderkeg" DeForest) Subject: Re: 5W30, 10W40, or 20W50 Article-I.D.: daedalus.ZOWIE.93Apr5215616 Organization: Stanford Center for Space Science and Astrophysics Lines: 37 NNTP-Posting-Host: daedalus.stanford.edu In-reply-to: Brad Thone\'s message of Fri, 02 Apr 93 21:41:53 CST In article <foo> Brad Thone <C09615BT@WUVMD> writes: Well, there *is* a difference. I don\'t happen to have my SAE manual handy, but oil viscosity in general _decreases_ with temperature. The SAE numbers are based on a `typical\' curve that oils used to all have, running from (say) the viscosity of a room-temperature 90-weight at 0C, down to (say) that of a room-temperature 5-weight at 20C, for a typical 40-weight oil. Oils that are designed for operation in `normal\' temperatures just have a weight specification. Oils that are designed for operation in exceedingly cold temperatures have a `W\' tacked on the end, so in winter in a cold place, you\'d stick 10W in your c

__Removing From/To and subject line of email__

Finding from to section

In [78]:
re.findall(r'\bFrom: .*? writes: ', step_1)

['From: zowie@daedalus.stanford.edu (Craig "Powderkeg" DeForest) Subject: Re: 5W30, 10W40, or 20W50 Article-I.D.: daedalus.ZOWIE.93Apr5215616 Organization: Stanford Center for Space Science and Astrophysics Lines: 37 NNTP-Posting-Host: daedalus.stanford.edu In-reply-to: Brad Thone\'s message of Fri, 02 Apr 93 21:41:53 CST In article <foo> Brad Thone <C09615BT@WUVMD> writes: ']

In [79]:
step_2 = re.sub(r'\bFrom: .*? writes: ', '', step_1)
step_2

"Well, there *is* a difference. I don't happen to have my SAE manual handy, but oil viscosity in general _decreases_ with temperature. The SAE numbers are based on a `typical' curve that oils used to all have, running from (say) the viscosity of a room-temperature 90-weight at 0C, down to (say) that of a room-temperature 5-weight at 20C, for a typical 40-weight oil. Oils that are designed for operation in `normal' temperatures just have a weight specification. Oils that are designed for operation in exceedingly cold temperatures have a `W' tacked on the end, so in winter in a cold place, you'd stick 10W in your car in the winter and 40 in it in the summer, to approximate the appropriate viscosity throughout the year. Modern multi-viscosity oils change viscosity much less with temperature. As a result, their viscosity graphs cross over several curves. A multi-vis specification pegs the curve at two temperatures, a `normal' operating temperature and a `cold' one (though I can't remember 

__Removal of digits__

In [80]:
# Remove all digits
step_3 = re.sub(r'\d+', '', step_2)
step_3

"Well, there *is* a difference. I don't happen to have my SAE manual handy, but oil viscosity in general _decreases_ with temperature. The SAE numbers are based on a `typical' curve that oils used to all have, running from (say) the viscosity of a room-temperature -weight at C, down to (say) that of a room-temperature -weight at C, for a typical -weight oil. Oils that are designed for operation in `normal' temperatures just have a weight specification. Oils that are designed for operation in exceedingly cold temperatures have a `W' tacked on the end, so in winter in a cold place, you'd stick W in your car in the winter and  in it in the summer, to approximate the appropriate viscosity throughout the year. Modern multi-viscosity oils change viscosity much less with temperature. As a result, their viscosity graphs cross over several curves. A multi-vis specification pegs the curve at two temperatures, a `normal' operating temperature and a `cold' one (though I can't remember the numbers.

__Remove futile symbols (non punctuation)__

In [81]:
# Remove other signs
step_4 = re.sub(r'["_“”\'\`\-\*\(\)]','',step_3)
step_4

'Well, there is a difference. I dont happen to have my SAE manual handy, but oil viscosity in general decreases with temperature. The SAE numbers are based on a typical curve that oils used to all have, running from say the viscosity of a roomtemperature weight at C, down to say that of a roomtemperature weight at C, for a typical weight oil. Oils that are designed for operation in normal temperatures just have a weight specification. Oils that are designed for operation in exceedingly cold temperatures have a W tacked on the end, so in winter in a cold place, youd stick W in your car in the winter and  in it in the summer, to approximate the appropriate viscosity throughout the year. Modern multiviscosity oils change viscosity much less with temperature. As a result, their viscosity graphs cross over several curves. A multivis specification pegs the curve at two temperatures, a normal operating temperature and a cold one though I cant remember the numbers.... In any event, the weights

__Tokenize to sentences while punctuation is still in place__

In [82]:
# Tokenize
tokens_news = sent_tokenize(step_4)
tokens_news

['Well, there is a difference.',
 'I dont happen to have my SAE manual handy, but oil viscosity in general decreases with temperature.',
 'The SAE numbers are based on a typical curve that oils used to all have, running from say the viscosity of a roomtemperature weight at C, down to say that of a roomtemperature weight at C, for a typical weight oil.',
 'Oils that are designed for operation in normal temperatures just have a weight specification.',
 'Oils that are designed for operation in exceedingly cold temperatures have a W tacked on the end, so in winter in a cold place, youd stick W in your car in the winter and  in it in the summer, to approximate the appropriate viscosity throughout the year.',
 'Modern multiviscosity oils change viscosity much less with temperature.',
 'As a result, their viscosity graphs cross over several curves.',
 'A multivis specification pegs the curve at two temperatures, a normal operating temperature and a cold one though I cant remember the numbers.

__Convert each token to lowercase__

In [83]:
# Lowercase
lower_token = list(map(lambda token: token.lower(), tokens_news))
lower_token[0:5]

['well, there is a difference.',
 'i dont happen to have my sae manual handy, but oil viscosity in general decreases with temperature.',
 'the sae numbers are based on a typical curve that oils used to all have, running from say the viscosity of a roomtemperature weight at c, down to say that of a roomtemperature weight at c, for a typical weight oil.',
 'oils that are designed for operation in normal temperatures just have a weight specification.',
 'oils that are designed for operation in exceedingly cold temperatures have a w tacked on the end, so in winter in a cold place, youd stick w in your car in the winter and  in it in the summer, to approximate the appropriate viscosity throughout the year.']

__Remove punctuation from lowercase token__

In [84]:
punct_less_token = list(map(lambda token: 
                            token.translate(str.maketrans('', '', string.punctuation)), lower_token))
punct_less_token[0:5]

['well there is a difference',
 'i dont happen to have my sae manual handy but oil viscosity in general decreases with temperature',
 'the sae numbers are based on a typical curve that oils used to all have running from say the viscosity of a roomtemperature weight at c down to say that of a roomtemperature weight at c for a typical weight oil',
 'oils that are designed for operation in normal temperatures just have a weight specification',
 'oils that are designed for operation in exceedingly cold temperatures have a w tacked on the end so in winter in a cold place youd stick w in your car in the winter and  in it in the summer to approximate the appropriate viscosity throughout the year']

# Word2Vec model

__Take clean sentence tokens list, and convert to word tokens list of list__

In [85]:
word_tokens_list_of_list = []

for news_story in corpus_2_list:
    tokenized_story = preprocessing_corpus_2(news_story)
    for sentence in tokenized_story:
        word_tokens_list = word_tokenize(sentence)
        word_tokens_list_of_list.append(word_tokens_list)
        
word_tokens_list_of_list[0:2]

[['lebanese',
  'resistance',
  'forces',
  'detonated',
  'a',
  'bomb',
  'under',
  'an',
  'israeli',
  'occupation',
  'patrol',
  'in',
  'lebanese',
  'territory',
  'two',
  'days',
  'ago'],
 ['three', 'soldiers', 'were', 'killed', 'and', 'two', 'wounded']]

__Build models__

In [86]:
def build_word2vec(list_of_list, dimension_size, window_size, min_obs, model_type, model_name):

    """
    Creates a model object
    Args:
        list_of_list (list): preprocessed text corpus
        dimension_size (int): size of dimensions in model
        window_size (int): window size used for training
        min_count (int): minimum observed instances of a word to be considered
        model_type (binary 1 or 0): 1 = skipgram, 0 = CBOW
        model_name: name of object
    Returns: 
        A trained word2vec model, that is saved as an object
    """

    new_model = gensim.models.Word2Vec(list_of_list, vector_size = dimension_size, window = window_size, 
                                       sg = model_type, min_count = min_obs)
    new_model.save(model_name)

    return new_model

__Create Skipgram model - with different vector and window parameters__

In [87]:
model_sg_50_3 = build_word2vec(word_tokens_list_of_list, 50, 3, 5, 1, "model_sg_50_3")
model_sg_50_5 = build_word2vec(word_tokens_list_of_list, 50, 5, 5, 1, "model_sg_50_5")
model_sg_50_7 = build_word2vec(word_tokens_list_of_list, 50, 7, 5, 1, "model_sg_50_7")
model_sg_100_3 = build_word2vec(word_tokens_list_of_list, 100, 3, 5, 1, "model_sg_100_3")
model_sg_100_5 = build_word2vec(word_tokens_list_of_list, 100, 5, 5, 1, "model_sg_100_5")
model_sg_100_7 = build_word2vec(word_tokens_list_of_list, 100, 7, 5, 1, "model_sg_100_7")
model_sg_200_3 = build_word2vec(word_tokens_list_of_list, 200, 3, 5, 1, "model_sg_200_3")
model_sg_200_5 = build_word2vec(word_tokens_list_of_list, 200, 5, 5, 1, "model_sg_200_5")
model_sg_200_7 = build_word2vec(word_tokens_list_of_list, 200, 7, 5, 1, "model_sg_200_7")

__Create CBOW model - with different vector and window parameters__

In [88]:
model_cbow_50_3 = build_word2vec(word_tokens_list_of_list, 50, 3, 5, 1, "model_cbow_50_3")
model_cbow_50_5 = build_word2vec(word_tokens_list_of_list, 50, 5, 5, 1, "model_cbow_50_5")
model_cbow_50_7 = build_word2vec(word_tokens_list_of_list, 50, 7, 5, 1, "model_cbow_50_7")
model_cbow_100_3 = build_word2vec(word_tokens_list_of_list, 100, 3, 5, 1, "model_cbow_100_3")
model_cbow_100_5 = build_word2vec(word_tokens_list_of_list, 100, 5, 5, 1, "model_cbow_100_5")
model_cbow_100_7 = build_word2vec(word_tokens_list_of_list, 100, 7, 5, 1, "model_cbow_100_7")
model_cbow_200_3 = build_word2vec(word_tokens_list_of_list, 200, 3, 5, 1, "model_cbow_200_3")
model_cbow_200_5 = build_word2vec(word_tokens_list_of_list, 200, 5, 5, 1, "model_cbow_200_5")
model_cbow_200_7 = build_word2vec(word_tokens_list_of_list, 200, 7, 5, 1, "model_cbow_200_7")

In [89]:
print(str(model_cbow_200_7.__repr__()))

<gensim.models.word2vec.Word2Vec object at 0x7f953f3cc890>


__Evaluate models based on identified nearest neighbors words__

In [90]:
evaluation_list = ['government', 'army', 'happy', 'food', 'pride',
                   'wealth', 'overwhelming', 'education','family','computer']

In [91]:
def evaluate_model_nn(model_object, index_model_name, evaluation_word_lists, top_n):

    """
    Identifies top n nearest words for words in a lit
    Args:
        model_object (obj): word2vec model
        index_model_name (str): name of model for row indices naming
        evaluation_word_lists (list): words to be used as reference to identify nearest neighbors
        top_n (int): top n number of nearest neighbors
    Returns: 
        A pd dataframe summary
    """

    
    # Create blank df
    results_df = pd.DataFrame()
    
    for i, word in enumerate(evaluation_word_lists):

        result = model_object.wv.most_similar(word, topn=top_n)
        results_df[word] = [result]

    results_df.index = [str(index_model_name)]

    return results_df

__Evaluating all models at once__

In [92]:
list_model_objects = [model_sg_50_3, model_sg_50_5, model_sg_50_7,
                      model_sg_100_3, model_sg_100_5, model_sg_100_7,
                      model_sg_200_3, model_sg_200_5, model_sg_200_7,
                     model_cbow_50_3, model_cbow_50_5, model_cbow_50_7,
                      model_cbow_100_3, model_cbow_100_5, model_cbow_100_7,
                      model_cbow_200_3, model_cbow_200_5, model_cbow_200_7]
list_index_names = ['sg_50_3', 'sg_50_5', 'sg_50_7',
                      'sg_100_3', 'sg_100_5', 'sg_100_7',
                      'sg_200_3', 'sg_200_5', 'sg_200_7',
                     'cbow_50_3', 'cbow_50_5', 'cbow_50_7',
                      'cbow_100_3', 'cbow_100_5', 'cbow_100_7',
                      'cbow_200_3', 'cbow_200_5', 'cbow_200_7']

agg_results = evaluate_model_nn(list_model_objects[0], list_index_names[0], evaluation_list, 3)

for i in range(1,18):
    # Add row to summary table
    new_row = pd.DataFrame(evaluate_model_nn(list_model_objects[i], list_index_names[i], evaluation_list, 3))
    agg_results = agg_results.append(new_row)

In [93]:
agg_results

Unnamed: 0,government,army,happy,food,pride,wealth,overwhelming,education,family,computer
sg_50_3,"[(greek, 0.804038941860199), (policies, 0.7906...","[(ottoman, 0.8858816623687744), (forces, 0.885...","[(surprised, 0.9102444648742676), (proud, 0.90...","[(trucks, 0.9271546602249146), (minimal, 0.917...","[(husbands, 0.9686049222946167), (mothers, 0.9...","[(steps, 0.9660560488700867), (substance, 0.96...","[(reduction, 0.9647369980812073), (defining, 0...","[(computers, 0.916782021522522), (administrati...","[(escaped, 0.8627897500991821), (dozens, 0.848...","[(voice, 0.8226866722106934), (network, 0.8091..."
sg_50_5,"[(greek, 0.8535125255584717), (genocide, 0.847...","[(russian, 0.8825209140777588), (dictatorship,...","[(careful, 0.8871726989746094), (comfortable, ...","[(friendly, 0.9072898030281067), (wives, 0.903...","[(hearts, 0.9454027414321899), (husbands, 0.94...","[(permits, 0.9637768864631653), (element, 0.94...","[(reduction, 0.9561704397201538), (determining...","[(fredom, 0.8925520181655884), (administrative...","[(whom, 0.8423151969909668), (attackers, 0.835...","[(science, 0.8152944445610046), (fax, 0.812277..."
sg_50_7,"[(xsoviet, 0.836116373538971), (removing, 0.81...","[(dictatorship, 0.9182860851287842), (organize...","[(careful, 0.8877199292182922), (confusing, 0....","[(wives, 0.8881337642669678), (clothing, 0.885...","[(husbands, 0.9269348978996277), (ancestors, 0...","[(element, 0.9534955620765686), (tendency, 0.9...","[(legitimacy, 0.9583801627159119), (promote, 0...","[(fredom, 0.9046643972396851), (psychology, 0....","[(mothers, 0.8127586841583252), (fate, 0.81243...","[(science, 0.8129076957702637), (electrical, 0..."
sg_100_3,"[(greek, 0.7724157571792603), (xsoviet, 0.7724...","[(forces, 0.8722518086433411), (azeri, 0.87006...","[(surprised, 0.9026287794113159), (glad, 0.893...","[(trucks, 0.9336310029029846), (jobs, 0.915132...","[(husbands, 0.966575026512146), (ears, 0.95711...","[(shelter, 0.9640524983406067), (element, 0.96...","[(injustice, 0.9650481343269348), (characteriz...","[(computers, 0.926155149936676), (labor, 0.920...","[(escaped, 0.865105152130127), (japanese, 0.85...","[(voice, 0.8510004281997681), (engineering, 0...."
sg_100_5,"[(xsoviet, 0.7944872975349426), (greek, 0.7880...","[(dictatorship, 0.9032689929008484), (moslem, ...","[(confusing, 0.8958973288536072), (careful, 0....","[(babies, 0.8926121592521667), (clothing, 0.88...","[(husbands, 0.9234886169433594), (colleagues, ...","[(proportional, 0.9533584117889404), (permits,...","[(dominant, 0.9538147449493408), (abuses, 0.95...","[(administrative, 0.8813538551330566), (core, ...","[(vast, 0.8456628322601318), (mothers, 0.84342...","[(science, 0.816775381565094), (electrical, 0...."
sg_100_7,"[(fascist, 0.8196161985397339), (xsoviet, 0.81...","[(dictatorship, 0.8783590197563171), (slaughte...","[(bet, 0.879052996635437), (glad, 0.8663113117...","[(clothing, 0.8769946694374084), (tenant, 0.87...","[(affected, 0.9152090549468994), (ancestors, 0...","[(covering, 0.9431804418563843), (element, 0.9...","[(blatant, 0.9583725333213806), (unacceptable,...","[(tourism, 0.8386237621307373), (objectives, 0...","[(fate, 0.8041701316833496), (travel, 0.774843...","[(electrical, 0.8056811094284058), (science, 0..."
sg_200_3,"[(genocide, 0.7920501828193665), (forces, 0.77...","[(moslem, 0.8863533735275269), (azeri, 0.88231...","[(confusing, 0.9133730530738831), (lucky, 0.90...","[(babies, 0.911504864692688), (clothing, 0.906...","[(husbands, 0.9670428037643433), (mothers, 0.9...","[(pool, 0.9658800363540649), (sysadmin, 0.9644...","[(rushdies, 0.963191568851471), (reduction, 0....","[(condescending, 0.9098212122917175), (labor, ...","[(mothers, 0.8618623614311218), (leaving, 0.85...","[(phone, 0.8377620577812195), (voice, 0.832298..."
sg_200_5,"[(greek, 0.8036022186279297), (xsoviet, 0.7999...","[(moslem, 0.896993100643158), (dictatorship, 0...","[(confusing, 0.8820159435272217), (careful, 0....","[(burn, 0.8931429386138916), (howling, 0.88812...","[(husbands, 0.9505138993263245), (hosts, 0.933...","[(element, 0.9557851552963257), (attribute, 0....","[(unacceptable, 0.9640464186668396), (mercy, 0...","[(fredom, 0.8822068572044373), (welfare, 0.881...","[(millions, 0.8485229015350342), (pictures, 0....","[(mechanical, 0.8181391358375549), (electrical..."
sg_200_7,"[(xsoviet, 0.807157576084137), (crime, 0.78833...","[(dictatorship, 0.8981454372406006), (moslem, ...","[(careful, 0.8562300205230713), (honest, 0.854...","[(clothing, 0.901617705821991), (babies, 0.877...","[(husbands, 0.9211340546607971), (ancestors, 0...","[(element, 0.9483727216720581), (abandon, 0.94...","[(promote, 0.9470370411872864), (indirectly, 0...","[(fredom, 0.86467444896698), (psychology, 0.84...","[(fate, 0.8046351671218872), (mothers, 0.79156...","[(electrical, 0.7914138436317444), (engineerin..."
cbow_50_3,"[(greek, 0.7928274869918823), (territories, 0....","[(forces, 0.8862621188163757), (azeri, 0.88594...","[(confusing, 0.9049172401428223), (comfortable...","[(trucks, 0.919986367225647), (minimal, 0.9139...","[(mothers, 0.9661276340484619), (wives, 0.9611...","[(steps, 0.9725327491760254), (youngsters, 0.9...","[(reduction, 0.9650214910507202), (characteriz...","[(computers, 0.921135663986206), (core, 0.9075...","[(sons, 0.8635645508766174), (escaped, 0.86204...","[(voice, 0.8237599730491638), (network, 0.8104..."


In [94]:
agg_results.to_csv('results_test.csv')