Read the .pdf files


In [2]:
#!pip install --upgrade pymupdf

### Step 1: Implement word-based or subword tokenization

The following code was taken from https://pymupdf.readthedocs.io/en/latest/the-basics.html.

I chose the pymupdf library because it handles ligatures correctly.

In [3]:
import pymupdf

# Extract the plain text of every page of the first paper into output.txt.
# Both the PDF and the output file are opened as context managers so they are
# closed even if text extraction fails part-way through.
with pymupdf.open("paper1.pdf") as doc, open("output.txt", "wb") as out:
    for page in doc:  # iterate the document pages
        text = page.get_text().encode("utf8")  # plain text, encoded as UTF-8 bytes
        out.write(text)  # write text of page
        out.write(bytes((12,)))  # write page delimiter (form feed 0x0C)

In [4]:
import re
# Load the extracted text back as raw bytes, then decode those bytes as UTF-8.
with open("output.txt", mode="rb") as src:
    content = src.read()

decoded_content = str(content, "utf8")
decoded_content

'arXiv:1310.4546v1  [cs.CL]  16 Oct 2013\nDistributed Representations of Words and Phrases\nand their Compositionality\nTomas Mikolov\nGoogle Inc.\nMountain View\nmikolov@google.com\nIlya Sutskever\nGoogle Inc.\nMountain View\nilyasu@google.com\nKai Chen\nGoogle Inc.\nMountain View\nkai@google.com\nGreg Corrado\nGoogle Inc.\nMountain View\ngcorrado@google.com\nJeffrey Dean\nGoogle Inc.\nMountain View\njeff@google.com\nAbstract\nThe recently introduced continuous Skip-gram model is an efﬁcient method for\nlearning high-quality distributed vector representations that capture a large num-\nber of precise syntactic and semantic word relationships. In this paper we present\nseveral extensions that improve both the quality of the vectors and the training\nspeed. By subsampling of the frequent words we obtain signiﬁcant speedup and\nalso learn more regular word representations. We also describe a simple alterna-\ntive to the hierarchical softmax called negative sampling.\nAn inherent limitati

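The following regular expression extracts the words from the text: it matches runs of word characters (letters, digits and underscores), skips tokens that consist only of digits, and leaves out punctuation and other special symbols.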

In [5]:
# Tokenize the first paper: every run of word characters becomes a token,
# except tokens made up only of digits (rejected by the negative lookahead).
token_pattern = re.compile(r'\b(?!\d+\b)\w+\b')
words_dist = token_pattern.findall(decoded_content)

print(words_dist)

['arXiv', '4546v1', 'cs', 'CL', 'Oct', 'Distributed', 'Representations', 'of', 'Words', 'and', 'Phrases', 'and', 'their', 'Compositionality', 'Tomas', 'Mikolov', 'Google', 'Inc', 'Mountain', 'View', 'mikolov', 'google', 'com', 'Ilya', 'Sutskever', 'Google', 'Inc', 'Mountain', 'View', 'ilyasu', 'google', 'com', 'Kai', 'Chen', 'Google', 'Inc', 'Mountain', 'View', 'kai', 'google', 'com', 'Greg', 'Corrado', 'Google', 'Inc', 'Mountain', 'View', 'gcorrado', 'google', 'com', 'Jeffrey', 'Dean', 'Google', 'Inc', 'Mountain', 'View', 'jeff', 'google', 'com', 'Abstract', 'The', 'recently', 'introduced', 'continuous', 'Skip', 'gram', 'model', 'is', 'an', 'efﬁcient', 'method', 'for', 'learning', 'high', 'quality', 'distributed', 'vector', 'representations', 'that', 'capture', 'a', 'large', 'num', 'ber', 'of', 'precise', 'syntactic', 'and', 'semantic', 'word', 'relationships', 'In', 'this', 'paper', 'we', 'present', 'several', 'extensions', 'that', 'improve', 'both', 'the', 'quality', 'of', 'the', 'v

I use the list of stop words from the NLTK library, plus some additions (https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://gist.github.com/sebleier/554280&ved=2ahUKEwi_yr6BisiGAxXehv0HHSQ_Bw0QFnoECBoQAQ&usg=AOvVaw2MnvfyOnFEZMA88TBgjhV1)

In [6]:
# Stop words used to filter the token lists: tokens are lower-cased and dropped
# when they appear in this list. The single-quoted entries come first and the
# double-quoted entries are the additions; some words occur in both parts
# (e.g. 'a'), which does not affect membership tests.
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
              "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
              'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 
              'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
              "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 
              'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
              'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during',
              'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
              'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
              'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 
              'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
              'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn',
              "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
              "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2",
              "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", 
              "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again",
              "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also",
              "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody",
              "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear",
              "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside",
              "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1",
              "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before",
              "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides",
              "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", 
              "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot",
              "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit",
              "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently",
              "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", 
              "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date",
              "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", 
              "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards",
              "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee",
              "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty",
              "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", 
              "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", 
              "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill",
              "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former",
              "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", 
              "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go",
              "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't",
              "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd",
              "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon",
              "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully",
              "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", 
              "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", 
              "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", 
              "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn",
              "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", 
              "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely",
              "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's",
              "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", 
              "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", 
              "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", 
              "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", 
              "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", 
              "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine",
              "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", 
              "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od",
              "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", 
              "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves",
              "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages",
              "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk",
              "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr",
              "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps",
              "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd",
              "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards",
              "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh",
              "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said",
              "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem",
              "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several",
              "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't",
              "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar",
              "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow",
              "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically",
              "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", 
              "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", 
              "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that",
              "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there",
              "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's",
              "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've",
              "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand",
              "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", 
              "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt",
              "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", 
              "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully",
              "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", 
              "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", 
              "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", 
              "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas",
              "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who",
              "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will",
              "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn",
              "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y",
              "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves",
              "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"]



Remove all stop words from the list of tokens

In [7]:
# Membership tests against the stop_words list are linear in its length; a set
# makes each lookup constant-time without changing which tokens are removed.
stop_word_set = set(stop_words)
# Compare the lower-cased token so capitalized stop words are dropped as well.
words_dist = [word for word in words_dist if word.lower() not in stop_word_set]

Do the same steps for the second paper

In [8]:
# Extract the plain text of every page of the second paper into output1.txt.
# Both the PDF and the output file are opened as context managers so they are
# closed even if text extraction fails part-way through.
with pymupdf.open("paper2.pdf") as doc, open("output1.txt", "wb") as out:
    for page in doc:  # iterate the document pages
        text = page.get_text().encode("utf8")  # plain text, encoded as UTF-8 bytes
        out.write(text)  # write text of page
        out.write(bytes((12,)))  # write page delimiter (form feed 0x0C)

In [9]:
# Read the second paper's extracted text back as bytes, decode it as UTF-8 and
# tokenize it: runs of word characters, skipping tokens made up only of digits.
with open("output1.txt", mode="rb") as src:
    content = src.read()

token_pattern = re.compile(r'\b(?!\d+\b)\w+\b')
words_att = token_pattern.findall(str(content, "utf8"))

In [10]:
# Membership tests against the stop_words list are linear in its length; a set
# makes each lookup constant-time without changing which tokens are removed.
stop_word_set = set(stop_words)
# Compare the lower-cased token so capitalized stop words are dropped as well.
words_att = [word for word in words_att if word.lower() not in stop_word_set]

### Step 2:  Implement the Bag of Words model

To do this, I add the words from both texts to a single dictionary, where the key is the word itself and the value is the number of times the word occurs in the two word lists combined.

In [11]:
# Count every remaining token of both papers in one dictionary
# (word -> number of occurrences). The second paper's tokens are processed
# first, so its words come first in the dictionary's insertion order.
bag_words = {}
for word in words_att + words_dist:
    bag_words[word] = bag_words.get(word, 0) + 1
bag_words

{'Provided': 1,
 'proper': 1,
 'attribution': 1,
 'provided': 1,
 'Google': 17,
 'grants': 1,
 'permission': 1,
 'reproduce': 1,
 'tables': 1,
 'figures': 1,
 'paper': 8,
 'solely': 2,
 'journalistic': 1,
 'scholarly': 1,
 'works': 3,
 'Attention': 18,
 'Ashish': 2,
 'Vaswani': 1,
 'Brain': 4,
 'avaswani': 1,
 'google': 14,
 'Noam': 4,
 'Shazeer': 3,
 'noam': 1,
 'Niki': 2,
 'Parmar': 1,
 'nikip': 1,
 'Jakob': 3,
 'Uszkoreit': 2,
 'usz': 1,
 'Llion': 2,
 'Jones': 1,
 'llion': 1,
 'Aidan': 2,
 'Gomez': 1,
 'University': 2,
 'Toronto': 8,
 'aidan': 1,
 'toronto': 1,
 'Łukasz': 3,
 'Kaiser': 7,
 'lukaszkaiser': 1,
 'Illia': 2,
 'Polosukhin': 1,
 'illia': 1,
 'polosukhin': 1,
 'gmail': 1,
 'Abstract': 2,
 'dominant': 1,
 'sequence': 40,
 'transduction': 8,
 'models': 60,
 'based': 15,
 'complex': 2,
 'recurrent': 17,
 'convolutional': 5,
 'neural': 28,
 'networks': 19,
 'include': 1,
 'encoder': 25,
 'decoder': 24,
 'performing': 3,
 'connect': 2,
 'attention': 75,
 'mechanism': 6,
 'propo

In [12]:
# Number of distinct words (dictionary keys) collected from both papers.
len(bag_words)

2031

I want to shrink the vocabulary to roughly 25% of all distinct words, so I keep only the 500 most frequent words.

In [13]:
# Rank the (word, count) pairs by count, highest first. sorted() is stable,
# so words with equal counts keep their dictionary order.
sorted_data = sorted(bag_words.items(), key=lambda entry: entry[1], reverse=True)

# Keep only the words of the 500 top-ranked pairs.
most_used = [word for word, _count in sorted_data[:500]]
most_used

['model',
 'attention',
 'training',
 'models',
 'representations',
 'word',
 'sequence',
 'arXiv',
 'layer',
 'output',
 'Skip',
 'gram',
 'layers',
 'neural',
 'vectors',
 'phrases',
 'Transformer',
 'input',
 'encoder',
 'Table',
 'decoder',
 'learning',
 'vector',
 'translation',
 'task',
 'positions',
 'language',
 'softmax',
 'values',
 'pad',
 'networks',
 'English',
 'Attention',
 'simple',
 'large',
 'trained',
 'vec',
 'Google',
 'recurrent',
 'data',
 'function',
 'preprint',
 'subsampling',
 'network',
 'machine',
 'tasks',
 'work',
 'learned',
 'linear',
 'size',
 'length',
 'based',
 'quality',
 'set',
 'google',
 'Neural',
 'tokens',
 'phrase',
 'frequent',
 'single',
 'position',
 'number',
 'Figure',
 'dmodel',
 'accuracy',
 'Bengio',
 'NCE',
 'time',
 'BLEU',
 'dot',
 'Conference',
 'performance',
 'sentence',
 'heads',
 'Learning',
 'dataset',
 'base',
 'WSJ',
 'Proceedings',
 'hierarchical',
 'log',
 'NEG',
 'architecture',
 'state',
 'product',
 'head',
 'represent

### Step 3: Generate vectors
For each word in the vocabulary, count how many times it occurs in the text and assemble these counts into a vector.

In [14]:
import numpy as np
# Map each vocabulary word to its position once, instead of scanning the
# most_used list twice per token (`in` followed by `.index`).
word_index = {}
for position, vocab_word in enumerate(most_used):
    word_index.setdefault(vocab_word, position)  # first occurrence wins, like list.index

# Count vector of the first paper: one slot per vocabulary position (500 slots).
vector_dist = np.zeros(500, dtype=int)
for word in words_dist:
    index = word_index.get(word)
    if index is not None:  # tokens outside the vocabulary are ignored
        vector_dist[index] += 1
vector_dist

array([35,  0, 43, 23, 38, 40,  1,  3,  2,  4, 33, 32,  0, 13, 25, 27,  0,
        2,  0, 12,  0, 13, 24,  1, 13,  1, 15, 16,  2,  0,  4,  0,  0, 14,
       11,  9, 18,  7,  2, 15,  3,  1, 17,  7,  2,  3,  9,  9,  9,  8,  1,
        8, 10,  6,  8,  2,  5, 12, 14,  2,  0,  3,  3,  0, 11,  6, 12,  5,
        0,  0,  3,  8,  5,  0,  5,  9,  0,  0,  6, 11, 11, 11,  4,  0,  2,
        0,  5,  3,  3,  4,  1,  2,  0,  0, 10, 10,  1,  0,  1,  2,  1,  5,
        0,  2,  6,  0,  6,  0,  0,  5,  1,  4,  1,  9,  6,  7,  0,  0,  1,
        4,  0,  2,  4,  0,  0,  0,  0,  8,  0,  3,  2,  0,  2,  0,  2,  0,
        1,  0,  1,  6,  3,  5,  4,  6,  6,  6,  7,  0,  0,  0,  3,  2,  0,
        1,  0,  1,  1,  2,  2,  0,  0,  0,  0,  2,  3,  2,  0,  2,  5,  1,
        1,  4,  5,  0,  3,  0,  0,  0,  1,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  0,  1,  1,  0,  1,  0,  0,  2,
        1,  0,  0,  0,  2,  0,  0,  0,  0,  3,  0,  2,  4,  3,  0,  2,  3,
        3,  3,  1,  0,  0

In [15]:
# Map each vocabulary word to its position once, instead of scanning the
# most_used list twice per token (`in` followed by `.index`).
word_index = {}
for position, vocab_word in enumerate(most_used):
    word_index.setdefault(vocab_word, position)  # first occurrence wins, like list.index

# Count vector of the second paper: one slot per vocabulary position (500 slots).
vector_att = np.zeros(500, dtype=int)
for word in words_att:
    index = word_index.get(word)
    if index is not None:  # tokens outside the vocabulary are ignored
        vector_att[index] += 1
vector_att

array([41, 75, 22, 37,  9,  6, 39, 33, 33, 29,  0,  0, 29, 15,  2,  0, 26,
       24, 25, 13, 24, 11,  0, 22, 10, 22,  7,  6, 18, 20, 15, 19, 18,  4,
        7,  9,  0, 10, 15,  2, 14, 16,  0,  9, 14, 13,  7,  7,  7,  8, 15,
        7,  5,  9,  6, 12,  9,  2,  0, 11, 13, 10,  9, 12,  1,  6,  0,  6,
       11, 11,  8,  3,  6, 11,  6,  2, 11, 11,  5,  0,  0,  0,  6, 10,  8,
       10,  5,  7,  7,  6,  9,  8, 10, 10,  0,  0,  8,  9,  8,  7,  8,  4,
        9,  7,  3,  9,  3,  9,  9,  4,  8,  5,  8,  0,  2,  1,  8,  8,  7,
        4,  8,  6,  4,  8,  8,  8,  8,  0,  7,  4,  5,  7,  5,  7,  5,  7,
        6,  7,  6,  1,  4,  2,  3,  1,  1,  1,  0,  6,  6,  6,  3,  4,  6,
        5,  6,  5,  5,  4,  4,  6,  6,  6,  6,  4,  3,  4,  6,  4,  1,  5,
        5,  2,  1,  6,  3,  6,  6,  6,  5,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  4,  4,  5,  4,  5,  5,  3,
        4,  5,  5,  5,  3,  5,  5,  5,  5,  2,  5,  3,  1,  2,  5,  3,  2,
        2,  2,  4,  5,  5

Calculations of cosine similarity

In [16]:
def cosine_sim(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. If either vector has zero norm the
        ratio is undefined (0/0); ``0.0`` is returned instead of ``nan``.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # An all-zero vector has no direction; avoid the 0/0 -> nan result
        # (and the accompanying RuntimeWarning) by reporting no similarity.
        return 0.0
    return np.dot(a, b) / denominator

In [17]:
# Cosine similarity between the word-count vectors of the two papers.
cosine_sim(vector_att, vector_dist)

0.3254797972342878

### Step 4: Interpret measured cosine similarity
#### Cosine similarity:

1 means the vectors are identical

0 means the vectors are orthogonal (no similarity)

-1 means the vectors are diametrically opposed (completely dissimilar)

Therefore, the similarity is relatively low. (Because word-count vectors contain no negative values, the result here always lies between 0 and 1.) The texts are not very similar, but there is still some overlap in vocabulary. This indicates that the two documents share some thematic elements but also differ in wording, and each contains content that the other does not.

In [18]:
# Short sample text; the cells that follow tokenize it and use its words as a
# small vocabulary for two example sentences.
text = "I really like fish, it reminds me of Innopolis, Tomatoes is ok, hard to forget Penza. Wish to forget, Sometimes I visit concert, reminds of how hard to get things done, This course is lame. Some fish write this tomatoes lectures. Sometimes Im thinking about Penza"


In [19]:
# Tokenize the sample text and two example sentences with one shared pattern:
# runs of word characters, skipping tokens made up only of digits.
token_pattern = re.compile(r'\b(?!\d+\b)\w+\b')
check = token_pattern.findall(text)
s1 = token_pattern.findall("Tomatoes is ok, hard to forget Penza. Wish to forget.")
s2 = token_pattern.findall("Sometimes I visit concert, reminds of how hard to get things done")

In [23]:
# Word counts for the sample text (word -> number of occurrences).
bag_words = {}
for word in check:
    bag_words[word] = bag_words.get(word, 0) + 1

# The vocabulary is simply the distinct words, in first-occurrence order.
most_used = list(bag_words)
most_used

['I',
 'really',
 'like',
 'fish',
 'it',
 'reminds',
 'me',
 'of',
 'Innopolis',
 'Tomatoes',
 'is',
 'ok',
 'hard',
 'to',
 'forget',
 'Penza',
 'Wish',
 'Sometimes',
 'visit',
 'concert',
 'how',
 'get',
 'things',
 'done',
 'This',
 'course',
 'lame',
 'Some',
 'write',
 'this',
 'tomatoes',
 'lectures',
 'Im',
 'thinking',
 'about']

In [21]:
# Map each vocabulary word to its position once, instead of scanning the
# most_used list twice per token (`in` followed by `.index`).
word_index = {}
for position, vocab_word in enumerate(most_used):
    word_index.setdefault(vocab_word, position)  # first occurrence wins, like list.index

# Count vector of the first example sentence over the vocabulary.
ch1 = np.zeros(len(most_used), dtype=int)
for word in s1:
    index = word_index.get(word)
    if index is not None:  # tokens outside the vocabulary are ignored
        ch1[index] += 1
ch1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [22]:
# Map each vocabulary word to its position once, instead of scanning the
# most_used list twice per token (`in` followed by `.index`).
word_index = {}
for position, vocab_word in enumerate(most_used):
    word_index.setdefault(vocab_word, position)  # first occurrence wins, like list.index

# Count vector of the second example sentence over the vocabulary.
ch2 = np.zeros(len(most_used), dtype=int)
for word in s2:
    index = word_index.get(word)
    if index is not None:  # tokens outside the vocabulary are ignored
        ch2[index] += 1
ch2

array([1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [51]:
# Dot product of the two example vectors (the numerator used by cosine_sim).
np.dot(ch1,ch2)

3

In [52]:
# Euclidean length of the first example vector.
np.linalg.norm(ch1)

3.7416573867739413

In [53]:
# Euclidean length of the second example vector.
np.linalg.norm(ch2)

3.4641016151377544

In [54]:
# Cosine similarity of the two example sentence vectors.
cosine_sim(ch1,ch2)

0.23145502494313788

In [56]:
# Manual cross-check: dot product 3 divided by the product of the two norms,
# with the norms rounded to two decimals (3.74 and 3.46).
3/(3.74*3.46)

0.23183209174368644