# Vector Space Model

Simple implementation, quite inefficient.

In [17]:
from collections import defaultdict
from math import log,sqrt #tf-idf
import re #regular expression 

In [10]:
def import_dataset():
    articles = []
    with open('collections/TIME.ALL') as f:
        tmp = []
        for row in f:
            if row.startswith('*TEXT'):
                if tmp!= []:
                    articles.append(tmp)
                #empty collection of the terms we found so far
                tmp = []
            else:
                #keep only alphabetic strings
                row = re.sub(r'^a-zA-Z\s','', row)
                tmp += row.split()
        return articles

In [12]:
import_dataset()[2]

['BERLIN',
 'ONE',
 'LAST',
 'RUN',
 'HANS',
 'WEIDNER',
 'HAD',
 'BEEN',
 'HOPING',
 'FOR',
 'MONTHS',
 'TO',
 'ESCAPE',
 'DRAB',
 'EAST',
 'GERMANY',
 'AND',
 'MAKE',
 'HIS',
 'WAY',
 'TO',
 'THE',
 'WEST',
 '.',
 'THE',
 'ODDS',
 'WERE',
 'AGAINST',
 'HIM,',
 'FOR',
 'WEIDNER,',
 '40,',
 'WAS',
 'A',
 'CRIPPLE',
 'ON',
 'CRUTCHES',
 'WHO',
 'LIVED',
 'IN',
 'THE',
 'VILLAGE',
 'OF',
 'NEUGERSDORF,',
 '115',
 'MILES',
 'SOUTHEAST',
 'OF',
 'THE',
 'FRONTIER',
 'OF',
 'FREEDOM',
 'BUT',
 'HANS',
 'WEIDNER',
 'DID',
 'HAVE',
 'ONE',
 'MAJOR',
 'ASSET,',
 'THE',
 'BUS',
 'THAT',
 'HE',
 'OPERATED',
 'FOR',
 'THE',
 'LOCAL',
 'COMMUNIST',
 'REGIME',
 '.',
 'IT',
 'WAS',
 'AN',
 'UGLY',
 'THING,',
 'AND',
 'ANCIENT',
 '.',
 'ITS',
 'CHASSIS',
 'CREAKED,',
 'AND',
 'THE',
 'ENGINE',
 'COUGHED',
 ';',
 'A',
 'CREAM-COLORED',
 'COAT',
 'OF',
 'PAINT',
 'COULD',
 'NOT',
 'DISGUISE',
 'THE',
 'WELTS',
 'AND',
 'BRUISES',
 'OF',
 'TWO',
 'DECADES',
 'OF',
 'CHUGGING',
 'SERVICE',
 '.',
 'IN',
 

We now want to index: we need term frequency. 

We also want to build positional index, for phrase queries. 

In [18]:
def make_positional_index(articles):
    index = defaultdict(dict)
    for docid, article in enumerate(articles):
        #for each of the articles we can take the term and the position
        #in which the terms appear
        for pos, term in enumerate(article):
            try:
                index[term][docid].append(pos)
            except KeyError:
                index[term][docid] = [pos]
    return index

In [20]:
articles = import_dataset()
index = make_positional_index(articles)
index['BERLIN']

{2: [0, 302, 393, 522],
 8: [379],
 14: [158, 231],
 46: [3880],
 73: [160, 214],
 80: [0, 14, 169, 192, 220],
 205: [1, 65, 131, 249, 332, 490, 530],
 271: [873],
 278: [14, 179, 475, 666],
 279: [69],
 294: [680, 718],
 298: [0, 14, 32],
 305: [645, 651, 662],
 315: [11, 115],
 342: [3979],
 363: [79],
 366: [266, 507, 1346],
 379: [263, 292],
 401: [20],
 414: [37, 143]}

In [21]:
#vector representing the document
def document_as_vectors(articles):
    p_index = make_positional_index(articles)
    vectors = [] #all the vectors representing all the documents
    n = len(articles)
    idf = {}
    for term in p_index.keys():
        idf[term] = log(n/len(p_index[term]))
    for docid in range(0, len(articles)):
        v = {}
        for term in p_index.keys():
            #for each term we have a component of this vector to fill
            try: 
                tfidf = len(p_index[term][docid]) * idf[term]
            except KeyError:
                tfidf = 0
            v[term] = tfidf
        #need to fill v with entry which contain the weight given by tfidf
        vectors.append(v)
    return vectors

In [24]:
vectors = document_as_vectors(articles)
vectors[0]

{'THE': 0.0,
 'ALLIES': 5.355418968099076,
 'AFTER': 1.9269837964911969,
 'NASSAU': 12.75973753442387,
 'IN': 0.023724803536303955,
 'DECEMBER': 3.6471100412376414,
 '1960,': 3.405947984420753,
 'U.S': 13.03283372193274,
 '.': 0.12336897838878057,
 'FIRST': 3.4980852759914405,
 'PROPOSED': 2.78690877601453,
 'TO': 0.06405696954802068,
 'HELP': 5.802394349588101,
 'NATO': 12.44828626273299,
 'DEVELOP': 7.931127544712352,
 'ITS': 3.721035975097208,
 'OWN': 3.87932599520874,
 'NUCLEAR': 22.838051983424492,
 'STRIKE': 5.287615864747713,
 'FORCE': 13.009317649915312,
 'BUT': 0.451807439313483,
 'EUROPE': 4.187523190909169,
 'MADE': 1.1774708635804294,
 'NO': 3.0346300255640823,
 'ATTEMPT': 2.5186447894198505,
 'DEVISE': 5.351858133476067,
 'A': 0.0,
 'PLAN': 2.283805198342449,
 'LAST': 0.39833882171774687,
 'WEEK,': 1.0145673926435763,
 'AS': 0.680298332735925,
 'THEY': 1.3845270157629384,
 'STUDIED': 3.965563772356176,
 'ACCORD': 4.658710952916121,
 'BETWEEN': 1.4006144148946391,
 'PRESIDE

In [25]:
def show_document_vector(v, docid):
    #we want to remove all the zero terms
    non_zero_terms = [x for x in v[docid].keys() if v[docid][x]>0]
    vector = [(x,v[docid][x]) for x in non_zero_terms]
    vector.sort(key=lambda x: x[1], reverse=True)
    length = sqrt(sum([x[1]**2 for x in vector]))
    normalized = {k: tfidf/length for k,tfidf in vector}
    for(term, tfidf) in vector:
        print(f"{term}:\t{tfidf}\t(normalized:{normalized[term]})")