# Vector Space Model
###Nguyễn Hoàng Long - 20521568
###Đặng Thị Tường Vy - 20522176
###Trần Phương Thảo - 20521938

#  Import in-need Library

In [12]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from math import log10

In [13]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [14]:
%cd '/content/drive/MyDrive/BaiTapNhom1_CS419'

/content/drive/MyDrive/BaiTapNhom1_CS419


# Read Data from Cranfield Dataset

In [15]:
corpus = []
for file_name in range(1400):
    with open(f'Cranfield/{file_name+1}.txt', encoding='UTF-8') as f:
        doc = f.readline()
        corpus.append(doc)
        
print('Number of document in corpus: ', len(corpus))
corpus[:5]

Number of document in corpus:  1400


['experimental investigation of the aerodynamics of a wing in a slipstream . an experimental study of a wing in a propeller slipstream was made in order to determine the spanwise distribution of the lift increase due to slipstream at different angles of attack of the wing and at different free stream to slipstream velocity ratios .  the results were intended in part as an evaluation basis for different theoretical treatments of this problem . the comparative span loading curves, together with supporting evidence, showed that a substantial part of the lift increment produced by the slipstream was due to a /destalling/ or boundary layer control effect .  the integrated remaining lift increment, after subtracting this destalling lift, was found to agree well with a potential flow theory . an empirical evaluation of the destalling effects was made for the specific configuration of the experiment . ',
 "simple shear flow past a flat plate in an incompressible fluid of small viscosity . in t

In [16]:
import os
import re

root_path = '/content/drive/MyDrive/BaiTapNhom1_CS419/Cranfield/'

unordered_doc_paths, doc_paths = [], []
for file in os.listdir(root_path):
    if file.endswith(".txt"):
        unordered_doc_paths.append(int(re.sub(r"\D", "", file)))
        
unordered_doc_paths = sorted(unordered_doc_paths)
for s in unordered_doc_paths:
  doc_paths.append(f'{str(s)}.txt')
print(doc_paths)

['1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt', '8.txt', '9.txt', '10.txt', '11.txt', '12.txt', '13.txt', '14.txt', '15.txt', '16.txt', '17.txt', '18.txt', '19.txt', '20.txt', '21.txt', '22.txt', '23.txt', '24.txt', '25.txt', '26.txt', '27.txt', '28.txt', '29.txt', '30.txt', '31.txt', '32.txt', '33.txt', '34.txt', '35.txt', '36.txt', '37.txt', '38.txt', '39.txt', '40.txt', '41.txt', '42.txt', '43.txt', '44.txt', '45.txt', '46.txt', '47.txt', '48.txt', '49.txt', '50.txt', '51.txt', '52.txt', '53.txt', '54.txt', '55.txt', '56.txt', '57.txt', '58.txt', '59.txt', '60.txt', '61.txt', '62.txt', '63.txt', '64.txt', '65.txt', '66.txt', '67.txt', '68.txt', '69.txt', '70.txt', '71.txt', '72.txt', '73.txt', '74.txt', '75.txt', '76.txt', '77.txt', '78.txt', '79.txt', '80.txt', '81.txt', '82.txt', '83.txt', '84.txt', '85.txt', '86.txt', '87.txt', '88.txt', '89.txt', '90.txt', '91.txt', '92.txt', '93.txt', '94.txt', '95.txt', '96.txt', '97.txt', '98.txt', '99.txt', '100.txt', '101.tx

# Read query

In [47]:
with open('Test/query.txt', encoding='UTF-8') as f:
    q = f.readlines()
query = [query.split('\t')[1].strip() for query in q]


# Read relevant documents

In [20]:
relevants = []
for rel_file in range(225):
    with open(f'Test/RES/{rel_file+1}.txt', encoding='UTF-8') as f:
        rel = f.readlines()
        doc_id = [int(doc_id.split()[1].split('\t')[0]) for doc_id in rel]
        relevants.append(doc_id)
        
print('Number of Relevant document: ', len(relevants))
print(f'Relevant document of query 1: {relevants[0]}')

Number of Relevant document:  225
Relevant document of query 1: [184, 29, 31, 12, 51, 102, 13, 14, 15, 57, 378, 859, 185, 30, 37, 52, 142, 195, 875, 56, 66, 95, 462, 497, 858, 876, 879, 880, 486]


# Preprocessing data

In [21]:
def Convert_lowercase(doc):
    return doc.lower()

def Remove_punctuation(doc):
    punctuation = '''!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~\n''' 
    for punc in punctuation:
        if punc in "'":
            doc = doc.replace(punc, '')   
        else:
            doc = doc.replace(punc, ' ')  
    return doc

def Remove_stopword(doc):
    stop_words = stopwords.words('english')
    new_doc = ''
    for word in word_tokenize(doc):
        if word not in stop_words:
                new_doc += word + ' '
    return new_doc

def Standar_number(doc):
    new_doc = ''
    for word in word_tokenize(doc):
        if word.isnumeric():
            new_doc += str(int(word)) + ' '
        else: new_doc += word + ' '
    return new_doc

def Stemming(doc):
    new_doc = ''
    for word in word_tokenize(doc):
        new_doc += PorterStemmer().stem(word) + ' '
    return new_doc


In [25]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
data = []
for doc in corpus:
    doc = Convert_lowercase(doc)
    doc = Remove_punctuation(doc)
    doc = Remove_stopword(doc)
    doc = Standar_number(doc)
    doc = Stemming(doc)
    data.append(doc)
    

In [27]:
data_token = [word_tokenize(data_token) for data_token in data]

In [28]:
print(data_token[0])

['experiment', 'investig', 'aerodynam', 'wing', 'slipstream', 'experiment', 'studi', 'wing', 'propel', 'slipstream', 'made', 'order', 'determin', 'spanwis', 'distribut', 'lift', 'increas', 'due', 'slipstream', 'differ', 'angl', 'attack', 'wing', 'differ', 'free', 'stream', 'slipstream', 'veloc', 'ratio', 'result', 'intend', 'part', 'evalu', 'basi', 'differ', 'theoret', 'treatment', 'problem', 'compar', 'span', 'load', 'curv', 'togeth', 'support', 'evid', 'show', 'substanti', 'part', 'lift', 'increment', 'produc', 'slipstream', 'due', 'destal', 'boundari', 'layer', 'control', 'effect', 'integr', 'remain', 'lift', 'increment', 'subtract', 'destal', 'lift', 'found', 'agre', 'well', 'potenti', 'flow', 'theori', 'empir', 'evalu', 'destal', 'effect', 'made', 'specif', 'configur', 'experi']


# Create Indexing

In [29]:
def term_unique(data_token):
    terms = []
    for i in range(len(data_token)):
        terms.extend(data_token[i])
    terms = sorted(list(set(terms)))
    return terms
                
def Vocabulary_and_Posting(data_token, terms):
    terms = terms                                   # Từ khóa
    num_doc = [0 for i in range(len(terms))]        # Số tài liệu
    freq = [0 for i in range(len(terms))]           # Tần số của từ khóa cho tập từ vựng
    doc_id = [[] for i in range(len(terms))]        # Chỉ số tài liệu
    freq_of_docs = [[] for i in range(len(terms))]  # Tần số từ khóa cho DS Posting

    for i in range(len(terms)):
        for j in range(len(data_token)):
            temp = [index for index, value in enumerate(data_token[j]) if value==terms[i]]
            if len(temp) > 0:
                num_doc[i] += 1
                freq[i] += len(temp)
                doc_id[i].append(j+1)
                freq_of_docs[i].append(len(temp))
    return terms, num_doc, freq, doc_id, freq_of_docs

In [30]:
terms = term_unique(data_token)
len(terms)

4703

In [31]:
terms, num_doc, freq, doc_id, freq_of_docs = Vocabulary_and_Posting(data_token, terms)


In [32]:
pd.DataFrame({'Từ khóa':terms, 'Số tài liệu':num_doc, 'Tần số':freq, 'Chỉ số tài liệu':doc_id, 'Tần số trong mỗi tài liệu':freq_of_docs})

Unnamed: 0,Từ khóa,Số tài liệu,Tần số,Chỉ số tài liệu,Tần số trong mỗi tài liệu
0,0,254,559,"[9, 23, 40, 44, 50, 52, 53, 57, 59, 62, 63, 74...","[2, 1, 1, 1, 1, 1, 4, 3, 1, 1, 1, 1, 2, 1, 1, ..."
1,000degre,1,1,[302],[1]
2,000degreek,2,3,"[572, 1274]","[2, 1]"
3,000k,1,1,[1314],[1]
4,0degre,7,9,"[688, 713, 782, 972, 1075, 1077, 1341]","[1, 1, 1, 1, 1, 3, 1]"
...,...,...,...,...,...
4698,zhukhovitskii,1,1,[270],[1]
4699,zone,13,20,"[14, 126, 167, 218, 243, 455, 828, 960, 1072, ...","[2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 1, 1, 1]"
4700,zoom,1,2,[374],[2]
4701,zuk,1,1,[890],[1]


In [33]:
# Vector tần số của các document 
doc_vec = [[0 for i in range(len(terms))] for _ in range(1400)]

for i in range(len(doc_id)):
    for j in range(len(doc_id[i])):
        doc_vec[doc_id[i][j]-1][i] = freq_of_docs[i][j] 

Bảng thống kê tần số của các từ trong tài liệu

In [34]:
pd.DataFrame(data=doc_vec, columns=terms, index=[f'Doc_{i+1}' for i in range(1400)])

Unnamed: 0,0,000degre,000degreek,000k,0degre,1,10,100,1000,100degre,...,zamm,zbrozek,zehnder,zero,zeroth,zhukhovitskii,zone,zoom,zuk,zurich
Doc_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc_4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc_5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc_1396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc_1397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Doc_1398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc_1399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Calculate TF-IDF of each element in Document

Calculate IDF, IF, W

In [35]:
doc_idf = [log10(len(num_doc)/n_doc) for n_doc in num_doc]
doc_tf = list(doc_vec)
doc_w = list(doc_vec)
for i in range(len(doc_vec)):
    if max(doc_vec[i]) != 0:
        norm = 0
        for j in range(len(doc_vec[i])):
            if doc_vec[i][j] != 0:
                doc_tf[i][j] = log10(doc_vec[i][j]) + 1 
                norm += np.square(doc_tf[i][j]*doc_idf[j])
        norm = np.sqrt(norm)
        doc_w[i] = [(doc_tf[i][j] * doc_idf[j])/norm for j in range(len(doc_tf[i]))]

Ma trận trọng số các các từ trong từng tài liệu

In [36]:
pd.DataFrame(data=doc_w, columns=terms, index=[f'Doc_{i+1}' for i in range(1400)])

Unnamed: 0,0,000degre,000degreek,000k,0degre,1,10,100,1000,100degre,...,zamm,zbrozek,zehnder,zero,zeroth,zhukhovitskii,zone,zoom,zuk,zurich
Doc_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Doc_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Doc_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Doc_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Doc_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc_1396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Doc_1397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.135638,0.0,0.0,0.0,0.0,0.0,0.0
Doc_1398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
Doc_1399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


# Process Query

In [37]:
query_processed = []
for q in query:
    q = Convert_lowercase(q)
    q = Remove_punctuation(q)
    q = Remove_stopword(q)
    q = Standar_number(q)
    q = Stemming(q)
    query_processed.append(q)

In [48]:
query[:0]

[]

In [39]:
query_token = [word_tokenize(q) for q in query_processed]
for i in range(5):
    print(query_token[i])

['similar', 'law', 'must', 'obey', 'construct', 'aeroelast', 'model', 'heat', 'high', 'speed', 'aircraft']
['structur', 'aeroelast', 'problem', 'associ', 'flight', 'high', 'speed', 'aircraft']
['problem', 'heat', 'conduct', 'composit', 'slab', 'solv', 'far']
['criterion', 'develop', 'show', 'empir', 'valid', 'flow', 'solut', 'chemic', 'react', 'ga', 'mixtur', 'base', 'simplifi', 'assumpt', 'instantan', 'local', 'chemic', 'equilibrium']
['chemic', 'kinet', 'system', 'applic', 'hyperson', 'aerodynam', 'problem']


# Calculate TF-IDF of each Query


In [40]:
query_tf = [[0 for i in range(len(terms))] for _ in range(len(query))]
query_idf = list(doc_idf)
query_w = [[0 for i in range(len(terms))] for _ in range(len(query))]

for i in range(len(query_token)):
    norm, check = 0, False
    for j in range(len(query_token[i])):
        if query_token[i][j] in terms:
            index = terms.index(query_token[i][j])
            query_tf[i][index] = log10(query_token[i].count(query_token[i][j]))+1 
            norm += np.square(query_tf[i][index]*query_idf[index])
            check = True
    if check:
        norm = np.sqrt(norm)
        query_w[i] = [(query_tf[i][j] * query_idf[j])/norm for j in range(len(query_tf[i]))]


In [41]:
pd.DataFrame(data=query_w, columns=terms, index=[f'Query_{i+1}' for i in range(len(query))])

Unnamed: 0,0,000degre,000degreek,000k,0degre,1,10,100,1000,100degre,...,zamm,zbrozek,zehnder,zero,zeroth,zhukhovitskii,zone,zoom,zuk,zurich
Query_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Query_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Query_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Query_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Query_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Query_221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Query_222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Query_223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Query_224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Define Cosine Similarity

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
similars = cosine_similarity(query_w, doc_w)   
doc_retrivals = [np.argsort(similars[i])[::-1]+1 for i in range(len(similars))] 

In [43]:
print(similars[0])

[0.         0.06979493 0.         ... 0.         0.02967777 0.        ]


# Calculate MAP

In [44]:
P_of_querys, R_of_querys = [], []

for i in range(len(query)):
    count_rel, P_of_query, R_of_query = 0, [], []
    for j in range(1400):
        if doc_retrivals[i][j] in relevants[i]:
            count_rel += 1
            P_of_query.append(count_rel / (j+1))
            R_of_query.append(count_rel / len(relevants[i]))
        if count_rel == len(relevants[i]):
            break
    P_of_querys.append(P_of_query)
    R_of_querys.append(R_of_query)


In [49]:
AP = []
for i in range(len(query)):
    P = []
    for j in range(11):
        r, max_p = j/10, 0
        for k in range(len(R_of_querys[i])):
            if R_of_querys[i][k] >= r and P_of_querys[i][k] > max_p:
                max_p = P_of_querys[i][k]
        P.append(max_p)
    AP.append(sum(P)/11)
    
MAP = sum(AP)/len(query)

In [50]:
MAP


0.42115873589130604