 Information Retrieval System using Word2Vec Model and vector Space Model

<h4>1. Loading the cran1400 dataset</h4>

In [1]:
!pip install gensim



In [2]:
import gensim
import pandas as pd

In [11]:
df = pd.read_xml("cran.all.1400.xml",xpath='//doc')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   docno   1400 non-null   int64 
 1   title   1398 non-null   object
 2   author  1347 non-null   object
 3   bib     1330 non-null   object
 4   text    1398 non-null   object
dtypes: int64(1), object(4)
memory usage: 54.8+ KB


In [13]:
df.head()

Unnamed: 0,docno,title,author,bib,text
0,1,experimental investigation of the aerodynamics...,"brenckman,m.","j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...
1,2,simple shear flow past a flat plate in an inco...,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...
2,3,the boundary layer in simple shear flow past a...,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...
3,4,approximate solutions of the incompressible la...,"yen,k.t.","j. ae. scs. 22, 1955, 728.",approximate solutions of the incompressible la...
4,5,one-dimensional transient heat conduction into...,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...


<h4>2. Data preprocessing</h4>

In [14]:
#Data preprocessing
#removing null and duplicate values

df.drop_duplicates(['text'], inplace=True)
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1327 entries, 0 to 1399
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   docno   1327 non-null   int64 
 1   title   1327 non-null   object
 2   author  1327 non-null   object
 3   bib     1327 non-null   object
 4   text    1327 non-null   object
dtypes: int64(1), object(4)
memory usage: 62.2+ KB


In [15]:
#further preprocessing
#converting all the words to lower case, trimming spaces, removing punctuations,removing stopwords
#This would also TOKENIZE the text data

text = df.text.apply(gensim.utils.simple_preprocess)

In [16]:
text

0       [experimental, investigation, of, the, aerodyn...
1       [simple, shear, flow, past, flat, plate, in, a...
2       [the, boundary, layer, in, simple, shear, flow...
3       [approximate, solutions, of, the, incompressib...
4       [one, dimensional, transient, heat, conduction...
                              ...                        
1395    [shear, buckling, of, clamped, and, simply, su...
1396    [critical, shear, stress, of, an, infinitely, ...
1397    [stability, of, rectangular, plates, under, sh...
1398    [buckling, of, transverse, stiffened, plates, ...
1399    [the, buckling, shear, stress, of, simply, sup...
Name: text, Length: 1327, dtype: object

In [17]:
text.loc[0]
# len(text.loc[0])

['experimental',
 'investigation',
 'of',
 'the',
 'aerodynamics',
 'of',
 'wing',
 'in',
 'slipstream',
 'an',
 'experimental',
 'study',
 'of',
 'wing',
 'in',
 'propeller',
 'slipstream',
 'was',
 'made',
 'in',
 'order',
 'to',
 'determine',
 'the',
 'spanwise',
 'distribution',
 'of',
 'the',
 'lift',
 'increase',
 'due',
 'to',
 'slipstream',
 'at',
 'different',
 'angles',
 'of',
 'attack',
 'of',
 'the',
 'wing',
 'and',
 'at',
 'different',
 'free',
 'stream',
 'to',
 'slipstream',
 'velocity',
 'ratios',
 'the',
 'results',
 'were',
 'intended',
 'in',
 'part',
 'as',
 'an',
 'evaluation',
 'basis',
 'for',
 'different',
 'theoretical',
 'treatments',
 'of',
 'this',
 'problem',
 'the',
 'comparative',
 'span',
 'loading',
 'curves',
 'together',
 'with',
 'supporting',
 'evidence',
 'showed',
 'that',
 'substantial',
 'part',
 'of',
 'the',
 'lift',
 'increment',
 'produced',
 'by',
 'the',
 'slipstream',
 'was',
 'due',
 'to',
 'destalling',
 'or',
 'boundary',
 'layer',
 '

<h3>3.  Training the Word2Vec Model for this "Text" vocabulary</h3>

In [18]:
#initialize the model

model = gensim.models.Word2Vec(
    window=10,
    min_count=3,
    workers=4,
    vector_size= 50
)

In [19]:
#Build Vocabulary

model.build_vocab(text, progress_per=1000)

In [20]:
#Train the Word2Vec Model

model.train(text, total_examples=model.corpus_count, epochs=model.epochs)  #5 epochs

(726541, 1025970)

In [21]:
model.wv.most_similar("experimental")

[('data', 0.9634652733802795),
 ('theoretical', 0.9372055530548096),
 ('results', 0.9206819534301758),
 ('comparison', 0.9140534996986389),
 ('comparisons', 0.9042792916297913),
 ('agreement', 0.8890647292137146),
 ('compared', 0.8767828345298767),
 ('some', 0.858837366104126),
 ('good', 0.8488130569458008),
 ('with', 0.8470890522003174)]

In [22]:
#word embedding of experimental
model.wv["experimental"]

array([ 0.23934369, -0.7252128 , -0.58290356, -0.5528637 ,  0.7490001 ,
       -0.26915807,  1.5763988 ,  1.759875  , -1.1525388 , -1.1485881 ,
       -0.81376517, -1.6602665 , -0.42590088, -0.03395737, -0.2880086 ,
       -0.19477849,  1.1360688 ,  0.86494726, -1.0666451 , -1.8359363 ,
       -0.03320959, -0.98527515,  0.5241939 ,  0.7433294 ,  0.2649617 ,
       -0.32141587, -2.0356393 ,  0.665682  , -0.9272621 ,  0.99003273,
       -0.09443124,  0.33071414,  1.2861125 , -0.9409879 ,  0.9586617 ,
       -0.04462381,  1.1148787 , -1.5347735 ,  0.27425146, -0.2685301 ,
       -0.58698297,  0.3550569 ,  0.619771  ,  0.7756953 ,  2.6267266 ,
       -0.32815835,  0.738016  , -0.63965726, -0.9316468 , -0.71121055],
      dtype=float32)

<h3>4. Using Word Centroid Similarity (WCS) for comparing docs/text articles to queries<h3>

In [23]:
#calculate the centeroid for each text article

import numpy as np

a = [0.0]*50
df["centroid"] = [a]*df.shape[0]

for index, row in df.iterrows():

    centroid = np.array([0.0]*50)
    for word in text.loc[index]:
#         print(word,end=" ")
        try:
            b = model.wv[word]
#             print(b,end=" ")
        except:
            continue
        centroid = np.add(centroid, b)
    centroid /= len(text.loc[index])
    df.at[index,'centroid'] = centroid.tolist()
df.head()

Unnamed: 0,docno,title,author,bib,text,centroid
0,1,experimental investigation of the aerodynamics...,"brenckman,m.","j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...,"[0.10740402801567336, -0.11062489738899538, -0..."
1,2,simple shear flow past a flat plate in an inco...,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,"[-0.3041015500703486, -0.16698943851615794, -0..."
2,3,the boundary layer in simple shear flow past a...,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,"[-0.7226809281855822, -0.2572224795197447, -0...."
3,4,approximate solutions of the incompressible la...,"yen,k.t.","j. ae. scs. 22, 1955, 728.",approximate solutions of the incompressible la...,"[-0.5725004105024958, -0.2571714715585623, -0...."
4,5,one-dimensional transient heat conduction into...,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,"[0.10923023298382759, -0.6688481829501689, -0...."


<h3>5.  Ranking Documents to given Query</h3>

In [35]:
#This function will compare each text article with given query on basis of cosine simirality 
#and rank them according to their score

#based on word wise similarity of each word of the query
def rank_docs1(model, query, df, num=5) :
    
    cosine_list = []
    
    a = []
    query = query.split(" ")
    for q in query:
        try:
            a.append(model.wv[q])
        except:
            continue
    
    for index, row in df.iterrows():
        centroid = row['centroid']
        total_sim = 0
        for a_i in a:
            if np.linalg.norm(a_i) == 0 or np.linalg.norm(centroid) == 0:
                cos_sim = 0 
            else:
                cos_sim = np.dot(a_i, centroid)/(np.linalg.norm(a_i)*np.linalg.norm(centroid))
            total_sim += cos_sim
        cosine_list.append((row['docno'],row['title'],total_sim)) 
    cosine_list.sort(key=lambda x:x[2], reverse=True)  
     
    text_list = []
    for item in cosine_list[:num]:
        text_list.append((item[0], item[1], item[2]))
    return text_list

In [40]:
#based on content similarity of the query with that of document i.e comparing centroids of both

def rank_docs2(model, query, df, num=5) :
    #[(docno.,title,cosine_sim)]
    cosine_list = []
    
    a = []
    query = query.split(" ")
    for q in query:
        try:
            a.append(model.wv[q])
        except:
            continue
    
    #compute centroid of query
    query_centroid = np.array([0.0]*50)
    for b in a:
        query_centroid = np.add(query_centroid, b)
    query_centroid /= len(a)
    
    
    for index, row in df.iterrows():
        centroid = row['centroid']
        cos_sim = 0
        
        if np.linalg.norm(query_centroid) == 0 or np.linalg.norm(centroid) == 0:
            cos_sim = 0 
        else:
            cos_sim = np.dot(query_centroid, centroid)/(np.linalg.norm(query_centroid)*np.linalg.norm(centroid))
            
        cosine_list.append((row['docno'],row['title'],cos_sim)) 
    
    
    cosine_list.sort(key=lambda x:x[2], reverse=True)  
     
    text_list = []
    for item in cosine_list[:num]:
        text_list.append((item[0], item[1], item[2]))
    return text_list

In [41]:
#Loading Query dataset
df_query = pd.read_xml("cran.qry.xml")

In [42]:
df_query.head()

Unnamed: 0,num,title
0,1,what similarity laws must be obeyed when const...
1,2,what are the structural and aeroelastic proble...
2,4,what problems of heat conduction in composite ...
3,8,can a criterion be developed to show empirical...
4,9,what chemical kinetic system is applicable to ...


In [43]:
df_query.loc[0]['title']

'what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft .'

In [45]:
for query in df_query[0:5]['title']:
    result_list1=rank_docs1(model, query, df)
    result_list2=rank_docs2(model, query, df)
    print("-----------------------------------------------------------------------------------------------")
    print("query: ",query)
    print("-----------------------------------------------------------------------------------------------")
    print("Result by function 1 i.e Word wise similartiy")
    for i in range(5):
        print(result_list1[i])

    print("Result")
    for i in range(5):
        print(result_list2[i])
    print()
    print()
    
        

-----------------------------------------------------------------------------------------------
query:  what similarity laws must be obeyed when constructing aeroelastic models
of heated high speed aircraft .
-----------------------------------------------------------------------------------------------
Result by function 1 i.e Word wise similartiy
(42, 'the gyroscopic effect of a rigid rotating propeller\non engine and wing vibration modes .', 10.088365312702471)
(834, 'limit design for economical missile structures .', 10.057714569316756)
(718, 'means and examples of aeronautical research in france at onera .', 10.05473679065516)
(768, 'formulae for use with the fatigue load meter in the\nassessment of wing fatigue life .', 10.045988215839664)
(640, 'the design of structures to resist jet noise fatigue .', 10.037618678627108)
Result
(1380, 'the problem of obtaining high lift-drag ratios at supersonic speeds .', 0.9757695832420161)
(811, 'an investigation of lifting effects on the int

In [46]:
#saving the model and dataset
model.save("./model.model")
df.to_pickle("./df.pkl")