In [37]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import numpy as np
import pandas as pd
import re
import spacy
nlp=spacy.load('en_core_web_sm')

In [39]:
documents=pd.read_csv('/content/drive/My Drive/dataset/documents.csv')

In [40]:
documents.head()

Unnamed: 0,docid,author,bibliography,body,title
0,2,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...
1,3,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...
2,5,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...
3,6,"campbell,w.f.","j. ae. scs. 25, 1958, 340.",one-dimensional transient heat flow in a multi...,one-dimensional transient heat flow in a multi...
4,12,"bisplinghoff,r.l.","j. ae. scs. 23, 1956, 289.",some structural and aerelastic considerations ...,some structural and aerelastic considerations ...


In [41]:
documents.shape

(387, 5)

In [42]:
#loading queries
queries=pd.read_csv('/content/drive/My Drive/dataset/queries.csv')

In [43]:
queries.shape

(85, 2)

In [44]:
queries.head()

Unnamed: 0,qid,query
0,1,what similarity laws must be obeyed when const...
1,2,what are the structural and aeroelastic proble...
2,3,what problems of heat conduction in composite ...
3,8,what methods -dash exact or approximate -dash ...
4,10,are real-gas transport properties for air avai...


In [45]:
queries['query'][0]

'what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .'

In [46]:
#loading qrel
qrel=pd.read_csv('/content/drive/My Drive/dataset/qrel.csv')

In [47]:
print(qrel.shape)

(425, 2)


In [48]:
qrel.head(10)

Unnamed: 0,qid,docid
0,1,184
1,1,29
2,1,31
3,1,57
4,1,378
5,2,12
6,2,746
7,2,15
8,2,184
9,2,858


In [49]:
#Importing validation queries and Qrel

In [50]:
#loading validation queries
queries_val=pd.read_csv('/content/drive/My Drive/dataset/queries_val.csv')

In [51]:
queries_val.shape

(22, 2)

In [52]:
queries_val.head()


Unnamed: 0,qid,query
0,189,is there a design method for calculating therm...
1,190,will an analysis of panel flutter based on arb...
2,191,"what is the criterion for true panel flutter, ..."
3,194,how can the analytical solution of the bucklin...
4,196,the problem of similarity for representative i...


In [84]:
# Loading qrel
qrel_val=pd.read_csv('/content/drive/My Drive/dataset/qrel_val.csv')
print('Shape=>',qrel_val.shape)
qrel_val.head(10)

Shape=> (110, 2)


Unnamed: 0,qid,docid
0,189,395
1,189,866
2,189,869
3,189,865
4,189,868
5,190,15
6,190,391
7,190,285
8,190,390
9,190,864


In [53]:
#Reading samples
#reading sample queries
queries['query'].sample(10).values

array(['what are the flutter characteristics of the exposed skin panels of the x-15 vertical stabilizer when subjected to aerodynamic heating .',
       'what is the theoretical heat transfer rate at the stagnation point of a blunt body .',
       'what analytical investigations have been made of the stability of conical shells . how do the results compare with experiment .',
       'what is the effect of cross sectional shape on the flow over simple delta wings with sharp leading edges .',
       'how is the heat transfer downstream of the mass transfer region effected by mass transfer at the nose of a blunted cone .',
       'what agreement is found between theoretically predicted instability times and experimentally measured collapse times for compressed columns in creep .',
       'can the transonic flow around an arbitrary smooth thin airfoil be analysed in a simple approximate way .',
       'what analytical solutions are available for stresses in edge-loaded shells of revolution

In [54]:
documents['body'][10:15].values

array(['a simple model study of transient temperature and thermal stress distribution due to aerodynamic heating .   the present work is concerned with the determination of transient temperatures and thermal stresses in simple models intended to simulate parts or the whole of an aircraft structure of the built- up variety subjected to aerodynamic heating .   the first case considered is that of convective heat transfer into one side of a flat plate, representing a thick skin, and the effect of the resulting temperature distribution in inducing thermal stresses associated with bending restraint at the plate edges . numerical results are presented for the transient temperature differentials in the plate when the environment temperature first increases linearly with time and then remains constant, the period of linear increase representing the time of acceleration of the aircraft .  corresponding thermal stress information is presented .   the second case is that of the wide-flanged i-bea

In [55]:
#Text Pre-processing
def preprocess(text):
  #split on hyphen
  text=re.sub("-"," ",text)
  text=re.sub("[^a-z ]+","",text)
  text=re.sub("[\s]+"," ",text)
  doc=nlp(text)
  #remove stopwords and lemmatize the text
  tokens=[token.lemma_ for token in doc if(token.is_stop==False)]
  return tokens

In [56]:
#pre-processing documents
documents['tokens']=documents['body'].apply(preprocess)
documents.head()

Unnamed: 0,docid,author,bibliography,body,title,tokens
0,2,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...,"[simple, shear, flow, past, flat, plate, incom..."
1,3,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...,"[boundary, layer, simple, shear, flow, past, f..."
2,5,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...,"[dimensional, transient, heat, conduction, dou..."
3,6,"campbell,w.f.","j. ae. scs. 25, 1958, 340.",one-dimensional transient heat flow in a multi...,one-dimensional transient heat flow in a multi...,"[dimensional, transient, heat, flow, multilaye..."
4,12,"bisplinghoff,r.l.","j. ae. scs. 23, 1956, 289.",some structural and aerelastic considerations ...,some structural and aerelastic considerations ...,"[structural, aerelastic, consideration, high, ..."


In [57]:
#pre-processing documents
documents['tokens']=documents['body'].apply(preprocess)

In [58]:
#pre-process queries
queries['tokens']=queries['query'].apply(preprocess)

In [59]:
queries.head()

Unnamed: 0,qid,query,tokens
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic..."
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ..."
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s..."
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese..."
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl..."


In [60]:
queries_val['tokens']=queries_val['query'].apply(preprocess)

In [61]:
queries_val.head()

Unnamed: 0,qid,query,tokens
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ..."
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ..."
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal..."
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo..."
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig..."


In [62]:
#Ranking doc and evaluation using MAP
#jaccard coefficient


In [63]:
#temporary dataframe
temp_doc=documents[['docid','tokens']].copy()

In [64]:
def jaccard_coefficient(dtokens,qtokens):
  numerator=len(set(dtokens).intersection(set(qtokens)))
  denominator=len(set(dtokens).union(set(qtokens)))
  return numerator/denominator

In [65]:
jaccard_coefficient(temp_doc['tokens'][0],queries['tokens'][0])

0.02702702702702703

In [66]:
#getting jaccard coefficient for all the documents for against a sample query
temp_doc['jaccard']=temp_doc['tokens'].apply(lambda x: jaccard_coefficient(x,queries['tokens'][0]))
temp_doc.head(10)

Unnamed: 0,docid,tokens,jaccard
0,2,"[simple, shear, flow, past, flat, plate, incom...",0.027027
1,3,"[boundary, layer, simple, shear, flow, past, f...",0.0
2,5,"[dimensional, transient, heat, conduction, dou...",0.028571
3,6,"[dimensional, transient, heat, flow, multilaye...",0.020408
4,12,"[structural, aerelastic, consideration, high, ...",0.084746
5,15,"[dimensional, panel, flutter, theory, experime...",0.0
6,16,"[transformation, compressible, turbulent, boun...",0.0
7,21,"[heat, transfer, slip, flow, number, author, c...",0.030303
8,23,"[skin, friction, heat, transfer, characteristi...",0.017857
9,24,"[theory, stagnation, point, heat, transfer, di...",0.032609


In [69]:
#DocIDs of the top 10 Most relevant documents
temp_doc.sort_values(by='jaccard',ascending=False).head(10).reset_index(drop=True)

Unnamed: 0,docid,tokens,jaccard
0,12,"[structural, aerelastic, consideration, high, ...",0.084746
1,51,"[theory, aircraft, structural, model, subject,...",0.084746
2,378,"[engineering, relation, friction, heat, transf...",0.073171
3,670,"[blunt, body, heat, transfer, hypersonic, spee...",0.066667
4,875,"[model, aeroelastic, investigation, addendum, ...",0.066667
5,184,"[scale, model, thermo, aeroelastic, research, ...",0.057971
6,1111,"[research, high, speed, flutter, paper, presen...",0.057143
7,436,"[heat, transfer, planetary, atmosphere, super,...",0.055556
8,629,"[second, order, effect, laminar, boundary, lay...",0.055556
9,1305,"[propose, programme, wind, tunnel, test, hyper...",0.055556


In [71]:
#DocIDs of top 5 most relevant documents
temp_doc.sort_values(by='jaccard',ascending=False).head()['docid'].values

array([ 12,  51, 378, 670, 875])

In [72]:
#function for finding jaccard_coefficient
def jaccard_rank(qtokens):
  temp_doc['jaccard']=temp_doc['tokens'].apply(lambda x:jaccard_coefficient(x,qtokens))
  relevant_docids=temp_doc.sort_values(by='jaccard',ascending=False).head()['docid'].values
  return relevant_docids

In [73]:
#Ranking documents according to jaccard coefficient
queries['jaccard_rel']=queries['tokens'].apply(lambda x:jaccard_rank(x))
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[12, 51, 378, 670, 875]"
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]"
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]"
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]"
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 583, 616]"


In [74]:
#Evaluation on train set
#adding ground truth in a column
queries['ground_truth']=queries['qid'].apply(lambda x:qrel[qrel['qid']==x]['docid'].values)
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[12, 51, 378, 670, 875]","[184, 29, 31, 57, 378]"
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]"
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]"
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]"
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 583, 616]","[259, 405, 302, 436, 437]"


:\\[\Large Average\,Precision=\frac{\sum_{k=1}^{n}(P(k) \times rel(k))}{No.\,of\,relevant\,documents}\\]

In [75]:
def average_precision(model_rel,ground_truth):
  tp=0
  precisions=[]
  #finding precision at positions at which relevant document is returned
  for index,value in enumerate(model_rel):
    if value in ground_truth:
      tp+=1
      precisions.append(tp/(index+1))
  if precisions==[]:
    return 0
  return np.mean(precisions)

In [77]:
average_precision([5,6,1,2,4],[1,2,3,4,5])

0.8041666666666667

In [78]:
queries['jaccard_ap']=queries.apply(lambda x:average_precision(x['jaccard_rel'],x['ground_truth']),axis=1)

In [79]:
queries.head()

Unnamed: 0,qid,query,tokens,jaccard_rel,ground_truth,jaccard_ap
0,1,what similarity laws must be obeyed when const...,"[similarity, law, obey, construct, aeroelastic...","[12, 51, 378, 670, 875]","[184, 29, 31, 57, 378]",0.333333
1,2,what are the structural and aeroelastic proble...,"[structural, aeroelastic, problem, associate, ...","[12, 51, 700, 746, 875]","[12, 746, 15, 184, 858]",0.75
2,3,what problems of heat conduction in composite ...,"[problem, heat, conduction, composite, slab, s...","[5, 584, 6, 145, 582]","[5, 6, 90, 91, 119]",0.833333
3,8,what methods -dash exact or approximate -dash ...,"[method, dash, exact, approximate, dash, prese...","[122, 1306, 639, 655, 988]","[48, 122, 354, 360, 1005]",1.0
4,10,are real-gas transport properties for air avai...,"[real, gas, transport, property, air, availabl...","[405, 302, 436, 583, 616]","[259, 405, 302, 436, 437]",1.0


In [81]:
#finding mean average precision
queries['jaccard_ap'].mean()

0.49555555555555564

In [82]:
#Evaluation on validation set


In [85]:
#adding ground truth in a column
queries_val['ground_truth']=queries_val['qid'].apply(lambda x:qrel_val[qrel_val['qid']==x]['docid'].values)

In [86]:
#ranking documents accourding to jaccard coefficient
queries_val['jaccard_rel']=queries_val['tokens'].apply(lambda x:jaccard_rank(x))

In [87]:
#finding average precision for each query
queries_val['jaccard_ap']=queries_val.apply(lambda x:average_precision(x['jaccard_rel'],x['ground_truth']),axis=1)

In [88]:
queries_val.head()

Unnamed: 0,qid,query,tokens,ground_truth,jaccard_rel,jaccard_ap
0,189,is there a design method for calculating therm...,"[design, method, calculate, thermal, fatigue, ...","[395, 866, 869, 865, 868]","[868, 1306, 833, 906, 909]",1.0
1,190,will an analysis of panel flutter based on arb...,"[analysis, panel, flutter, base, arbitrarily, ...","[15, 391, 285, 390, 864]","[390, 1008, 285, 21, 391]",0.755556
2,191,"what is the criterion for true panel flutter, ...","[criterion, true, panel, flutter, oppose, smal...","[914, 915, 285, 857, 858]","[285, 31, 864, 728, 15]",1.0
3,194,how can the analytical solution of the bucklin...,"[analytical, solution, buckle, strength, unifo...","[739, 740, 742, 743, 744]","[932, 744, 1050, 1172, 1171]",0.5
4,196,the problem of similarity for representative i...,"[problem, similarity, representative, investig...","[51, 185, 874, 875, 876]","[875, 1008, 184, 864, 655]",1.0


In [89]:
#finding mean average precision
print('MAP',queries_val['jaccard_ap'].mean())

MAP 0.4431818181818181


# TERM FREQUENCY




