In [1]:
!pip install python-terrier

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-terrier
  Downloading python-terrier-0.9.1.tar.gz (102 kB)
[K     |████████████████████████████████| 102 kB 30.2 MB/s 
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Collecting pyjnius>=1.4.2
  Downloading pyjnius-1.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 30.2 MB/s 
[?25hCollecting matchpy
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 8.3 MB/s 
[?25hCollecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
Collecting deprecated
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting chest
  Downloading chest-0.2.3.tar.gz (9.6 kB)
Collecting nptyping==1.4.4
  Downloading nptyping-1.4.4-py3-none-any.whl (31 kB)
Collecting ir_datasets>=0.3.2
  Downloading ir_datasets-0.5.4-py3-none-any.whl (311 kB)
[K     

In [2]:
import pandas as pd
# Helpful for showing indexing information
pd.set_option('display.max_colwidth', 150)

import pyterrier as pt
import os

In [3]:
if not pt.started():
    pt.init()

terrier-assemblies 5.7 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.7 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [9]:
df = pd.read_csv('amazon_data.csv', usecols=["productTitle", "productDescription", "URL"], header=0)
df['docno'] = 'd' + (df.reset_index().index).astype(str)
# df.columns = ["productTitle", "productDescription", "docno"] 
df['text'] = df[['productTitle', 'productDescription']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
print(df.head(5))

                                                                                                                                            productTitle  \
0                      Eyeshadow Palette by Almay Longlasting Eye Makeup Primer Enriched with Antioxidant Vitamin E Hypoallergenic 010 Brown Eyes 0.1 Oz   
1                                      Maybelline New York Shadow Blocks Shadow Blocks Eyeshadow Palette Stacked Eye Shadow Trio 82nd & Park Ave 0.08 oz   
2                                                                                                      L.A. Girl Beauty Brick Eyeshadow Nudes 0.42 Ounce   
3  Highly Pigmented Eye Makeup Palette  Matte Shimmer Metallic Eyeshadow Pallet Long Lasting Blendable Natural Colors Make Up Eye Shadows Cosmetics G...   
4  Urban Decay Naked2 Basics Eyeshadow Palette 6 Taupe & Brown Matte Neutral Shades - Ultra-Blendable Rich Colors with Velvety Texture - Makeup Set I...   

                                                               

In [10]:
index_dir = os.path.abspath('amazonindex')
indexer = pt.DFIndexer(index_dir, overwrite=True)
index_ref = indexer.index(df["text"], df["docno"], )
index_ref.toString()

05:24:08.878 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 505 empty documents


'/content/amazonindex/data.properties'

In [11]:
index = pt.IndexFactory.of(index_ref)

In [12]:
print(index.getCollectionStatistics().toString())

Number of documents: 903
Number of terms: 2037
Number of postings: 10969
Number of fields: 0
Number of tokens: 13598
Field names: []
Positions:   false



In [13]:
br_bm25 = pt.BatchRetrieve(index, wmodel="BM25")
br_bm25.search("Maybelline")
br_tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")
br_pl2 = pt.BatchRetrieve(index, wmodel="PL2")

In [14]:
qdf = pd.read_csv("queries.csv", usecols=['qid', 'query'], header=0)
print(qdf)

    qid                             query
0     1              smudge free eyeliner
1     2    moisturiser for sensitive skin
2     3          all skin type foundation
3     4         cruelty free lipstick red
4     5                  smudge free kohl
5     6               waterproof lipstick
6     7   all in one foundation concealer
7     8      soft nudes eyeshadow palette
8     9                red nailpaints set
9    10             long lasting lipstick
10   11  eyebrow pencil set multiple size
11   12             smudge free nailpaint
12   13          highlighting moisturizer
13   14           smoky eyeshadow palette
14   15       best bridal makeup products
15   16     bronzer palette for dark skin
16   17               light blush palette
17   18              cruelty free mascara
18   19         mascara eyeliner kohl set
19   20                   full makeup set


In [15]:
res_bm25 = br_bm25(qdf)
mres_bm25 = res_bm25.groupby('qid').head(50)

res_tfidf = br_tfidf(qdf)
mres_tfidf = res_tfidf.groupby('qid').head(50)

res_pl2 = br_pl2(qdf)
mres_pl2 = res_pl2.groupby('qid').head(50)

In [16]:
newdf = pd.concat([mres_bm25,mres_tfidf,mres_pl2]).drop_duplicates(subset = ['docno', 'query'],
  keep = 'last').reset_index(drop=True)

In [17]:
print(newdf)

     qid  docid docno  rank     score                            query
0      5    291  d291    48  2.926679                 smudge free kohl
1      5    353  d353    49  2.926679                 smudge free kohl
2      7    182  d182    49  3.874830  all in one foundation concealer
3     15      2    d2    38  2.997432      best bridal makeup products
4     20    242  d242    49  3.758191                  full makeup set
...   ..    ...   ...   ...       ...                              ...
1045  20    867  d867    45  2.125177                  full makeup set
1046  20     56   d56    46  2.071672                  full makeup set
1047  20    100  d100    47  2.063838                  full makeup set
1048  20    655  d655    48  1.834009                  full makeup set
1049  20     72   d72    49  1.828804                  full makeup set

[1050 rows x 6 columns]


In [18]:
newdf.to_csv('results.csv',index=False,columns=['query','docno'])

In [19]:
df.to_csv('reference.csv',index=False,columns=['docno','URL', 'text'])