# Part 2: Indexing and Evaluation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Imports
from collections import defaultdict
from array import array
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import re
import pandas as pd
import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import spacy


nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import json
import csv


#Path of the resulting JSON file of part 1
docs_path = '/content/drive/Shareddrives/IRWA/IRWA_data_2023/Rus_Ukr_war_data_clean.json'
csv_path = '/content/drive/Shareddrives/IRWA/IRWA_data_2023/Rus_Ukr_war_data_ids.csv'

doc_ids = []

#First we will load the data
with open(csv_path, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    for row in reader:
        doc_id = row[0]
        doc_ids.append(doc_id)

#Next we will load the ids into a map
with open(docs_path) as doc_fp:
    tweets = [json.loads(line) for line in doc_fp.readlines()]

#Get the tweet IDs for all tweets
for tweet, doc_id in zip(tweets, doc_ids):
    tweet['id'] = doc_id

tweets[430]


{'full_text_tokenized': ['artilleri',
  'ukrain',
  '#russian',
  '#russia',
  '#ukrain',
  '#ukrainian',
  '#ukrainewar',
  '#ukrainerussiawar',
  '#russiaisaterroristst',
  'https//tco/oyvpwjcxz'],
 'created_at': 1664548988000,
 'hashtags': ['Russians',
  'Russia',
  'Ukraine',
  'Ukrainian',
  'UkraineWar',
  'UkraineRussiaWar',
  'RussiaIsATerroristState'],
 'favorite_count': 26,
 'retweet_count': 4,
 'url': 'https://twitter.com/infussambas/status/1575858789647286272',
 'id': 'doc_431'}

## Indexing

### Creating the index

In [4]:
def create_index(tweets):
    """
    Implement the inverted index

    Argument:
    tweets -- collection of tweets

    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of documents where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)

    for idx, tweet in enumerate(tweets):
        tweet_id = idx+1 #calculate tweet_id
        terms = tweet["full_text_tokenized"] #get full tokenized text of the tweet

        current_page_index = {} #create a dictionary

        #loop through all terms in tokenized text
        for position, term in enumerate(terms):
            try:
                # if the term is already in the index for the current page (current_page_index)
                # append the position to the corresponding list

                current_page_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term]=[tweet_id, array('I',[position])] #'I' indicates unsigned int (int in Python)

        #merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)


    return index

In [5]:
import time
#Create indexes for the tweets
start_time = time.time()
index = create_index(tweets)
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2)))

Total time to create the index: 0.3 seconds


In [6]:
#Find positions in the documents where the term 'fascist' appears
print("Index results for the term 'fascist': {}\n".format(index['fascist']))
print("First 10 Index results for the term 'fascist': \n{}".format(index['fascist'][:10]))

Index results for the term 'fascist': [[1, array('I', [4])], [136, array('I', [7])], [230, array('I', [3])], [240, array('I', [11])], [413, array('I', [1])], [437, array('I', [7])], [2403, array('I', [4])], [2954, array('I', [1])], [3853, array('I', [4])]]

First 10 Index results for the term 'fascist': 
[[1, array('I', [4])], [136, array('I', [7])], [230, array('I', [3])], [240, array('I', [11])], [413, array('I', [1])], [437, array('I', [7])], [2403, array('I', [4])], [2954, array('I', [1])], [3853, array('I', [4])]]


### Querying

In [7]:
def remove_emojis(text):
    """
    Remove all emojis from the given text

    Argument:
    text -- the text to clean

    Returns:
    text - the same text without emojis
    """

    #All emojis are represented in Unicode, so we can remove them by using RE
    #Regular expression found at https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emojis_expression = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002500-\U00002BEF"  # chinese char
                            u"\U00002702-\U000027B0"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # dingbats
                            u"\u3030"
                            "]+", flags=re.UNICODE)

    return re.sub(emojis_expression, "", text)

def preprocess(text):
    """
    Clean and tokenize the text

    Argument:
    text -- the text to clean

    Returns:
    text - the list of tokens obtained from the input text.
    """
    stemmer = PorterStemmer()
    stop_words = stopwords.words("english")
    punctuation = r"[!¡?¿.:,;()\[\]{}\-\'\"]"

    text = remove_emojis(text) #Remove emojis
    text = text.strip() #Remove whitespaces at the beginning and at the end
    text = text.lower() ## Transform in lowercase
    text = re.sub(punctuation, "", text) ## Eliminate puntuation marks
    text = text.split() ## Tokenize the text to get a list of terms
    text = [token for token in text if token not in stop_words] ## Eliminate the stopwords
    text = [stemmer.stem(token) for token in text] ## Perform stemming

    return text

In [8]:
def search(query, index):
    """
    The output is the list of documents that contain any of the query terms.
    So, we will get the list of documents for each query term, and take the union of them.

    Argument:

    query -- name of the query
    index -- created index


    Returns:
    docs - list of tweets IDs and url of the term of the query
    """

    #first we clean and tokenize the query
    query = preprocess(query)
    docs = set()
    #loop trough every word in the query
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs=[posting[0] for posting in index[term]]
            # docs = docs Union term_docs
            docs = docs.union(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    return docs

In [74]:
#Search the tweet id of the first query
query1 = "Zelenski"
docs = search(query1, index)
#Get 20 samples of tweets
top = 20
id_to_tweet = {tweet['id']: id for id, tweet in enumerate(tweets)}

print("\n======================\nSample of {} results out of {} for the query {}:\n".format(top, len(docs), query1))
for d_id in docs[:top]:
     print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))



Sample of 20 results out of 36 for the query Zelenski:

tweet id = 131 - tweet: https://twitter.com/FreeCiviliansUA/status/1575905170952982528
tweet id = 3210 - tweet: https://twitter.com/FreeCiviliansUA/status/1575272563902746624
tweet id = 2318 - tweet: https://twitter.com/Militarylandnet/status/1575510001594015744
tweet id = 783 - tweet: https://twitter.com/Conor_Dempsey/status/1575820638362574848
tweet id = 3087 - tweet: https://twitter.com/slim_swede/status/1575315693905580032
tweet id = 1300 - tweet: https://twitter.com/AnnevelinkD/status/1575759357370736640
tweet id = 150 - tweet: https://twitter.com/Kegan653/status/1575902847216095232
tweet id = 408 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575861873060499456
tweet id = 411 - tweet: https://twitter.com/knittingknots/status/1575861425133981696
tweet id = 2078 - tweet: https://twitter.com/G700Sway/status/1575573525024251904
tweet id = 287 - tweet: https://twitter.com/Shadi_Alkasim/status/1575885728701026304
tweet id = 4

In [75]:
#Second query
query2 = "Crimea"
docs = search(query2, index)
top = 20

print("\n======================\nSample of {} results out of {} for the query {}:\n".format(top, len(docs), query2))
for d_id in docs[:top]:
     print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))



Sample of 20 results out of 26 for the query Crimea:

tweet id = 3856 - tweet: https://twitter.com/K13News/status/1575173336665300992
tweet id = 2197 - tweet: https://twitter.com/DarwinBedford/status/1575538366179057664
tweet id = 2453 - tweet: https://twitter.com/infussambas/status/1575476108077985792
tweet id = 2336 - tweet: https://twitter.com/knittingknots/status/1575505128337309696
tweet id = 3361 - tweet: https://twitter.com/Odessa_Journal/status/1575241438937239552
tweet id = 170 - tweet: https://twitter.com/NBRnews/status/1575900221141241856
tweet id = 3883 - tweet: https://twitter.com/andreleoso/status/1575168930389569536
tweet id = 175 - tweet: https://twitter.com/knittingknots/status/1575899953456877568
tweet id = 3506 - tweet: https://twitter.com/Daily_Express/status/1575211259833864192
tweet id = 59 - tweet: https://twitter.com/K13News/status/1575910964112740352
tweet id = 317 - tweet: https://twitter.com/ukrainewar24/status/1575880417286160384
tweet id = 321 - tweet: htt

In [76]:
#Third query
query3 = "Civilians"
docs = search(query3, index)
top = 20

print("\n======================\nSample of {} results out of {} for the query {}:\n".format(top, len(docs), query3))
for d_id in docs[:top]:
     print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))



Sample of 20 results out of 88 for the query Civilians:

tweet id = 1025 - tweet: https://twitter.com/MrFukkew/status/1575800190669586432
tweet id = 1027 - tweet: https://twitter.com/Ventured446/status/1575800139658440704
tweet id = 3593 - tweet: https://twitter.com/knittingknots/status/1575201082279862272
tweet id = 1040 - tweet: https://twitter.com/MoRaY1959/status/1575798982735204352
tweet id = 3134 - tweet: https://twitter.com/knittingknots/status/1575301220390129664
tweet id = 67 - tweet: https://twitter.com/QUICKMAGAZIN/status/1575909988618231808
tweet id = 1110 - tweet: https://twitter.com/oimmao/status/1575793139595427840
tweet id = 1120 - tweet: https://twitter.com/MxmLurie/status/1575791448431030272
tweet id = 1121 - tweet: https://twitter.com/MrFukkew/status/1575791204855607296
tweet id = 615 - tweet: https://twitter.com/ActForUA/status/1575830547472912384
tweet id = 3183 - tweet: https://twitter.com/W7VOA/status/1575283090162860032
tweet id = 637 - tweet: https://twitter.c

In [77]:
#Fourth query
query4 = "NATO"
docs = search(query4, index)
top = 20

print("\n======================\nSample of {} results out of {} for the query {}:\n".format(top, len(docs), query4))
for d_id in docs[:top]:
     print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))



Sample of 20 results out of 121 for the query NATO:

tweet id = 6 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575917759707299840
tweet id = 7 - tweet: https://twitter.com/A0zeir/status/1575917751360593920
tweet id = 2569 - tweet: https://twitter.com/CuriosityGuide/status/1575458088760516608
tweet id = 27 - tweet: https://twitter.com/Amannatt/status/1575914279295156224
tweet id = 2590 - tweet: https://twitter.com/CuriosityGuide/status/1575457685679513600
tweet id = 2595 - tweet: https://twitter.com/CuriosityGuide/status/1575457536387563520
tweet id = 2597 - tweet: https://twitter.com/CuriosityGuide/status/1575457478745153536
tweet id = 3629 - tweet: https://twitter.com/raffles2004/status/1575196069683597312
tweet id = 50 - tweet: https://twitter.com/casulers/status/1575911896447913984
tweet id = 54 - tweet: https://twitter.com/TrimiA98/status/1575911455806742528
tweet id = 69 - tweet: https://twitter.com/_BrandonSulliva/status/1575909930481205248
tweet id = 74 - tweet: https://t

In [13]:
#Fifth query
query5 = "Negotiations"
docs = search(query5, index)
top = 20

print("\n======================\nSample of {} results out of {} for the query {}:\n".format(top, len(docs), query5))
for d_id in docs[:top]:
     print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))



Sample of 20 results out of 37 for the query Negotiations:

tweet id = 515 - tweet: https://twitter.com/u_me_reality/status/1575845160134451200
tweet id = 643 - tweet: https://twitter.com/firamnews/status/1575828204702806016
tweet id = 1799 - tweet: https://twitter.com/The_Old_Hippie/status/1575633805288779776
tweet id = 659 - tweet: https://twitter.com/_Nex3_/status/1575827009048956928
tweet id = 3991 - tweet: https://twitter.com/FPVDroneNerd/status/1575155283613155328
tweet id = 24 - tweet: https://twitter.com/Shadi_Alkasim/status/1575914711388139520
tweet id = 415 - tweet: https://twitter.com/Chronology22/status/1575861001769013248
tweet id = 3872 - tweet: https://twitter.com/1002spvr/status/1575170727904309248
tweet id = 417 - tweet: https://twitter.com/Chronology22/status/1575860740556460032
tweet id = 2214 - tweet: https://twitter.com/potuspl/status/1575534709144227840
tweet id = 3878 - tweet: https://twitter.com/TimeToWakeUp247/status/1575169694822445056
tweet id = 41 - tweet: 

### Ranking

In [14]:
def create_index_tfidf(tweets, num_documents):
    """
    Implement the inverted index and compute tf, df and idf

    Argument:
    tweets -- collection of tweeets
    num_documents -- total number of documents

    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of document these keys appears in (and the positions) as values.
    tf - normalized term frequency for each term in each document
    df - number of documents each term appear in
    idf - inverse document frequency of each term
    """

    index = defaultdict(list)
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    title_index = defaultdict(str)
    idf = defaultdict(float)

    for idx, tweet in enumerate(tweets):  #Iterate through the tweets
        tweet_id = idx+1 #calculate tweet_id
        terms = tweet["full_text_tokenized"] #get full tokenized text of the tweet

        current_page_index = {}

        for position, term in enumerate(terms):
            try:
                # if the term is already in the dict append the position to the corresponding list
                current_page_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term]=[tweet_id, array('I',[position])] #'I' indicates unsigned int (int in Python)

        #normalize term frequencies
        # Compute the denominator to normalize term frequencies
        # norm is the same for all terms of a document.
        norm = 0
        for term, posting in current_page_index.items():
            # posting will contain the list of positions for current term in current document.
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        # calculate the tf and df weights
        for term, posting in current_page_index.items():
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting[1])/norm,4))
            #increment the document frequency of current term (number of documents containing the current term)
            df[term] = df[term] + 1 # increment DF for current term

        #merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

        # Compute IDF
        for term in df:
            idf[term] = np.round(np.log(float(num_documents/df[term])), 4)

    return index, tf, df, idf


In [15]:
index

defaultdict(list,
            {'@melsimmonsfcdo': [[1, array('I', [0])]],
             'wrong': [[1, array('I', [1])],
              [25, array('I', [10])],
              [256, array('I', [1, 3])],
              [327, array('I', [6])],
              [445, array('I', [4])],
              [1322, array('I', [4])],
              [1498, array('I', [6])],
              [1512, array('I', [0])],
              [1596, array('I', [6])],
              [2093, array('I', [8])],
              [2157, array('I', [1])],
              [2207, array('I', [8])],
              [2236, array('I', [6])],
              [2237, array('I', [6])],
              [3714, array('I', [4])],
              [3844, array('I', [5])],
              [3900, array('I', [6])],
              [3947, array('I', [8])],
              [3994, array('I', [3])]],
             'dictat': [[1, array('I', [2])],
              [199, array('I', [2])],
              [225, array('I', [1])],
              [240, array('I', [6])],
              [666,

In [16]:
#Create the tf-idf index
start_time = time.time()
num_documents = len(tweets)
index, tf, df, idf = create_index_tfidf(tweets, num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 241.22 seconds


In [17]:
def rank_documents(terms, docs, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights

    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies

    Returns:
    doc_scores -- list of scores
    """

    doc_vectors = defaultdict(lambda: [0] * len(terms)) # Call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query.

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex]=query_terms_count[term]/idf[term] * query_norm

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):

            #tf[term][0] will contain the tf of the term "term" in the doc 26
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]
    # Calculate the score of each doc
    # compute the cosine similarity between queyVector and each docVector:

    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)

    return doc_scores

In [18]:
def search_tf_idf(query, index):
    """
    output is the list of documents that contain any of the query terms.
    So, we will get the list of documents for each query term, and take the union of them.

    Argument:
    query -- name of the query
    index -- created index

    Returns:
    doc_scores -- list of scores
    """
    query = preprocess(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs=[posting[0] for posting in index[term]]

            # docs = docs Union term_docs
            docs = docs.union(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    doc_scores = rank_documents(query, docs, index, idf, tf)
    return doc_scores

In [78]:
#Search tf-idf of query 1
query1 = "Zelenski"
ranked_docs1 = search_tf_idf(query1, index)
top = 20

print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(ranked_docs1), query1))
for _, d_id in ranked_docs1[:top]:
        print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))



Top 20 results out of 36 for the query 'Zelenski':

tweet id = 451 - tweet: https://twitter.com/Militarylandnet/status/1575855193547165696
tweet id = 734 - tweet: https://twitter.com/knittingknots/status/1575823179859333120
tweet id = 474 - tweet: https://twitter.com/_Nex3_/status/1575851734974578688
tweet id = 430 - tweet: https://twitter.com/knittingknots/status/1575858809842503680
tweet id = 408 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575861873060499456
tweet id = 3271 - tweet: https://twitter.com/knittingknots/status/1575255448818696192
tweet id = 447 - tweet: https://twitter.com/Magnifyingglas_/status/1575855639011549184
tweet id = 3270 - tweet: https://twitter.com/knittingknots/status/1575255530154737664
tweet id = 2784 - tweet: https://twitter.com/AlbaneseAl/status/1575405656772935680
tweet id = 2782 - tweet: https://twitter.com/AGCsegreteria/status/1575405725936934912
tweet id = 2781 - tweet: https://twitter.com/pka71/status/1575405776260268032
tweet id = 411 - twee

In [79]:
#Search tf-idf of query 2
query2 = "Crimea"
ranked_docs2 = search_tf_idf(query2, index)
top = 20

print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(ranked_docs2), query2))
for _, d_id in ranked_docs2[:top]:
        print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))


Top 20 results out of 26 for the query 'Crimea':

tweet id = 3361 - tweet: https://twitter.com/Odessa_Journal/status/1575241438937239552
tweet id = 106 - tweet: https://twitter.com/anthony51483709/status/1575907497017487360
tweet id = 1109 - tweet: https://twitter.com/KohnNatan/status/1575793173062127616
tweet id = 2453 - tweet: https://twitter.com/infussambas/status/1575476108077985792
tweet id = 3281 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575253795352809472
tweet id = 354 - tweet: https://twitter.com/UKikaski/status/1575870743631450112
tweet id = 3506 - tweet: https://twitter.com/Daily_Express/status/1575211259833864192
tweet id = 175 - tweet: https://twitter.com/knittingknots/status/1575899953456877568
tweet id = 59 - tweet: https://twitter.com/K13News/status/1575910964112740352
tweet id = 2336 - tweet: https://twitter.com/knittingknots/status/1575505128337309696
tweet id = 3649 - tweet: https://twitter.com/knittingknots/status/1575193723364950016
tweet id = 350 - tweet

In [80]:
#Search tf-idf of query 3
query3 = "Civilians"
ranked_docs3 = search_tf_idf(query3, index)
top = 20

print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(ranked_docs3), query3))
for _, d_id in ranked_docs3[:top]:
        print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))


Top 20 results out of 88 for the query 'Civilians':

tweet id = 778 - tweet: https://twitter.com/iduxking/status/1575821079590772736
tweet id = 3853 - tweet: https://twitter.com/K13News/status/1575174008219504640
tweet id = 908 - tweet: https://twitter.com/WeAreFromUA/status/1575811723293884416
tweet id = 1120 - tweet: https://twitter.com/MxmLurie/status/1575791448431030272
tweet id = 1110 - tweet: https://twitter.com/oimmao/status/1575793139595427840
tweet id = 878 - tweet: https://twitter.com/LynBank25442089/status/1575814023517720576
tweet id = 774 - tweet: https://twitter.com/iduxking/status/1575821309937860608
tweet id = 725 - tweet: https://twitter.com/obtener/status/1575823647909941248
tweet id = 3134 - tweet: https://twitter.com/knittingknots/status/1575301220390129664
tweet id = 1917 - tweet: https://twitter.com/KolbasenkOleg/status/1575612804647854080
tweet id = 3593 - tweet: https://twitter.com/knittingknots/status/1575201082279862272
tweet id = 1908 - tweet: https://twitte

In [81]:
#Search tf-idf of query 4
query4 = "NATO"
ranked_docs4 = search_tf_idf(query4, index)
top = 20

print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(ranked_docs4), query4))
for _, d_id in ranked_docs4[:top]:
        print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))


Top 20 results out of 121 for the query 'NATO':

tweet id = 183 - tweet: https://twitter.com/NewsZapper/status/1575899185202724864
tweet id = 176 - tweet: https://twitter.com/chronosome/status/1575899801933185024
tweet id = 2545 - tweet: https://twitter.com/LynBank25442089/status/1575462118475939840
tweet id = 2540 - tweet: https://twitter.com/LynBank25442089/status/1575462473058205696
tweet id = 1827 - tweet: https://twitter.com/Javelinzz/status/1575629399151370240
tweet id = 83 - tweet: https://twitter.com/elevotv/status/1575909051380432896
tweet id = 3489 - tweet: https://twitter.com/GabeZZOZZ/status/1575213861359480832
tweet id = 1506 - tweet: https://twitter.com/Jimwolfie1/status/1575692255612248064
tweet id = 3483 - tweet: https://twitter.com/nextgenwarss/status/1575215168422023168
tweet id = 54 - tweet: https://twitter.com/TrimiA98/status/1575911455806742528
tweet id = 800 - tweet: https://twitter.com/Ak1nc1Turk1071/status/1575819273078906880
tweet id = 784 - tweet: https://twi

In [82]:
#Search tf-idf of query 5
query5 = "Negotiations"
ranked_docs5 = search_tf_idf(query5, index)
top = 20

print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(ranked_docs5), query5))
for _, d_id in ranked_docs5[:top]:
        print("tweet id = {} - tweet: {}".format(d_id, tweets[d_id-1]["url"]))


Top 20 results out of 37 for the query 'Negotiations':

tweet id = 1218 - tweet: https://twitter.com/PatilSushmit/status/1575776384324374528
tweet id = 699 - tweet: https://twitter.com/Amannatt/status/1575825005291524096
tweet id = 515 - tweet: https://twitter.com/u_me_reality/status/1575845160134451200
tweet id = 24 - tweet: https://twitter.com/Shadi_Alkasim/status/1575914711388139520
tweet id = 3305 - tweet: https://twitter.com/MaoistRebelNews/status/1575247684621012992
tweet id = 687 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575825370061844480
tweet id = 616 - tweet: https://twitter.com/infussambas/status/1575830518503198720
tweet id = 583 - tweet: https://twitter.com/SGNewsAlerts/status/1575835852152147968
tweet id = 349 - tweet: https://twitter.com/UKikaski/status/1575871713317105664
tweet id = 3123 - tweet: https://twitter.com/GranataLLC/status/1575304304004562944
tweet id = 659 - tweet: https://twitter.com/_Nex3_/status/1575827009048956928
tweet id = 417 - tweet: https

## Evaluation

### With the given queries

In [24]:
#Load the evaluation_gt.csv
evaluation_df = pd.read_csv("/content/drive/Shareddrives/IRWA/IRWA_data_2023/Evaluation_gt.csv")

#Get the ids of the doc in evaluation
sample_ids = evaluation_df["doc"]
evaluation_df.head()
print(len(np.array(sample_ids)))

60


In [25]:
#Take the tweets sample that appears in the evaluation file
tweet_sample = [tweet for tweet in tweets if tweet['id'] in np.array(sample_ids)]
len(tweet_sample)

59

There is one less doc in tweet_sample since DOC ID 3478 appears for two different queries.

In [26]:
#Define the 3 queries
queries = {
    "Q1": "tanks in Kharkiv",
    "Q2": "Nord Stream pipeline",
    "Q3": "annexation of crimea"
}

In [27]:
def take_evaluation_documents(evaluation_df, query, tweets):
    """
    Parameters
    ----------
    evaluation_df: The evaluation document with the doc_ids, the query_ids and the labels
    query: The query_id we want to evaluate.
    tweets : The tweets sample.

    Returns
    -------
    the doc_ids with their label : dict
    the tweets to be scored : array

    """
    query_evaluation_docs = {}

    for doc_id, query_id, label in zip(evaluation_df["doc"], evaluation_df["query_id"], evaluation_df["label"]):

        #Add the documents that are labeled for that query
        if query_id == query:
            query_evaluation_docs[doc_id] = label

        else:
            #Also add the documents that are relevant to other queries
            if label == 1:
                query_evaluation_docs[doc_id] = 0

    query_evaluation_tweets = [tweet for tweet in tweets if tweet["id"] in query_evaluation_docs]

    return query_evaluation_docs, query_evaluation_tweets

In [28]:
def predicted_scores(query_evaluation_docs, query, index):
    """
    Parameters
    ----------
    query_evaluation_docs: The ids of the documents to be evaluated
    query: The query_id we want to evaluate.
    index : The tf-idf index.

    Returns
    -------
    predictes scores : array

    """
    search_scores = search_tf_idf(queries[query], index)
    #scores = {f"doc_{doc_scores[1]}": doc_scores[0] for doc_scores in search_scores if f"doc_{doc_scores[1]}" in query_evaluation_docs}

    scores = {doc_id: 0.0 for doc_id in query_evaluation_docs}

    for doc_id in query_evaluation_docs:
        for doc_score in search_scores:
            if(f"doc_{doc_score[1]}" == doc_id):
                scores[doc_id] = doc_score[0]
    return scores

In [29]:
# Information need 1: What is the discussion regarding a tank in Kharkiv?
ground_truths_Q1, my_tweets_Q1 = take_evaluation_documents(evaluation_df, "Q1", tweet_sample)
predicted_values_Q1 = predicted_scores(ground_truths_Q1, "Q1", index)
predicted_values_Q1

{'doc_2052': 0.0,
 'doc_164': 0.0,
 'doc_411': 0.0,
 'doc_1805': 0.0,
 'doc_3442': 0.0,
 'doc_2657': 0.0,
 'doc_1534': 0.0,
 'doc_383': 0.0,
 'doc_1618': 0.0,
 'doc_63': 0.0,
 'doc_1452': 0.0,
 'doc_2908': 0.0,
 'doc_618': 0.0,
 'doc_2677': 0.0,
 'doc_489': 0.0,
 'doc_110': 0.0,
 'doc_3439': 0.0,
 'doc_3137': 0.0,
 'doc_3913': 0.0,
 'doc_2696': 0.0,
 'doc_2234': 0.8943486568447452,
 'doc_2656': 0.35355339059327373,
 'doc_3709': 0.40828345545711253,
 'doc_592': 0.6858935777509511,
 'doc_3556': 0.27732727958136394,
 'doc_2586': 0.2722361107568208,
 'doc_2701': 0.7071067811865475,
 'doc_3478': 0.4418003168853549,
 'doc_2956': 0.34294678887547553,
 'doc_3552': 0.26728636328851496,
 'doc_1336': 0.5000659156551265,
 'doc_2295': 0.39230284220229655,
 'doc_2176': 0.4713573803389525,
 'doc_2488': 0.0,
 'doc_1669': 0.0,
 'doc_1953': 0.0,
 'doc_1641': 0.0,
 'doc_637': 0.0,
 'doc_885': 0.0,
 'doc_174': 0.0}

In [30]:
# Information need 2: What discussions are there about the Nord Stream pipeline?
ground_truths_Q2, my_tweets_Q2 = take_evaluation_documents(evaluation_df, "Q2", tweet_sample)
predicted_values_Q2 = predicted_scores(ground_truths_Q2, "Q2", index)
predicted_values_Q2

{'doc_2052': 0.0,
 'doc_164': 0.0,
 'doc_411': 0.0,
 'doc_1805': 0.0,
 'doc_3442': 0.0,
 'doc_2657': 0.0,
 'doc_1534': 0.0,
 'doc_383': 0.0,
 'doc_1618': 0.0,
 'doc_63': 0.0,
 'doc_1452': 1.4852335674903123,
 'doc_2908': 1.698275816821284,
 'doc_618': 1.5666399554460493,
 'doc_2677': 1.299038105676658,
 'doc_489': 1.224733126031953,
 'doc_110': 0.8911401404941874,
 'doc_3439': 0.9259543617263216,
 'doc_3137': 1.0605347094744235,
 'doc_3913': 1.1078196965210538,
 'doc_2696': 0.8911401404941874,
 'doc_1933': 0.0,
 'doc_3326': 0.0,
 'doc_1095': 0.0,
 'doc_2164': 0.0,
 'doc_3146': 0.0,
 'doc_3297': 0.0,
 'doc_243': 0.0,
 'doc_477': 0.0,
 'doc_2457': 0.0,
 'doc_3967': 0.0,
 'doc_2234': 0.0,
 'doc_2656': 0.0,
 'doc_3709': 0.0,
 'doc_592': 0.0,
 'doc_3556': 0.0,
 'doc_2586': 0.0,
 'doc_2701': 0.0,
 'doc_3478': 0.0,
 'doc_2956': 0.0,
 'doc_3552': 0.0}

In [31]:
# Information need 3: What is being said about the annexation of territories by Russia?
ground_truths_Q3, my_tweets_Q3 = take_evaluation_documents(evaluation_df, "Q3", tweet_sample)
predicted_values_Q3 = predicted_scores(ground_truths_Q3, "Q3", index)
predicted_values_Q3

{'doc_2052': 0.37801928522232825,
 'doc_164': 0.40828345545711253,
 'doc_411': 0.39230284220229655,
 'doc_1805': 0.324420591208388,
 'doc_3442': 0.37801928522232825,
 'doc_2657': 0.5345727265770299,
 'doc_1534': 0.3535533905932738,
 'doc_383': 0.39230284220229655,
 'doc_1618': 0.3535533905932738,
 'doc_63': 0.3015103314979439,
 'doc_3803': 0.0,
 'doc_1069': 0.0,
 'doc_2852': 0.0,
 'doc_3445': 0.0,
 'doc_421': 0.0,
 'doc_3478': 0.0,
 'doc_3705': 0.0,
 'doc_3422': 0.0,
 'doc_650': 0.0,
 'doc_2442': 0.0,
 'doc_1452': 0.0,
 'doc_2908': 0.0,
 'doc_618': 0.0,
 'doc_2677': 0.0,
 'doc_489': 0.0,
 'doc_110': 0.0,
 'doc_3439': 0.0,
 'doc_3137': 0.0,
 'doc_3913': 0.0,
 'doc_2696': 0.0,
 'doc_2234': 0.0,
 'doc_2656': 0.0,
 'doc_3709': 0.0,
 'doc_592': 0.0,
 'doc_3556': 0.0,
 'doc_2586': 0.0,
 'doc_2701': 0.0,
 'doc_2956': 0.0,
 'doc_3552': 0.0}

Now we can apply the different metrics

######Precision@K (P@K)

In [32]:
def precision_at_k(doc_score, y_score, k):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    precision @k : float

    """
    order = np.argsort(y_score)[::-1] #order in descending order
    doc_score = np.take(doc_score, order[:k]) #y_true
    relevant = np.sum(doc_score==1) #get num of relevant documents extracted
    return float(relevant) / k

In [33]:
# Compute Precision@K (P@K) for Q1
print('The precision for query: Tanks in Kharkiv is:\n')
precision_Q1 = precision_at_k(list(ground_truths_Q1.values()), list(predicted_values_Q1.values()), k=10)
print(precision_Q1)

The precision for query: Tanks in Kharkiv is:

0.7


In [34]:
# Compute Precision@K (P@K) for Q2
print('The precision for query: Nord Stream pipeline:\n')
precision_Q2 = precision_at_k(list(ground_truths_Q2.values()), list(predicted_values_Q2.values()), k=10)
print(precision_Q2)

The precision for query: Nord Stream pipeline:

1.0


In [35]:
# Compute Precision@K (P@K) for Q3
print('The precision for query: Annexation of crimea:\n')
precision_Q3 = precision_at_k(list(ground_truths_Q3.values()), list(predicted_values_Q3.values()), k=10)
print(precision_Q3)

The precision for query: Annexation of crimea:

1.0


######Recall@K (R@K)

In [36]:
def recall_at_k(doc_score, y_score, k):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    recall @k : float

    """
    order = np.argsort(y_score)[::-1]
    total_relevant = np.sum(doc_score)
    doc_score = np.take(doc_score, order[:k])  # y_true
    relevant = np.sum(doc_score == 1)
    return float(relevant) / total_relevant

In [37]:
# Compute Recall@K (R@K) for Q1
print('The recall for query: Tanks in Kharkiv is:\n')
recall_Q1 = recall_at_k(list(ground_truths_Q1.values()), list(predicted_values_Q1.values()), k=10)
recall_Q1

The recall for query: Tanks in Kharkiv is:



0.7

In [38]:
# Compute Recall@K (R@K) for Q2
print('The recall for query: Nord Stream pipeline is:\n')
recall_Q2 = recall_at_k(list(ground_truths_Q2.values()), list(predicted_values_Q2.values()), k=10)
recall_Q2

The recall for query: Nord Stream pipeline is:



1.0

In [39]:
# Compute Recall@K (R@K) for Q3
print('The recall for query: Annexation of crimea is:\n')
recall_Q3 = recall_at_k(list(ground_truths_Q3.values()), list(predicted_values_Q3.values()), k=10)
recall_Q3

The recall for query: Annexation of crimea is:



1.0

######Average Precision@K (P@K)

In [40]:
def avg_precision_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    average precision @k : float
    """
    gtp = np.sum([score for score in doc_score if score == 1])
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])
    ## if all documents are not relevant
    if gtp == 0:
        return 0
    n_relevant_at_i = 0
    prec_at_i = 0
    for i in range(len(doc_score)):
        if doc_score[i] == 1:
            n_relevant_at_i += 1
            prec_at_i += n_relevant_at_i / (i + 1)
    return prec_at_i / gtp

In [41]:
# Compute Average Precision@K (P@K) for Q1
print('The average precision at k of query: Tanks in Kharkiv is:\n')
avg_precision_at_k(list(ground_truths_Q1.values()), list(predicted_values_Q1.values()), k=10)

The average precision at k of query: Tanks in Kharkiv is:



0.5747619047619048

In [42]:
# Compute Average Precision@K (P@K) for Q2
print('The average precision at k of query: Nord Stream pipeline is:\n')
avg_precision_at_k(list(ground_truths_Q2.values()), list(predicted_values_Q2.values()), k=10)

The average precision at k of query: Nord Stream pipeline is:



1.0

In [43]:
# Compute Average Precision@K (P@K) for Q3
print('The average precision at k of query: Annexation of crimea is:\n')
avg_precision_at_k(list(ground_truths_Q3.values()), list(predicted_values_Q3.values()), k=10)

The average precision at k of query: Annexation of crimea is:



1.0

#####F1-Score@K

In [44]:
def f1_score(precision, recall):
  '''
    Parameters
    ----------
    precision: precision of query
    recall: recall of query

    Returns
    -------
    f1_score : float
  '''
  mult = 2*precision*recall
  sum = precision + recall

  score = mult/sum

  return score

In [45]:
# Compute F1-Score@K for Q1
print('The f1 score of query: Tanks in Kharkiv is:\n')
f1_score(precision_Q1, recall_Q1)

The f1 score of query: Tanks in Kharkiv is:



0.7

In [46]:
# Compute F1-Score@K for Q2
print('The f1 score of query: Nord Stream pipeline is:\n')
f1_score(precision_Q2, recall_Q2)

The f1 score of query: Nord Stream pipeline is:



1.0

In [47]:
# Compute F1-Score@K for Q3
print('The f1 score of query: Annexation of crimea is:\n')
f1_score(precision_Q3, recall_Q3)

The f1 score of query: Annexation of crimea is:



1.0

#####Mean Average Precision (MAP)

In [48]:
evaluation_df.head()

Unnamed: 0,doc,query_id,label
0,doc_2052,Q3,1
1,doc_164,Q3,1
2,doc_411,Q3,1
3,doc_1805,Q3,1
4,doc_3442,Q3,1


In [49]:
def map_at_k(evaluation_df, queries, index, tweet_sample, k=10):
    """
    Parameters
    ----------
    evaluation_df: search results dataset containing
    queries: list of the queries
    index: the tf-idf index
    tweet_sample: the tweets to use
    k : number of doc to consider

    Returns
    -------
    mean average precision @ k : float
    """
    avp = []
    for q in queries:  # loop over all query id
        current_ground_truths, _ = take_evaluation_documents(evaluation_df, q, tweet_sample)  # select data for current query
        current_scores = predicted_scores(current_ground_truths, q, index)
        avp.append(avg_precision_at_k(list(current_ground_truths.values()), list(current_scores.values()), k))  #append average precision for current query
    return (np.average(avp), avp)  # return mean average precision

In [50]:
# Compute Mean Average Precision (MAP)
map_at_k(evaluation_df, queries, index, tweet_sample, k=10)

(0.8582539682539684, [0.5747619047619048, 1.0, 1.0])

#####Mean Reciprocal Rank (MRR)

In [51]:
def rr_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    Reciprocal Rank for current query
    """

    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[:k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    if np.sum(doc_score) == 0:  # if there are not relevant document return 0
        return 0
    return 1 / (np.argmax(doc_score)+1)  # hint: to get the position of the first relevant document use "np.argmax"


In [52]:
def mrr_at_k(evaluation_df, queries, index, tweet_sample, k=10):
    """
    Parameters
    ----------
    evaluation_df: search results dataset containing
    queries: list of the queries
    index: the tf-idf index
    tweet_sample: the tweets to use
    k : number of doc to consider

    Returns
    -------
    mean Reciprocal Rank @ k : float
    """
    avrr = []
    for q in queries:  # loop over all query id
        current_ground_truths, _ = take_evaluation_documents(evaluation_df, q, tweet_sample)  # select data for current query
        current_scores = predicted_scores(current_ground_truths, q, index)
        avrr.append(rr_at_k(list(current_ground_truths.values()), list(current_scores.values()), k))  #append average precision for current query
    return (np.average(avrr), avrr)  # return mean reciprocal rank

In [53]:
mrr_at_k(evaluation_df, queries, index, tweet_sample, k=10)

(1.0, [1.0, 1.0, 1.0])

#####Normalized Discounted Cumulative Gain (NDCG)

In [54]:
def dcg_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[:k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    gain = 2**doc_score - 1  # Compute gain (use formula 7 above)
    discounts = np.log2(np.arange(len(doc_score)) + 2) # Compute denominator
    return np.sum(gain / discounts)  #return dcg@k


def ndcg_at_k(doc_score, y_score, k=10):
    dcg_max = dcg_at_k(doc_score, y_score, k) # Ideal dcg
    if not dcg_max:
        return 0
    return np.round(dcg_at_k(doc_score, y_score, k) / dcg_max, 4) # return ndcg@k

In [55]:
def mndcg_at_k(evaluation_df, queries, index, tweet_sample, k=10):
    """
    Parameters
    ----------
    evaluation_df: search results dataset containing
    queries: list of the queries
    index: the tf-idf index
    tweet_sample: the tweets to use
    k : number of doc to consider

    Returns
    -------
    mean Normalized Discounted Cumulative Gain @ k : float
    """
    avndcg = []
    for q in queries:  # loop over all query id
        current_ground_truths, _ = take_evaluation_documents(evaluation_df, q, tweet_sample)  # select data for current query
        current_scores = predicted_scores(current_ground_truths, q, index)
        avndcg.append(ndcg_at_k(list(current_ground_truths.values()), list(current_scores.values()), k))  #append average precision for current query
    return (np.average(avndcg), avndcg)  # return mean Normalized Discounted Cumulative Gain

In [56]:
mndcg_at_k(evaluation_df, queries, index, tweet_sample, k=10)

(1.0, [1.0, 1.0, 1.0])

### With our Queries

In [57]:
#Load the evaluation_gt.csv
evaluation_df = pd.read_csv("/content/drive/Shareddrives/IRWA/IRWA_data_2023/Evaluation_gt2.csv")

#Get the ids of the doc in evaluation
sample_ids = evaluation_df["doc"]
print(len(np.array(sample_ids)))
evaluation_df.head()

100


Unnamed: 0,doc,query_id,label
0,doc_3506,Q1,1
1,doc_430,Q1,1
2,doc_3879,Q1,1
3,doc_417,Q1,1
4,doc_287,Q1,1


In [58]:
#Define the queries created previously
queries = {
    "Q1": query1,
    "Q2": query2,
    "Q3": query3,
    "Q4": query4,
    "Q5": query5
}

In [59]:
#Take the tweets sample that appears in the evaluation file
tweet_sample = [tweet for tweet in tweets if tweet['id'] in np.array(sample_ids)]
len(tweet_sample)

90

In [60]:
#First query
ground_truths_Q1, my_tweets_Q1 = take_evaluation_documents(evaluation_df, "Q1", tweet_sample)
predicted_values_Q1 = predicted_scores(ground_truths_Q1, "Q1", index)

for gt, pred in zip(ground_truths_Q1.items(), predicted_values_Q1.values()):
    print(f"Ground truth: {gt}  Predicted value: {pred}")

Ground truth: ('doc_3506', 1)  Predicted value: 0.25
Ground truth: ('doc_430', 1)  Predicted value: 0.3015
Ground truth: ('doc_3879', 1)  Predicted value: 0.1925
Ground truth: ('doc_417', 0)  Predicted value: 0.2582
Ground truth: ('doc_287', 1)  Predicted value: 0.2774
Ground truth: ('doc_411', 1)  Predicted value: 0.2774
Ground truth: ('doc_783', 1)  Predicted value: 0.2085
Ground truth: ('doc_451', 1)  Predicted value: 0.3162
Ground truth: ('doc_3270', 1)  Predicted value: 0.2774
Ground truth: ('doc_2781', 1)  Predicted value: 0.2774
Ground truth: ('doc_61', 0)  Predicted value: 0.0
Ground truth: ('doc_115', 0)  Predicted value: 0.0
Ground truth: ('doc_251', 0)  Predicted value: 0.0
Ground truth: ('doc_499', 0)  Predicted value: 0.0
Ground truth: ('doc_928', 0)  Predicted value: 0.0
Ground truth: ('doc_1972', 0)  Predicted value: 0.0
Ground truth: ('doc_1580', 0)  Predicted value: 0.0
Ground truth: ('doc_2901', 0)  Predicted value: 0.0
Ground truth: ('doc_2574', 0)  Predicted value: 

In [61]:
#Second query
ground_truths_Q2, my_tweets_Q2 = take_evaluation_documents(evaluation_df, "Q2", tweet_sample)
predicted_values_Q2 = predicted_scores(ground_truths_Q2, "Q2", index)

In [62]:
#Third query
ground_truths_Q3, my_tweets_Q3 = take_evaluation_documents(evaluation_df, "Q3", tweet_sample)
predicted_values_Q3 = predicted_scores(ground_truths_Q3, "Q3", index)

In [63]:
#Fourth query
ground_truths_Q4, my_tweets_Q4 = take_evaluation_documents(evaluation_df, "Q4", tweet_sample)
predicted_values_Q4 = predicted_scores(ground_truths_Q4, "Q4", index)

In [64]:
#Fifth query
ground_truths_Q5, my_tweets_Q5 = take_evaluation_documents(evaluation_df, "Q5", tweet_sample)
predicted_values_Q5 = predicted_scores(ground_truths_Q5, "Q5", index)

In [65]:
ground_truths = {
    "Q1": list(ground_truths_Q1.values()),
    "Q2": list(ground_truths_Q2.values()),
    "Q3": list(ground_truths_Q3.values()),
    "Q4": list(ground_truths_Q4.values()),
    "Q5": list(ground_truths_Q5.values())
}

predicted_values = {
    "Q1": list(predicted_values_Q1.values()),
    "Q2": list(predicted_values_Q2.values()),
    "Q3": list(predicted_values_Q3.values()),
    "Q4": list(predicted_values_Q4.values()),
    "Q5": list(predicted_values_Q5.values())
}

#### Metrics

In [66]:
# Compute Precision@K (P@K) for all queries
for k in [3, 5, 10, 20]:
    for q in queries:
        print(f'The precision@{k} for {q} is:')
        precision_current = precision_at_k(ground_truths[q], predicted_values[q], k)
        print(precision_current)
        print()
    print("--"*20)
    print()

The precision@3 for Q1 is:
1.0

The precision@3 for Q2 is:
0.6666666666666666

The precision@3 for Q3 is:
1.0

The precision@3 for Q4 is:
0.3333333333333333

The precision@3 for Q5 is:
1.0

----------------------------------------

The precision@5 for Q1 is:
1.0

The precision@5 for Q2 is:
0.8

The precision@5 for Q3 is:
1.0

The precision@5 for Q4 is:
0.4

The precision@5 for Q5 is:
0.8

----------------------------------------

The precision@10 for Q1 is:
0.9

The precision@10 for Q2 is:
0.8

The precision@10 for Q3 is:
1.0

The precision@10 for Q4 is:
0.7

The precision@10 for Q5 is:
0.4

----------------------------------------

The precision@20 for Q1 is:
0.45

The precision@20 for Q2 is:
0.45

The precision@20 for Q3 is:
0.5

The precision@20 for Q4 is:
0.5

The precision@20 for Q5 is:
0.2

----------------------------------------



In [67]:
# Compute Recall@K (R@K) for all queries
for k in [3, 5, 10, 20]:
    for q in queries:
        print(f'The recall@{k} for {q} is:')
        recall_current = recall_at_k(ground_truths[q], predicted_values[q], k)
        print(recall_current)
        print()
    print("--"*20)
    print()

The recall@3 for Q1 is:
0.3333333333333333

The recall@3 for Q2 is:
0.2

The recall@3 for Q3 is:
0.3

The recall@3 for Q4 is:
0.1

The recall@3 for Q5 is:
0.3

----------------------------------------

The recall@5 for Q1 is:
0.5555555555555556

The recall@5 for Q2 is:
0.4

The recall@5 for Q3 is:
0.5

The recall@5 for Q4 is:
0.2

The recall@5 for Q5 is:
0.4

----------------------------------------

The recall@10 for Q1 is:
1.0

The recall@10 for Q2 is:
0.8

The recall@10 for Q3 is:
1.0

The recall@10 for Q4 is:
0.7

The recall@10 for Q5 is:
0.4

----------------------------------------

The recall@20 for Q1 is:
1.0

The recall@20 for Q2 is:
0.9

The recall@20 for Q3 is:
1.0

The recall@20 for Q4 is:
1.0

The recall@20 for Q5 is:
0.4

----------------------------------------



In [84]:
for k in [3, 5, 10, 20]:
    map = map_at_k(evaluation_df, queries, index, tweet_sample, k)
    for i, q in enumerate(queries):
        print(f'The Average Precision@{k} for {q} is:')
        print(map[1][i])
        print()
    print(f"The Mean Average Precision@{k} is: {map[0]}")
    print()
    print("--"*20)
    print()



The Average Precision@3 for Q1 is:
0.3333333333333333

The Average Precision@3 for Q2 is:
0.16666666666666666

The Average Precision@3 for Q3 is:
0.3

The Average Precision@3 for Q4 is:
0.1

The Average Precision@3 for Q5 is:
0.3

The Mean Average Precision@3 is: 0.24

----------------------------------------

The Average Precision@5 for Q1 is:
0.5555555555555556

The Average Precision@5 for Q2 is:
0.32166666666666666

The Average Precision@5 for Q3 is:
0.5

The Average Precision@5 for Q4 is:
0.13999999999999999

The Average Precision@5 for Q5 is:
0.4

The Mean Average Precision@5 is: 0.3834444444444444

----------------------------------------

The Average Precision@10 for Q1 is:
0.9626543209876544

The Average Precision@10 for Q2 is:
0.6377777777777778

The Average Precision@10 for Q3 is:
1.0

The Average Precision@10 for Q4 is:
0.44630952380952377

The Average Precision@10 for Q5 is:
0.4

The Mean Average Precision@10 is: 0.6893483245149911

----------------------------------------


In [69]:
for k in [3, 5, 10, 20]:
    mrr = mrr_at_k(evaluation_df, queries, index, tweet_sample, k)
    for i, q in enumerate(queries):
        print(f'The Reciprocal Rank@{k} for {q} is:')
        print(mrr[1][i])
        print()
    print(f"The Mean Reciprocal Rank@{k} is: {mrr[0]}")
    print()
    print("--"*20)
    print()


The Reciprocal Rank@3 for Q1 is:
1.0

The Reciprocal Rank@3 for Q2 is:
1.0

The Reciprocal Rank@3 for Q3 is:
1.0

The Reciprocal Rank@3 for Q4 is:
1.0

The Reciprocal Rank@3 for Q5 is:
1.0

The Mean Reciprocal Rank@3 is: 1.0

----------------------------------------

The Reciprocal Rank@5 for Q1 is:
1.0

The Reciprocal Rank@5 for Q2 is:
1.0

The Reciprocal Rank@5 for Q3 is:
1.0

The Reciprocal Rank@5 for Q4 is:
1.0

The Reciprocal Rank@5 for Q5 is:
1.0

The Mean Reciprocal Rank@5 is: 1.0

----------------------------------------

The Reciprocal Rank@10 for Q1 is:
1.0

The Reciprocal Rank@10 for Q2 is:
1.0

The Reciprocal Rank@10 for Q3 is:
1.0

The Reciprocal Rank@10 for Q4 is:
1.0

The Reciprocal Rank@10 for Q5 is:
1.0

The Mean Reciprocal Rank@10 is: 1.0

----------------------------------------

The Reciprocal Rank@20 for Q1 is:
1.0

The Reciprocal Rank@20 for Q2 is:
1.0

The Reciprocal Rank@20 for Q3 is:
1.0

The Reciprocal Rank@20 for Q4 is:
1.0

The Reciprocal Rank@20 for Q5 is:


In [70]:
for k in [3, 5, 10, 20]:
    mndcg = mndcg_at_k(evaluation_df, queries, index, tweet_sample, k)
    for i, q in enumerate(queries):
        print(f'The Normalized Discounted Cumulative Gain@{k} for {q} is:')
        print(mndcg[1][i])
        print()

    print(f"The Mean Normalized Discounted Cumulative Gain@{k} is: {mndcg[0]}")
    print()
    print("--"*20)
    print()


The Normalized Discounted Cumulative Gain@3 for Q1 is:
1.0

The Normalized Discounted Cumulative Gain@3 for Q2 is:
1.0

The Normalized Discounted Cumulative Gain@3 for Q3 is:
1.0

The Normalized Discounted Cumulative Gain@3 for Q4 is:
1.0

The Normalized Discounted Cumulative Gain@3 for Q5 is:
1.0

The Mean Normalized Discounted Cumulative Gain@3 is: 1.0

----------------------------------------

The Normalized Discounted Cumulative Gain@5 for Q1 is:
1.0

The Normalized Discounted Cumulative Gain@5 for Q2 is:
1.0

The Normalized Discounted Cumulative Gain@5 for Q3 is:
1.0

The Normalized Discounted Cumulative Gain@5 for Q4 is:
1.0

The Normalized Discounted Cumulative Gain@5 for Q5 is:
1.0

The Mean Normalized Discounted Cumulative Gain@5 is: 1.0

----------------------------------------

The Normalized Discounted Cumulative Gain@10 for Q1 is:
1.0

The Normalized Discounted Cumulative Gain@10 for Q2 is:
1.0

The Normalized Discounted Cumulative Gain@10 for Q3 is:
1.0

The Normalized Di

### Vector representation

In [71]:
!pip install gensim
!pip install scikit-learn
!pip install matplotlib
!pip install plotly



In [72]:
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

texts = [" ".join(tweet['full_text_tokenized']) for tweet in tweets]
#Create instance of TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)
#Transfrom texts into a TF_IDF matrix
tfidf_matrix = vectorizer.fit_transform(texts)

#Create instance of TSNE
tsne = TSNE(n_components=2, random_state=0)
#Reduce data to 2D
X_tsne = tsne.fit_transform(tfidf_matrix.toarray())

import pandas as pd
#Create dataframe witht the reduced dimensions
df = pd.DataFrame(X_tsne, columns=["x", "y"])
#Add column of tokenized tweets
df['text'] = texts

fig = px.scatter(df, x='x', y='y', hover_data=['text'])
fig.show()

In [73]:
from gensim.models import Word2Vec
import numpy as np
import plotly.express as px
import pandas as pd

#Create Word2Vec model
model = Word2Vec(sentences=[tweet['full_text_tokenized'] for tweet in tweets], vector_size=100, window=5, min_count=1, workers=4)

tweet_vectors = []
filtered_tweets = []

for tweet in tweets:
    words = [word for word in tweet['full_text_tokenized'] if word in model.wv.key_to_index]
    if words:
        tweet_vector = np.mean([model.wv[word] for word in words], axis=0)
        tweet_vectors.append(tweet_vector)
        filtered_tweets.append(tweet)

tweet_vectors_np = np.array(tweet_vectors)

tsne_model = TSNE(n_components=2, random_state=0)
reduced_data = tsne_model.fit_transform(tweet_vectors_np)

df = pd.DataFrame(reduced_data, columns=["x", "y"])
df['text'] = [" ".join(tweet['full_text_tokenized']) for tweet in filtered_tweets]

plot = px.scatter(df, x='x', y='y', hover_data=['text'])
plot.show()