# Part 3: Ranking

## Prepare the data & the Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Imports
from collections import defaultdict
from array import array
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import re
import matplotlib.pyplot as plt


nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import json
import csv


#Path of the resulting JSON file of part 1
docs_path = '/content/drive/Shareddrives/IRWA/IRWA_data_2023/Rus_Ukr_war_data_clean.json'
csv_path = '/content/drive/Shareddrives/IRWA/IRWA_data_2023/Rus_Ukr_war_data_ids.csv'

doc_ids = []

#First we will load the ids into a map
with open(csv_path, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    for row in reader:
        doc_id = row[0]
        doc_ids.append(doc_id)

#Then we will load the data
with open(docs_path) as doc_fp:
    tweets = [json.loads(line) for line in doc_fp.readlines()]

#Get the tweet IDs for all tweets
for tweet, doc_id in zip(tweets, doc_ids):
    tweet['id'] = doc_id

tweets[430]


{'full_text_tokenized': ['artilleri',
  'ukrain',
  '#russian',
  '#russia',
  '#ukrain',
  '#ukrainian',
  '#ukrainewar',
  '#ukrainerussiawar',
  '#russiaisaterroristst',
  'https//tco/oyvpwjcxz'],
 'created_at': 1664548988000,
 'hashtags': ['Russians',
  'Russia',
  'Ukraine',
  'Ukrainian',
  'UkraineWar',
  'UkraineRussiaWar',
  'RussiaIsATerroristState'],
 'favorite_count': 26,
 'retweet_count': 4,
 'url': 'https://twitter.com/infussambas/status/1575858789647286272',
 'id': 'doc_431'}

In [None]:
queries = {
    "Q1": "Putin war",
    "Q2": "Ukraine  war",
    "Q3": "Russia Ukraine",
    "Q4": "Russian army",
    "Q5": "Negotiations Trump",
}

In [None]:
def remove_emojis(text):
    """
    Remove all emojis from the given text

    Argument:
    text -- the text to clean

    Returns:
    text - the same text without emojis
    """

    #All emojis are represented in Unicode, so we can remove them by using RE
    #Regular expression found at https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emojis_expression = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002500-\U00002BEF"  # chinese char
                            u"\U00002702-\U000027B0"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # dingbats
                            u"\u3030"
                            "]+", flags=re.UNICODE)

    return re.sub(emojis_expression, "", text)

def preprocess(text):
    """
    Clean and tokenize the text

    Argument:
    text -- the text to clean

    Returns:
    text - the list of tokens obtained from the input text.
    """
    stemmer = PorterStemmer()
    stop_words = stopwords.words("english")
    punctuation = r"[!¡?¿.:,;()\[\]{}\-\'\"]"

    text = remove_emojis(text) #Remove emojis
    text = text.strip() #Remove whitespaces at the beginning and at the end
    text = text.lower() ## Transform in lowercase
    text = re.sub(punctuation, "", text) ## Eliminate puntuation marks
    text = text.split() ## Tokenize the text to get a list of terms
    text = [token for token in text if token not in stop_words] ## Eliminate the stopwords
    text = [stemmer.stem(token) for token in text] ## Perform stemming

    return text

## TF-IDF + cosine similarity ranking

In [None]:
index_path = '/content/drive/Shareddrives/IRWA/IRWA_data_2023/tf-idf_idx.json'
#Load the tf-idf index from the previous practice
with open(index_path) as fp:
    tf_idf_index = json.load(fp)

tf = tf_idf_index["tf"]
idf = tf_idf_index["idf"]
index = tf_idf_index["index"]

In [None]:
print(index)

{'@melsimmonsfcdo': [[1, [0]]], 'wrong': [[1, [1]], [25, [10]], [256, [1, 3]], [327, [6]], [445, [4]], [1322, [4]], [1498, [6]], [1512, [0]], [1596, [6]], [2093, [8]], [2157, [1]], [2207, [8]], [2236, [6]], [2237, [6]], [3714, [4]], [3844, [5]], [3900, [6]], [3947, [8]], [3994, [3]]], 'dictat': [[1, [2]], [199, [2]], [225, [1]], [240, [6]], [666, [9]], [1495, [5]], [1862, [1]], [2674, [16]], [2963, [21]], [3115, [2]]], 'putin': [[1, [3]], [9, [4]], [16, [4, 12]], [22, [2]], [23, [1]], [47, [9]], [50, [11]], [52, [4]], [53, [0]], [56, [1]], [62, [5]], [64, [0, 9]], [73, [0]], [94, [7]], [97, [7]], [99, [3]], [108, [0]], [112, [5]], [118, [2]], [119, [15]], [131, [1]], [133, [0]], [136, [1]], [146, [3]], [152, [1]], [154, [4]], [160, [5, 16]], [163, [11]], [177, [3, 16]], [179, [3, 16]], [183, [5]], [189, [7]], [191, [3]], [194, [2]], [200, [21]], [210, [16]], [224, [5]], [228, [3]], [232, [0]], [240, [7]], [245, [5]], [246, [0]], [247, [4]], [258, [1]], [272, [1]], [281, [4]], [284, [4]

In [None]:
def rank_documents(terms, docs, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights

    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies

    Returns:
    doc_scores -- list of scores
    """

    doc_vectors = defaultdict(lambda: [0] * len(terms)) # Call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query.

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex]=query_terms_count[term]/idf[term] * query_norm

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):

            #tf[term][0] will contain the tf of the term "term" in the doc 26
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]
    # Calculate the score of each doc
    # compute the cosine similarity between queyVector and each docVector:

    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)

    return doc_scores

In [None]:
def search_tf_idf(query, index):
    """
    output is the list of documents that contain any of the query terms.
    So, we will get the list of documents for each query term, and take the union of them.

    Argument:
    query -- name of the query
    index -- created index

    Returns:
    doc_scores -- list of scores
    """
    query = preprocess(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs=[posting[0] for posting in index[term]]

            # docs = docs Union term_docs
            docs = docs.union(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    doc_scores = rank_documents(query, docs, index, idf, tf)
    return doc_scores

In [None]:
#Search tf-idf of query 1 with union (considers documents that contain at least one term of the query)
ranked_docs1 = search_tf_idf(queries["Q1"], index)
top = 20

print("\n======================\nTop {} results (union based) out of {} for the query '{}':\n".format(top, len(ranked_docs1), queries["Q1"]))
for score, d_id in ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (union based) out of 812 for the query 'Putin war':

tweet id = 1071 - tweet: https://twitter.com/thedogh06218757/status/1575797057650331648 - with score 1.0888030216710458
tweet id = 722 - tweet: https://twitter.com/knittingknots/status/1575823777883394048 - with score 0.8943486568447453
tweet id = 903 - tweet: https://twitter.com/Ukrinform_News/status/1575812377211469824 - with score 0.8527707781109763
tweet id = 3879 - tweet: https://twitter.com/djjarman/status/1575169420661760000 - with score 0.8165669109142253
tweet id = 2990 - tweet: https://twitter.com/toddxz/status/1575349508807933952 - with score 0.8165669109142251
tweet id = 133 - tweet: https://twitter.com/NightwishGreece/status/1575905162195091456 - with score 0.8165669109142251
tweet id = 3974 - tweet: https://twitter.com/LittleLeighXoxo/status/1575157054938955776 - with score 0.7619782674066236
tweet id = 673 - tweet: https://twitter.com/Amannatt/status/1575826327109718016 - with score 0.7560385704446566
t

In [None]:
#Search tf-idf of query 2 with union (considers documents that contain at least one term of the query)
ranked_docs1 = search_tf_idf(queries["Q2"], index)
top = 20

print("\n======================\nTop {} results (union based) out of {} for the query '{}':\n".format(top, len(ranked_docs1), queries["Q2"]))
for score, d_id in ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (union based) out of 1056 for the query 'Ukraine  war':

tweet id = 13 - tweet: https://twitter.com/FreeCiviliansUA/status/1575916461620690944 - with score 1.241820929119815
tweet id = 483 - tweet: https://twitter.com/FreeCiviliansUA/status/1575850767008702464 - with score 1.237719709788933
tweet id = 321 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575880127262904320 - with score 1.1470686204408176
tweet id = 1622 - tweet: https://twitter.com/naveed_a_khan19/status/1575664181419728896 - with score 1.1313708498984762
tweet id = 1503 - tweet: https://twitter.com/KarenBarbraAlex/status/1575692732529790976 - with score 1.0606601717798214
tweet id = 529 - tweet: https://twitter.com/GlobalUA/status/1575843813385723904 - with score 1.050477834130735
tweet id = 1035 - tweet: https://twitter.com/DrACactivism/status/1575799412764585984 - with score 1.0159710232088317
tweet id = 441 - tweet: https://twitter.com/Mesoy640/status/1575857537110315008 - with score 1.000131831310

In [None]:
#Search tf-idf of query 3 with union (considers documents that contain at least one term of the query)
ranked_docs1 = search_tf_idf(queries["Q3"], index)
top = 20

print("\n======================\nTop {} results (union based) out of {} for the query '{}':\n".format(top, len(ranked_docs1), queries["Q3"]))
for score, d_id in ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (union based) out of 1136 for the query 'Russia Ukraine':

tweet id = 13 - tweet: https://twitter.com/FreeCiviliansUA/status/1575916461620690944 - with score 1.241820929119815
tweet id = 483 - tweet: https://twitter.com/FreeCiviliansUA/status/1575850767008702464 - with score 1.237719709788933
tweet id = 568 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575837750321811456 - with score 1.1794541110191612
tweet id = 48 - tweet: https://twitter.com/Shadi_Alkasim/status/1575912174157172736 - with score 1.1767671052506525
tweet id = 321 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575880127262904320 - with score 1.1470686204408176
tweet id = 813 - tweet: https://twitter.com/jonsamuelhood/status/1575818370577932288 - with score 1.1093091183254558
tweet id = 1607 - tweet: https://twitter.com/uhhjesuz/status/1575667412363399168 - with score 1.0954498254141996
tweet id = 3871 - tweet: https://twitter.com/Officialcasas_/status/1575170838667489280 - with score 1.06914545

In [None]:
#Search tf-idf of query 4 with union (considers documents that contain at least one term of the query)
ranked_docs1 = search_tf_idf(queries["Q4"], index)
top = 20

print("\n======================\nTop {} results (union based) out of {} for the query '{}':\n".format(top, len(ranked_docs1), queries["Q4"]))
for score, d_id in ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (union based) out of 890 for the query 'Russian army':

tweet id = 3944 - tweet: https://twitter.com/danish_greek/status/1575160028402954240 - with score 1.4743176387739516
tweet id = 1827 - tweet: https://twitter.com/Javelinzz/status/1575629399151370240 - with score 1.1545639523213949
tweet id = 3216 - tweet: https://twitter.com/InGlore070/status/1575271464021987328 - with score 0.9847169034803861
tweet id = 1984 - tweet: https://twitter.com/InGlore070/status/1575597493508710400 - with score 0.9847169034803861
tweet id = 1153 - tweet: https://twitter.com/AgevRabin/status/1575788212793929728 - with score 0.9561497895204497
tweet id = 2273 - tweet: https://twitter.com/Shadi_Alkasim/status/1575522714441351168 - with score 0.9427147606779053
tweet id = 2264 - tweet: https://twitter.com/Shadi_Alkasim/status/1575524304552398848 - with score 0.9427147606779053
tweet id = 2176 - tweet: https://twitter.com/Shadi_Alkasim/status/1575544454055723008 - with score 0.9427147606779053

In [None]:
#Search tf-idf of query 5 with union (considers documents that contain at least one term of the query)
ranked_docs1 = search_tf_idf(queries["Q5"], index)
top = 20

print("\n======================\nTop {} results (union based) out of {} for the query '{}':\n".format(top, len(ranked_docs1), queries["Q5"]))
for score, d_id in ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (union based) out of 49 for the query 'Negotiations Trump':

tweet id = 3795 - tweet: https://twitter.com/LynxLV/status/1575179206740189184 - with score 0.648841182416776
tweet id = 3089 - tweet: https://twitter.com/RealDonVader1/status/1575315293492203520 - with score 0.565685424949238
tweet id = 1218 - tweet: https://twitter.com/PatilSushmit/status/1575776384324374528 - with score 0.565685424949238
tweet id = 699 - tweet: https://twitter.com/Amannatt/status/1575825005291524096 - with score 0.5345727265770299
tweet id = 515 - tweet: https://twitter.com/u_me_reality/status/1575845160134451200 - with score 0.5345727265770299
tweet id = 1096 - tweet: https://twitter.com/ml_altair/status/1575793925142421504 - with score 0.5000659156551265
tweet id = 2214 - tweet: https://twitter.com/potuspl/status/1575534709144227840 - with score 0.49242916241831175
tweet id = 24 - tweet: https://twitter.com/Shadi_Alkasim/status/1575914711388139520 - with score 0.4082834554571126
tweet id 

##Taking into account the full query


In [None]:
def search_tf_idf_intersection(query, index):
    """
    output is the list of documents that contain all of the query terms.
    So, we will get the list of documents for each query term, and take the intersection of them.

    Argument:
    query -- name of the query
    index -- created index

    Returns:
    doc_scores -- list of scores
    """
    query = preprocess(query)
    docs = set()

    # Initialize docs with the document IDs containing the first term
    first_term = query[0]
    try:
        # store in term_docs the ids of the docs that contain "term"
        docs = set(posting[0] for posting in index[first_term])
    except:

        return []

    # Iterate through the second term of the query
    for term in query[1:]:
        try:
            term_docs = set(posting[0] for posting in index[term])
            # Take the intersection of docs and term_docs
            docs = docs.intersection(term_docs)
        except:

            return []

    docs = list(docs)
    doc_scores = rank_documents(query, docs, index, idf, tf)
    return doc_scores

In [None]:
#Search tf-idf of query 1 (considering documents that contain both terms of the query - intersection - conjunctive queries)
ranked_docs1 = search_tf_idf_intersection(queries["Q1"], index)
top = 20

print("\n======================\nTop {} results (intersection based, conjunctive queries) out of {} for the query '{}':\n".format(top, len(ranked_docs1), queries["Q1"]))
for score, d_id in ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (intersection based, conjunctive queries) out of 41 for the query 'Putin war':

tweet id = 1071 - tweet: https://twitter.com/thedogh06218757/status/1575797057650331648 - with score 1.0888030216710458
tweet id = 722 - tweet: https://twitter.com/knittingknots/status/1575823777883394048 - with score 0.8943486568447453
tweet id = 903 - tweet: https://twitter.com/Ukrinform_News/status/1575812377211469824 - with score 0.8527707781109763
tweet id = 3879 - tweet: https://twitter.com/djjarman/status/1575169420661760000 - with score 0.8165669109142253
tweet id = 2990 - tweet: https://twitter.com/toddxz/status/1575349508807933952 - with score 0.8165669109142251
tweet id = 133 - tweet: https://twitter.com/NightwishGreece/status/1575905162195091456 - with score 0.8165669109142251
tweet id = 673 - tweet: https://twitter.com/Amannatt/status/1575826327109718016 - with score 0.7560385704446566
tweet id = 99 - tweet: https://twitter.com/knittingknots/status/1575908078352039936 - with sco

In [None]:
#Search tf-idf of query 2 (considering documents that contain both terms of the query - intersection - conjunctive queries)
ranked_docs2 = search_tf_idf_intersection(queries["Q2"], index)
top = 20

print("\n======================\nTop {} results (intersection based, conjunctive queries) out of {} for the query '{}':\n".format(top, len(ranked_docs2), queries["Q2"]))
for score, d_id in ranked_docs2[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (intersection based, conjunctive queries) out of 182 for the query 'Ukraine  war':

tweet id = 1622 - tweet: https://twitter.com/naveed_a_khan19/status/1575664181419728896 - with score 1.1313708498984762
tweet id = 1503 - tweet: https://twitter.com/KarenBarbraAlex/status/1575692732529790976 - with score 1.0606601717798214
tweet id = 1035 - tweet: https://twitter.com/DrACactivism/status/1575799412764585984 - with score 1.0159710232088317
tweet id = 441 - tweet: https://twitter.com/Mesoy640/status/1575857537110315008 - with score 1.000131831310253
tweet id = 2792 - tweet: https://twitter.com/BlogUkraine/status/1575404760701833216 - with score 0.9999904099540156
tweet id = 1384 - tweet: https://twitter.com/BlogUkraine/status/1575743364573450240 - with score 0.9999904099540156
tweet id = 567 - tweet: https://twitter.com/knittingknots/status/1575837833268715520 - with score 0.9999904099540156
tweet id = 88 - tweet: https://twitter.com/TCherrn/status/1575908646126747648 - wit

In [None]:
#Search tf-idf of query 3 (considering documents that contain both terms of the query - intersection - conjunctive queries)
ranked_docs3 = search_tf_idf_intersection(queries["Q3"], index)
top = 20

print("\n======================\nTop {} results (intersection based, conjunctive queries) out of {} for the query '{}':\n".format(top, len(ranked_docs3), queries["Q3"]))
for score, d_id in ranked_docs3[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (intersection based, conjunctive queries) out of 194 for the query 'Russia Ukraine':

tweet id = 568 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575837750321811456 - with score 1.1794541110191612
tweet id = 48 - tweet: https://twitter.com/Shadi_Alkasim/status/1575912174157172736 - with score 1.1767671052506525
tweet id = 813 - tweet: https://twitter.com/jonsamuelhood/status/1575818370577932288 - with score 1.1093091183254558
tweet id = 1607 - tweet: https://twitter.com/uhhjesuz/status/1575667412363399168 - with score 1.0954498254141996
tweet id = 3871 - tweet: https://twitter.com/Officialcasas_/status/1575170838667489280 - with score 1.0691454531540598
tweet id = 1478 - tweet: https://twitter.com/Transctimes/status/1575702428032196608 - with score 1.028981787982664
tweet id = 3063 - tweet: https://twitter.com/NEWS_ALL_TIME/status/1575326053219946496 - with score 1.000131831310253
tweet id = 1601 - tweet: https://twitter.com/angeltveit/status/1575668478375124992 -

In [None]:
#Search tf-idf of query 4 (considering documents that contain both terms of the query - intersection - conjunctive queries)
ranked_docs4 = search_tf_idf_intersection(queries["Q4"], index)
top = 20

print("\n======================\nTop {} results (intersection based, conjunctive queries) out of {} for the query '{}':\n".format(top, len(ranked_docs4), queries["Q4"]))
for score, d_id in ranked_docs4[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (intersection based, conjunctive queries) out of 71 for the query 'Russian army':

tweet id = 3944 - tweet: https://twitter.com/danish_greek/status/1575160028402954240 - with score 1.4743176387739516
tweet id = 1827 - tweet: https://twitter.com/Javelinzz/status/1575629399151370240 - with score 1.1545639523213949
tweet id = 3216 - tweet: https://twitter.com/InGlore070/status/1575271464021987328 - with score 0.9847169034803861
tweet id = 1984 - tweet: https://twitter.com/InGlore070/status/1575597493508710400 - with score 0.9847169034803861
tweet id = 1153 - tweet: https://twitter.com/AgevRabin/status/1575788212793929728 - with score 0.9561497895204497
tweet id = 2273 - tweet: https://twitter.com/Shadi_Alkasim/status/1575522714441351168 - with score 0.9427147606779053
tweet id = 2264 - tweet: https://twitter.com/Shadi_Alkasim/status/1575524304552398848 - with score 0.9427147606779053
tweet id = 2176 - tweet: https://twitter.com/Shadi_Alkasim/status/1575544454055723008 - wi

In [None]:
#Search tf-idf of query 5 (considering documents that contain both terms of the query - intersection - conjunctive queries)
ranked_docs4 = search_tf_idf_intersection(queries["Q5"], index)
top = 20

print("\n======================\nTop {} results (intersection based, conjunctive queries) out of {} for the query '{}':\n".format(top, len(ranked_docs4), queries["Q5"]))
for score, d_id in ranked_docs4[:top]:
        print("tweet id = {} - tweet: {} - with score {}".format(d_id, tweets[d_id-1]["url"], score))


Top 20 results (intersection based, conjunctive queries) out of 2 for the query 'Negotiations Trump':

tweet id = 3795 - tweet: https://twitter.com/LynxLV/status/1575179206740189184 - with score 0.648841182416776
tweet id = 2214 - tweet: https://twitter.com/potuspl/status/1575534709144227840 - with score 0.49242916241831175


## Our score + cosine similarity ranking

In [None]:
def our_rank_documents_likes(terms, docs, index, idf, tf, tweets):
    """
    Perform the ranking of the results of a search based on the tf-idf weights and number of likes

    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies

    Returns:
    doc_scores -- list of scores
    """
    pre_scores = rank_documents(terms, docs, index, idf, tf)
    new_scores = []
    new_scores = [[tweets[id-1]["favorite_count"] * score, id]for score, id in pre_scores]

    new_scores.sort(reverse=True)
    return new_scores

In [None]:
def search_tf_idf_likes(query, index):
    """
    output is the list of documents that contain any of the query terms.
    So, we will get the list of documents for each query term, and take the union of them.

    Argument:
    query -- name of the query
    index -- created index

    Returns:
    doc_scores -- list of scores
    """
    query = preprocess(query)
    docs = set()

    # Initialize docs with the document IDs containing the first term
    first_term = query[0]
    try:
        # store in term_docs the ids of the docs that contain "term"
        docs = set(posting[0] for posting in index[first_term])
    except:
        return []

    # Iterate through the second term of the query
    for term in query[1:]:
        try:
            term_docs = set(posting[0] for posting in index[term])
            # Take the intersection of docs and term_docs
            docs = docs.intersection(term_docs)
        except:
            # Term is not in the index
            return []

    docs = list(docs)
    doc_scores = our_rank_documents_likes(query, docs, index, idf, tf, tweets)
    return doc_scores

In [None]:
#Search tf-idf-likes of query 1
our_ranked_docs1 = search_tf_idf_likes(queries["Q1"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q1"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], score))

[[17.96758330995017, 1856], [10.584539986225192, 673], [6.949445445501389, 633], [5.969395446776835, 903], [4.355777772109133, 531], [4.221144640971215, 2724], [4.040973833124882, 3392], [3.0988247578719266, 2072], [2.5297452203729924, 189], [2.449700732742676, 3879], [2.262741699796952, 3629], [1.9465235472503282, 1648], [1.697056274847714, 3089], [1.6037181797310898, 240], [1.2060413259917755, 3204], [0.6858935777509512, 2333], [0.6858935777509512, 194], [0.6171627986196186, 1862], [0.5345727265770299, 3331], [0.5164707929786544, 400], [0.5000659156551265, 247], [0.4850752518939717, 22], [0.0, 3901], [0.0, 3693], [0.0, 3608], [0.0, 3544], [0.0, 3422], [0.0, 2990], [0.0, 2963], [0.0, 2696], [0.0, 2693], [0.0, 2008], [0.0, 1949], [0.0, 1348], [0.0, 1213], [0.0, 1100], [0.0, 1071], [0.0, 722], [0.0, 517], [0.0, 133], [0.0, 99]]

Top 20 results out of 41 for the query 'Putin war':

tweet id = 1856 - tweet: https://twitter.com/aeberman12/status/1575624308101038080 - likes 33 - with score 

In [None]:
#Search tf-idf-likes of query 2
our_ranked_docs1 = search_tf_idf_likes(queries["Q2"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q2"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], score))

[[28.459633729196163, 1646], [28.389488736502464, 2532], [16.20264478410855, 3094], [16.08498221571911, 2715], [13.393592385166873, 375], [13.277485451696041, 3491], [12.999875329402203, 1384], [9.999904099540156, 2792], [9.454300507176615, 2864], [8.982236020056478, 3828], [8.167083322704626, 3337], [7.405953583435425, 3092], [6.949445445501389, 633], [6.714120308722507, 288], [6.171627986196188, 2009], [6.030206629958879, 2124], [4.427054135652737, 869], [4.355777772109134, 531], [4.242640687119286, 1540], [3.770859042711621, 3497], [3.3941125496954285, 1622], [3.2668333290818503, 1473], [3.2074363594621795, 2115], [3.0241542817786264, 3333], [2.9197853208754925, 1648], [2.722361107568209, 3323], [2.722361107568209, 2482], [2.5297452203729924, 189], [2.3538170532137794, 3305], [2.2627416997969525, 3629], [1.8973089152797442, 2766], [1.6970562748477143, 3089], [1.6331338218284506, 3301], [1.5757167511961025, 1994], [1.5757167511961025, 942], [1.5494123789359633, 1536], [1.512077140889

In [None]:
#Search tf-idf-likes of query 3
our_ranked_docs1 = search_tf_idf_likes(queries["Q3"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q3"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], score))

[[220.72027047754358, 2318], [108.6206525570529, 2467], [62.66606569045163, 2721], [38.18376618407357, 1343], [31.61530987227549, 418], [28.459633729196163, 1646], [24.768536331402387, 2504], [21.631527807346387, 2205], [21.335391487385465, 3013], [19.613727896552454, 490], [19.460992831816164, 2803], [17.51871192525295, 3764], [16.08498221571911, 2715], [15.810907627331202, 1709], [14.034231128921883, 1419], [13.393592385166873, 375], [13.333205466053538, 403], [10.615086999172451, 1800], [9.8746047779139, 470], [8.221671966212226, 130], [7.2028725158786475, 1478], [6.949445445501389, 633], [6.6558547099527345, 813], [5.421953376782209, 111], [5.1166246686658585, 1689], [4.427054135652737, 869], [4.355777772109134, 531], [3.9392918779902564, 505], [3.9224627365980163, 3251], [3.4110831124439054, 2339], [3.2074363594621795, 2115], [3.1621815254662407, 239], [3.151433502392205, 1375], [2.722361107568209, 3323], [2.595364729667104, 1867], [2.595364729667104, 1685], [2.3091279046427897, 2

In [None]:
#Search tf-idf-likes of query 4
our_ranked_docs1 = search_tf_idf_likes(queries["Q4"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q4"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], score))

[[25.32686784582729, 3553], [15.87680997933779, 884], [14.55225755681915, 1484], [13.333205466053538, 1015], [7.622611101190984, 527], [7.210509269115462, 1183], [6.693048526643148, 1153], [6.48841182416776, 2406], [4.666621913118739, 3775], [3.770859042711621, 2264], [3.463691856964185, 1207], [3.4110831124439054, 3055], [3.4110831124439054, 368], [2.921199534437865, 1073], [2.82842712474619, 3375], [2.82842712474619, 3229], [2.828144282033716, 2273], [2.7732727958136394, 3394], [2.595364729667104, 2823], [2.4120826519835514, 49], [2.121320343559643, 494], [2.121320343559643, 20], [1.8854295213558105, 2176], [1.6639636774881836, 1928], [1.6331338218284506, 3709], [1.5239565348132476, 3140], [1.4743176387739516, 3944], [1.2343255972392375, 3707], [1.1545639523213949, 1827], [1.131370849898476, 1988], [1.0889444430272834, 1970], [1.050477834130735, 3541], [0.9847169034803861, 1984], [0.9427147606779053, 2163], [0.8943486568447454, 2396], [0.8527707781109763, 2758], [0.8527707781109763, 

In [None]:
#Search tf-idf-likes of query 5
our_ranked_docs1 = search_tf_idf_likes(queries["Q5"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q5"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], score))

[[21.174453983987405, 2214], [1.9465235472503282, 3795]]

Top 20 results out of 2 for the query 'Negotiations Trump':

tweet id = 2214 - tweet: https://twitter.com/potuspl/status/1575534709144227840 - likes 43 - with score 21.174453983987405
tweet id = 3795 - tweet: https://twitter.com/LynxLV/status/1575179206740189184 - likes 3 - with score 1.9465235472503282


In [None]:
def our_rank_documents_rt(terms, docs, index, idf, tf, tweets):
    """
    Perform the ranking of the results of a search based on the tf-idf weights and number of likes

    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies

    Returns:
    doc_scores -- list of scores
    """
    pre_scores = rank_documents(terms, docs, index, idf, tf)
    new_scores = []
    new_scores = [[tweets[id-1]["retweet_count"] * score, id]for score, id in pre_scores]

    new_scores.sort(reverse=True)
    return new_scores

In [None]:
def search_tf_idf_rt(query, index):
    """
    output is the list of documents that contain any of the query terms.
    So, we will get the list of documents for each query term, and take the union of them.

    Argument:
    query -- name of the query
    index -- created index

    Returns:
    doc_scores -- list of scores
    """
    query = preprocess(query)
    docs = set()

    # Initialize docs with the document IDs containing the first term
    first_term = query[0]
    try:
        # store in term_docs the ids of the docs that contain "term"
        docs = set(posting[0] for posting in index[first_term])
    except:
        return []

    # Iterate through the second term of the query
    for term in query[1:]:
        try:
            term_docs = set(posting[0] for posting in index[term])
            # Take the intersection of docs and term_docs
            docs = docs.intersection(term_docs)
        except:
            # Term is not in the index
            return []

    docs = list(docs)
    doc_scores = our_rank_documents_rt(query, docs, index, idf, tf, tweets)
    return doc_scores

In [None]:
#Search tf-id-rtw of query 1
our_ranked_docs1 = search_tf_idf_rt(queries["Q1"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q1"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["retweet_count"], score))

[[3.26683332908185, 1856], [2.1778888860545664, 531], [1.8090619889876631, 2724], [1.5494123789359633, 2072], [1.3717871555019023, 2333], [0.8527707781109763, 903], [0.648841182416776, 1648], [0.6324363050932481, 189], [0.6030206629958877, 3204], [0.5772819761606974, 3392], [0.565685424949238, 3629], [0.5345727265770299, 633], [0.0, 3901], [0.0, 3879], [0.0, 3693], [0.0, 3608], [0.0, 3544], [0.0, 3422], [0.0, 3331], [0.0, 3089], [0.0, 2990], [0.0, 2963], [0.0, 2696], [0.0, 2693], [0.0, 2008], [0.0, 1949], [0.0, 1862], [0.0, 1348], [0.0, 1213], [0.0, 1100], [0.0, 1071], [0.0, 722], [0.0, 673], [0.0, 517], [0.0, 400], [0.0, 247], [0.0, 240], [0.0, 194], [0.0, 133], [0.0, 99], [0.0, 22]]

Top 20 results out of 41 for the query 'Putin war':

tweet id = 1856 - tweet: https://twitter.com/aeberman12/status/1575624308101038080 - rt 6 - with score 3.26683332908185
tweet id = 531 - tweet: https://twitter.com/NikkeiAsia/status/1575843574301790208 - rt 4 - with score 2.1778888860545664
tweet id = 

In [None]:
#Search tf-id-rtw of query 2
our_ranked_docs1 = search_tf_idf_rt(queries["Q2"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q3"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["retweet_count"], score))

[[10.233249337331717, 3094], [8.65922964241046, 3491], [6.101200150790007, 2715], [3.9999616398160622, 2792], [3.615295550850581, 288], [3.266267643656901, 3828], [3.085813993098094, 2532], [2.9999712298620467, 1384], [2.6261945853268376, 2864], [2.5297452203729924, 1646], [2.468651194478475, 3092], [2.363575126794154, 375], [2.177888886054567, 531], [2.121320343559643, 1540], [1.8973089152797442, 2766], [1.8090619889876636, 2124], [1.5120771408893132, 3333], [1.3717871555019023, 2333], [1.2343255972392375, 2009], [1.0889444430272834, 2482], [1.0889444430272834, 1473], [1.0691454531540598, 2115], [0.9732617736251641, 1648], [0.7846056844045931, 1604], [0.7560385704446566, 2292], [0.6858935777509512, 1465], [0.6324363050932481, 1105], [0.6324363050932481, 189], [0.6030206629958879, 3204], [0.5897270555095807, 3441], [0.5656854249492381, 3629], [0.5546545591627279, 3921], [0.5546545591627279, 2463], [0.5444722215136417, 3337], [0.5345727265770299, 2397], [0.5345727265770299, 633], [0.492

In [None]:
#Search tf-id-rtw of query 3
our_ranked_docs1 = search_tf_idf_rt(queries["Q3"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q3"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["retweet_count"], score))

[[32.092465528220174, 2467], [30.989378949569158, 2318], [16.970562748477143, 1343], [8.221671966212226, 1709], [6.101200150790007, 2715], [6.101200150790007, 2205], [5.546545591627279, 813], [5.333282186421416, 403], [5.307543499586226, 2803], [5.170506205392273, 1419], [4.115927151930656, 1478], [3.538362333057484, 2504], [3.24420591208388, 3764], [3.24420591208388, 1867], [2.666641093210708, 2721], [2.5583123343329293, 2339], [2.5583123343329293, 1689], [2.5399275580220793, 3013], [2.5297452203729924, 1646], [2.363575126794154, 375], [2.3589082220383224, 1800], [2.177888886054567, 531], [1.769181166528742, 2358], [1.7055415562219527, 490], [1.697056274847714, 2960], [1.6639636774881836, 1644], [1.6639636774881836, 418], [1.3717871555019023, 2384], [1.1794541110191612, 575], [1.0691454531540598, 2115], [1.050477834130735, 1375], [1.0329415859573088, 3621], [1.000131831310253, 3063], [0.9732617736251641, 2545], [0.8943486568447452, 141], [0.8165669109142253, 3271], [0.784605684404593,

In [None]:
#Search tf-id-rtw of query 4
our_ranked_docs1 = search_tf_idf_rt(queries["Q4"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q4"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["retweet_count"], score))

[[4.824165303967103, 3553], [3.780192852223283, 884], [2.722361107568209, 527], [2.2186182366509115, 1183], [1.7055415562219527, 3055], [1.455225755681915, 1484], [1.333320546605354, 3775], [1.333320546605354, 1015], [1.297682364833552, 2406], [1.131370849898476, 3375], [1.131370849898476, 3229], [1.0889444430272834, 2035], [1.0159710232088317, 3140], [0.9427147606779053, 2264], [0.8527707781109763, 368], [0.7302998836094663, 1073], [0.7071067811865476, 20], [0.5772819761606974, 3335], [0.5546545591627279, 1928], [0.5444722215136417, 1970], [0.5252389170653675, 2967], [0.0, 3985], [0.0, 3944], [0.0, 3885], [0.0, 3709], [0.0, 3707], [0.0, 3568], [0.0, 3556], [0.0, 3541], [0.0, 3474], [0.0, 3394], [0.0, 3347], [0.0, 3291], [0.0, 3216], [0.0, 3071], [0.0, 2831], [0.0, 2823], [0.0, 2795], [0.0, 2758], [0.0, 2692], [0.0, 2598], [0.0, 2586], [0.0, 2435], [0.0, 2410], [0.0, 2396], [0.0, 2381], [0.0, 2340], [0.0, 2319], [0.0, 2273], [0.0, 2176], [0.0, 2163], [0.0, 1988], [0.0, 1984], [0.0, 184

In [None]:
#Search tf-id-rtw of query 5
our_ranked_docs1 = search_tf_idf_rt(queries["Q5"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q5"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["retweet_count"], score))

[[9.356154085947923, 2214], [0.648841182416776, 3795]]

Top 20 results out of 2 for the query 'Negotiations Trump':

tweet id = 2214 - tweet: https://twitter.com/potuspl/status/1575534709144227840 - rt 19 - with score 9.356154085947923
tweet id = 3795 - tweet: https://twitter.com/LynxLV/status/1575179206740189184 - rt 1 - with score 0.648841182416776


In [None]:
LIKES_FACTOR = 0.1

def our_rank_documents_mixed(terms, docs, index, idf, tf, tweets):
    """
    Perform the ranking of the results of a search based on the tf-idf weights and number of likes

    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies

    Returns:
    doc_scores -- list of scores
    """
    pre_scores = rank_documents(terms, docs, index, idf, tf)
    new_scores = []
    new_scores = [[((tweets[id-1]["favorite_count"]*LIKES_FACTOR) + (tweets[id-1]["retweet_count"]*(1-LIKES_FACTOR))) * score, id]for score, id in pre_scores]

    new_scores.sort(reverse=True)
    return new_scores

In [None]:
def search_tf_idf_mixed(query, index):
    """
    output is the list of documents that contain any of the query terms.
    So, we will get the list of documents for each query term, and take the union of them.

    Argument:
    query -- name of the query
    index -- created index

    Returns:
    doc_scores -- list of scores
    """
    query = preprocess(query)
    docs = set()

    # Initialize docs with the document IDs containing the first term
    first_term = query[0]
    try:
        # store in term_docs the ids of the docs that contain "term"
        docs = set(posting[0] for posting in index[first_term])
    except:
        return []

    # Iterate through the second term of the query
    for term in query[1:]:
        try:
            term_docs = set(posting[0] for posting in index[term])
            # Take the intersection of docs and term_docs
            docs = docs.intersection(term_docs)
        except:
            # Term is not in the index
            return []

    docs = list(docs)
    doc_scores = our_rank_documents_mixed(query, docs, index, idf, tf, tweets)
    return doc_scores

In [None]:
#Search tf-idf-likes-rtw of query 1
our_ranked_docs1 = search_tf_idf_mixed(queries["Q1"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q1"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], tweets[d_id-1]["retweet_count"], score))

[[4.736908327168683, 1856], [2.3956777746600233, 531], [2.0502702541860187, 2724], [1.7043536168295597, 2072], [1.3644332449775622, 903], [1.3031977977268072, 2333], [1.176059998469466, 633], [1.0584539986225194, 673], [0.9236511618571159, 3392], [0.8221671966212226, 189], [0.7786094189001314, 1648], [0.7353910524340095, 3629], [0.6633227292954765, 3204], [0.24497007327426762, 3879], [0.16970562748477144, 3089], [0.160371817973109, 240], [0.06858935777509512, 194], [0.06171627986196187, 1862], [0.053457272657702996, 3331], [0.05164707929786544, 400], [0.050006591565512654, 247], [0.048507525189397174, 22], [0.0, 3901], [0.0, 3693], [0.0, 3608], [0.0, 3544], [0.0, 3422], [0.0, 2990], [0.0, 2963], [0.0, 2696], [0.0, 2693], [0.0, 2008], [0.0, 1949], [0.0, 1348], [0.0, 1213], [0.0, 1100], [0.0, 1071], [0.0, 722], [0.0, 517], [0.0, 133], [0.0, 99]]

Top 20 results out of 41 for the query 'Putin war':

tweet id = 1856 - tweet: https://twitter.com/aeberman12/status/1575624308101038080 - likes

In [None]:
#Search tf-idf-likes-rtw of query 2
our_ranked_docs1 = search_tf_idf_mixed(queries["Q2"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q2"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], tweets[d_id-1]["retweet_count"], score))

[[10.8301888820094, 3094], [9.12105522333902, 3491], [7.099578357282917, 2715], [5.6161814674385315, 2532], [5.1227340712553096, 1646], [4.5999558857884715, 2792], [3.9999616398160622, 1384], [3.925178026637773, 288], [3.837864481296859, 3828], [3.466576852631426, 375], [3.3090051775118154, 2864], [2.9623814333741705, 3092], [2.3956777746600237, 531], [2.333452377915607, 1540], [2.2311764530847853, 2124], [1.8973089152797442, 2766], [1.7280558361349325, 2009], [1.6632848549782446, 3333], [1.3067333316327403, 1473], [1.30673333163274, 3337], [1.3031977977268072, 2333], [1.282974543784872, 2115], [1.2522861094813758, 2482], [1.176059998469466, 633], [1.167914128350197, 1648], [0.8221671966212226, 189], [0.7846056844045931, 1604], [0.7560385704446566, 2292], [0.7353910524340096, 3629], [0.6858935777509512, 1465], [0.6633227292954766, 3204], [0.6324363050932481, 1105], [0.6101200150790007, 3921], [0.5897270555095807, 3441], [0.5345727265770299, 2397], [0.4991891032464551, 2463], [0.4924291

In [None]:
#Search tf-idf-likes-rtw of query 3
our_ranked_docs1 = search_tf_idf_mixed(queries["Q3"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q3"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], tweets[d_id-1]["retweet_count"], score))

[[49.9624681023666, 2318], [39.745284231103454, 2467], [19.091883092036785, 1343], [8.980595532324124, 1709], [8.666583552934801, 2721], [7.654232916445645, 2205], [7.099578357282917, 2715], [6.72288843280922, 2803], [6.1332745143846275, 403], [6.056878697745233, 1419], [5.661379732891975, 2504], [5.657476503459824, 813], [5.1227340712553096, 1646], [4.671656513400787, 3764], [4.659098296966914, 418], [4.4246216883254545, 1478], [4.419473950958418, 3013], [3.4963601902550034, 490], [3.466576852631426, 375], [3.1845260997517353, 1800], [3.179321793842203, 1867], [2.814143567766222, 1689], [2.643589412144027, 2339], [2.3956777746600237, 531], [1.697056274847714, 2960], [1.680722108202305, 2358], [1.608498221571911, 1644], [1.542906996549047, 470], [1.391359871205146, 130], [1.282974543784872, 2115], [1.2605734009568823, 1375], [1.234608439951712, 2384], [1.1794541110191612, 575], [1.176059998469466, 633], [1.1362357445530398, 3621], [1.000131831310253, 3063], [0.9732617736251641, 2545], 

In [None]:
#Search tf-idf-likes-rtw of query 4
our_ranked_docs1 = search_tf_idf_mixed(queries["Q4"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q4"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], tweets[d_id-1]["retweet_count"], score))

[[6.874435558153122, 3553], [4.989854564934733, 884], [3.212386106930486, 527], [2.7649289357956386, 1484], [2.717807339897367, 1183], [2.5333090385501724, 1015], [1.8760957118441481, 3055], [1.8167553107669727, 2406], [1.6666506832566923, 3775], [1.3010764773832473, 3375], [1.3010764773832473, 3229], [1.225529188881277, 2264], [1.1086020115442692, 368], [1.0667695743692733, 3140], [1.0344972208759193, 2035], [0.9493898486923062, 1073], [0.8485281374238572, 20], [0.6693048526643148, 1153], [0.6655854709952735, 1928], [0.5989194436650059, 1970], [0.5772819761606974, 3335], [0.5252389170653675, 2967], [0.3463691856964185, 1207], [0.2828144282033716, 2273], [0.27732727958136394, 3394], [0.2595364729667104, 2823], [0.24120826519835514, 49], [0.2121320343559643, 494], [0.18854295213558106, 2176], [0.16331338218284508, 3709], [0.14743176387739518, 3944], [0.12343255972392375, 3707], [0.11545639523213949, 1827], [0.1131370849898476, 1988], [0.10504778341307351, 3541], [0.09847169034803861, 19

In [None]:
#Search tf-idf-likes-rtw of query 5
our_ranked_docs1 = search_tf_idf_mixed(queries["Q5"], index)
top = 20
print(our_ranked_docs1)
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q5"]))
for score, d_id in our_ranked_docs1[:top]:
        print("tweet id = {} - tweet: {} - likes {} - rt {} - with score {}".format(d_id, tweets[d_id-1]["url"], tweets[d_id-1]["favorite_count"], tweets[d_id-1]["retweet_count"], score))

[[10.537984075751872, 2214], [0.7786094189001314, 3795]]

Top 20 results out of 2 for the query 'Negotiations Trump':

tweet id = 2214 - tweet: https://twitter.com/potuspl/status/1575534709144227840 - likes 43 - rt 19 - with score 10.537984075751872
tweet id = 3795 - tweet: https://twitter.com/LynxLV/status/1575179206740189184 - likes 3 - rt 1 - with score 0.7786094189001314


## Word2vec + cosine similarity ranking

In [None]:
from gensim.models import Word2Vec
import numpy as np
import plotly.express as px
import pandas as pd

#Create Word2Vec model
model = Word2Vec(sentences=[tweet['full_text_tokenized'] for tweet in tweets], vector_size=100, window=5, min_count=1, workers=4)

tweet_vectors = []
filtered_tweets = []

#Iterate troguh each tweet
for tweet in tweets:
    words = [word for word in tweet['full_text_tokenized'] if word in model.wv.key_to_index]
    #Check if the word exits in the model created
    if words:
        tweet_vector = np.mean([model.wv[word] for word in words], axis=0)
        tweet_vector /= la.norm(tweet_vector)
        tweet_vectors.append(tweet_vector)
        filtered_tweets.append(tweet)

#Transform into an array
tweet_vectors_np = np.array(tweet_vectors)



In [None]:
from numpy import dot
from numpy.linalg import norm
def rank_documents_word2vec(query_vector, tweet_vectors):
    """
    Rank documents based on cosine similarity with the query vector.

    Arguments:
    query_vector -- vector representation of the query
    tweet_vectors -- list of tweet vectors

    Returns:
    ranked_indices -- list of tuples containing score and index of ranked tweets
    """
    #Calculate the cosine similarity of each tweet
    cos_similarities = [(dot(query_vector, tweet_vec) / (norm(query_vector) * norm(tweet_vec)), i)
                        for i, tweet_vec in enumerate(tweet_vectors)]
    #Sort them in descending order
    ranked_indices = sorted(cos_similarities, key=lambda x: x[0], reverse=True)
    return ranked_indices


In [None]:
def search_word2vec(query, model, tweet_vectors, tweets):
    """
    Search for documents that contain the query terms using Word2Vec representations.

    Arguments:
    query -- the search query
    model -- Word2Vec model
    tweet_vectors -- list of tweet vectors
    tweets -- list of original tweets

    Returns:
    filtered_ranked_tweet_indices -- list of indices of ranked tweets based on the query
    """
    query_preprocessed = preprocess(query)
    query_words = set(word for word in query_preprocessed if word in model.wv.key_to_index)

    relevant_tweet_indices = set()
    for word in query_words:
        relevant_tweet_indices.update([i for i, tweet in enumerate(tweets) if word in tweet['full_text_tokenized']])
    if not relevant_tweet_indices:
        return []

    query_vector = np.mean([model.wv[word] for word in query_words], axis=0)
    query_vector /= norm(query_vector)
    relevant_tweet_vectors = [tweet_vectors[i] for i in relevant_tweet_indices]

    ranked_tweet_indices = rank_documents_word2vec(query_vector, relevant_tweet_vectors)

    filtered_ranked_tweet_indices = [(score, list(relevant_tweet_indices)[index]) for score, index in ranked_tweet_indices]
    return filtered_ranked_tweet_indices


In [None]:
#Search tf-idf-word2vec of query 1
our_ranked_docs1 = search_word2vec(queries["Q1"], model, tweet_vectors_np, tweets)
top = 20
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q1"]))
for score, idx in our_ranked_docs1[:top]:
    tweet = tweets[idx]
    print("tweet id = {} - tweet: {} - rt {} - with score {} - text {}".format(tweet['id'], tweet["url"], tweet["retweet_count"], score, tweet["full_text_tokenized"]))


Top 20 results out of 812 for the query 'Putin war':

tweet id = doc_3693 - tweet: https://twitter.com/deplorable_im/status/1575189180543475712 - rt 0 - with score 0.999835729598999 - text ['#ukrainerussiawar', 'pro', 'russian', 'war', 'began', 'saw', 'possibl', 'nuclear', 'war', 'russia', 'us', 'got', 'involv', 'ukrain', 'reason', 'would', 'involv', 'us', 'inth', 'wari', 'influenc', 'putin', 'warn', 'use', 'nuke', 'take', 'serious']
tweet id = doc_129 - tweet: https://twitter.com/JohnHarpur1/status/1575905281275330560 - rt 0 - with score 0.999795138835907 - text ['avoid', 'nuclear', 'war', 'everyon', 'top', 'prioriti', 'right', 'now#ukrainerussiawar']
tweet id = doc_2546 - tweet: https://twitter.com/dml_lawrence/status/1575462082321207296 - rt 0 - with score 0.9997915625572205 - text ['seem', 'usa', 'take', 'everyth', 'grantedthey', 'stop', 'war', 'want', 'escalate#ukrainerussiawar']
tweet id = doc_198 - tweet: https://twitter.com/kk4dice/status/1575898165542195200 - rt 0 - with scor

In [None]:
#Search tf-idf-word2vec of query 2
our_ranked_docs1 = search_word2vec(queries["Q2"], model, tweet_vectors_np, tweets)
top = 20
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q2"]))
for score, idx in our_ranked_docs1[:top]:
    tweet = tweets[idx]
    print("tweet id = {} - tweet: {} - rt {} - with score {} - text {}".format(tweet['id'], tweet["url"], tweet["retweet_count"], score, tweet["full_text_tokenized"]))



Top 20 results out of 1056 for the query 'Ukraine  war':

tweet id = doc_1386 - tweet: https://twitter.com/ndtv/status/1575743037996572672 - rt 5 - with score 0.9989551901817322 - text ['#newsalert', '|', '23', 'kill', 'russian', 'attack', 'humanitarian', 'convoy', 'ukrain', 'governor', '|', 'report', 'news', 'agenc', 'afp', '#ukrainerussiawar']
tweet id = doc_2721 - tweet: https://twitter.com/ndtv/status/1575428857573695488 - rt 4 - with score 0.9989348649978638 - text ['#ukrainerussiawar', '|', 'russia', 'formal', 'annex', 'four', 'ukrain', 'occupi', 'region', 'friday', 'kremlin', '|', 'report', 'news', 'agenc', 'afp']
tweet id = doc_704 - tweet: https://twitter.com/ndtv/status/1575824764215533568 - rt 2 - with score 0.9988915920257568 - text ['#justin', '|', 'russian', 'presid', 'vladimir', 'putin', 'begin', 'speech', 'kremlin', 'ceremoni', 'annex', 'ukrain', 'region', '|', 'report', 'news', 'agenc', 'afp', '#ukrainerussiawar']
tweet id = doc_3434 - tweet: https://twitter.com/Ukrai

In [None]:
#Search tf-idf-word2vec of query 3
our_ranked_docs1 = search_word2vec(queries["Q3"], model, tweet_vectors_np, tweets)
top = 20
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q3"]))
for score, idx in our_ranked_docs1[:top]:
    tweet = tweets[idx]
    print("tweet id = {} - tweet: {} - rt {} - with score {} - text {}".format(tweet['id'], tweet["url"], tweet["retweet_count"], score, tweet["full_text_tokenized"]))



Top 20 results out of 1136 for the query 'Russia Ukraine':

tweet id = doc_2317 - tweet: https://twitter.com/OlenaStarynets/status/1575510064533684224 - rt 0 - with score 0.9995338320732117 - text ['glori', 'ukrain', '#ukrainewillwin#ukrainewar#ukrainerussiawar#ukrainianarmy#mariupol#standwithukraine#ukrainianpow#ukraine#україна#ukrainewarnews#armukrainenow#crimea#зсу#khersonisukraine#kharkiv', 'togeth', '4', 'victori', 'https//tco/sciltqibtc']
tweet id = doc_2301 - tweet: https://twitter.com/OlenaStarynets/status/1575514037651021824 - rt 0 - with score 0.9995317459106445 - text ['glori', 'ukrain', '#ukrainewillwin#ukrainewar#ukrainerussiawar#ukrainianarmy#mariupol#standwithukraine#ukrainianpow#ukraine#україна#ukrainewarnews#armukrainenow#crimea#зсу#khersonisukraine#kharkiv', 'togeth', '4', 'victori', 'https//tco/okrncnivzk']
tweet id = doc_2338 - tweet: https://twitter.com/OlenaStarynets/status/1575504934547513344 - rt 0 - with score 0.9995260834693909 - text ['glori', 'ukrain', '#uk

In [None]:
#Search tf-idf-word2vec of query 4
our_ranked_docs1 = search_word2vec(queries["Q4"], model, tweet_vectors_np, tweets)
top = 20
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q4"]))
for score, idx in our_ranked_docs1[:top]:
    tweet = tweets[idx]
    print("tweet id = {} - tweet: {} - rt {} - with score {} - text {}".format(tweet['id'], tweet["url"], tweet["retweet_count"], score, tweet["full_text_tokenized"]))


Top 20 results out of 890 for the query 'Russian army':

tweet id = doc_73 - tweet: https://twitter.com/deijos1/status/1575909544622186496 - rt 0 - with score 0.9994915723800659 - text ['putin', 'tri', 'get', 'forc', 'beat', 'turn', 'ukrainian', 'make', 'ukrainian', 'occupi', 'land', 'tri', 'fool', 'russian', 'conscript', 'behav', 'like', 'defend', 'area', 'like', 'dombasal', 'fair#ukrainerussiawar']
tweet id = doc_3243 - tweet: https://twitter.com/crypticvalentin/status/1575261242142461952 - rt 0 - with score 0.999482274055481 - text ['ukrain', 'said', 'wednesday', 'russianstag', 'vote', 'four', 'ukrainian', 'region', 'becom', 'part', 'russia', 'null', 'worthless', 'kyiv', 'would', 'press', 'effort', 'liber', 'ukrainian', 'territori', 'occupi', 'russian', 'forces#ukrainerussiawar']
tweet id = doc_2145 - tweet: https://twitter.com/crypticvalentin/status/1575550410089238528 - rt 0 - with score 0.99753737449646 - text ['#trump', 'ask', 'head', 'group', 'negoti', 'putin', 'respons', 'nor

In [None]:
#Search tf-idf-word2vec of query 5
our_ranked_docs1 = search_word2vec(queries["Q5"], model, tweet_vectors_np, tweets)
top = 20
print("\n======================\nTop {} results out of {} for the query '{}':\n".format(top, len(our_ranked_docs1), queries["Q5"]))
for score, idx in our_ranked_docs1[:top]:
    tweet = tweets[idx]
    print("tweet id = {} - tweet: {} - rt {} - with score {} - text {}".format(tweet['id'], tweet["url"], tweet["retweet_count"], score, tweet["full_text_tokenized"]))


Top 20 results out of 49 for the query 'Negotiations Trump':

tweet id = doc_2145 - tweet: https://twitter.com/crypticvalentin/status/1575550410089238528 - rt 0 - with score 0.9993818402290344 - text ['#trump', 'ask', 'head', 'group', 'negoti', 'putin', 'respons', 'nord', 'stream', 'leak', 'former', 'presid', 'claim', 'russian', 'invas', 'ukrain', 'would', 'happen', 'white', 'house#ukrainerussiawar']
tweet id = doc_2908 - tweet: https://twitter.com/abduAlbadii/status/1575369493957595136 - rt 0 - with score 0.9978381395339966 - text ['sabotag', 'caus', 'destruct', 'string', 'nord', 'stream', 'nord', 'stream', '2', 'ga', 'pipelin', 'could', 'lead', 'escal', 'even', 'war', 'former', 'us', 'presid', 'donald', 'trump', '#ukrainerussiawar']
tweet id = doc_3878 - tweet: https://twitter.com/TimeToWakeUp247/status/1575169694822445056 - rt 0 - with score 0.9975317120552063 - text ['@tulsigabbard', 'everyth', 'say', 'absolut', 'correct', 'need', 'polit', 'leader', 'like', '@tulsigabbard', 'need'