# Part 3: Ranking

## Imports

In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np
import re
import nltk
import contractions
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot 

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/nasar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nasar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nasar/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_csv('processed_tweets.csv').drop(columns=['Unnamed: 0'])

### From these results of part 2 we had concluded that some queries could be:

In [3]:
Q1="florida hurrican"

Q2="help people in florida"

Q3="hurrican ian major damages"

Q4="storm impact in Florida"

Q5="floodings in the south"

# a) Scoring using TF-IDF + Cosine Similarity 
Classical scoring, we have also seen during the practical labs

In [4]:
def extract_tweet_vocabulary(tweet, docId):
    return {term: docId for term in tweet.split(' ')}

In [5]:
def merge_dicts(dicts):
    vocab = defaultdict(list)
    for dic in dicts:
        for term in dic:
            vocab[term].append(dic[term])
    return dict(vocab)

In [6]:
tweets_dicts = map(extract_tweet_vocabulary, df['Tweet'], df['DocID'])
vocabulary = merge_dicts(tweets_dicts)
print(f"Vocabulary has {len(vocabulary)} words")

Vocabulary has 10671 words


In [7]:
def build_TF_IDF(df, vocabulary):
    terms = vocabulary.keys()
    docs = df.DocID
    N = len(docs)
    tf_idf = dict()
    
    for doc in docs:
        tf_idf[doc] = {}
    
    for term in terms:
        for doc in vocabulary[term]:
            tf = df[df.DocID == doc].Tweet.iloc[0].split().count(term)
            if tf>0:
                df_i = len(vocabulary[term])
                tf_idf[doc][term] = (1+np.log(tf))*np.log(N/df_i)
            else:
                tf_idf[doc][term] = 0
    return tf_idf

In [8]:
def find_doc2norm(tf_idf):
    docs = tf_idf.keys()
    doc2norm = {}
    for doc in docs:
        doc2norm[doc] = np.linalg.norm(np.array(list(tf_idf[doc].values())))
    return doc2norm

In [9]:
def find_term2sum(tf_idf):
    terms = vocabulary.keys()
    docs = tf_idf.keys()
    term2sum = {term:0 for term in terms}
    for doc in docs:
        for term, value in tf_idf[doc].items():
            term2sum[term] += value
    term2sum = {t: v for t, v in sorted(term2sum.items(), key=lambda item: item[1], reverse=True)}
    return term2sum

In [10]:
tf_idf = build_TF_IDF(df, vocabulary)

In [11]:
doc2norm = find_doc2norm(tf_idf)
term2sum = find_term2sum(tf_idf)

In [12]:
def build_terms(line):
    """
    Preprocess the tweet content removing stop words, contractionas and urls
    lemmatizing and stemming words to keep a single word for each family of words
    transforming in lowercase, removing special characters [#, @, .] 
    (since it is included in another column on the dataframe)
    
    return tokenized tweet (list of words after applying the previous steps).
    
    Argument:
    line -- string (tweet) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """
    ## START CODE
    line = line.lower() ##Transform in lowercase
    line = re.sub(r"[^A-Za-z 0-9 ']+", '', line) # remove emojis and any other special character
    stop_words = set(stopwords.words("english")) # removing stopwords
    line = ' '.join([contractions.fix(x) for x in line.split(' ')]) # expaning verb abreviations: i'll -> i will 
    line = re.sub("'", '', line) 
    line = line.split(' ')
    line = [x for x in line if x and x not in stop_words]
    line = filter(lambda x:x[0:5]!='https', line) # removing links
    line = [x for x in line]
    ps = PorterStemmer() 
    lemmatizer = WordNetLemmatizer() 
    line = [lemmatizer.lemmatize(x) for x in line] # keeping the singular form of each noun: feet --> foot
    line = [ps.stem(x) for x in line] # keeping the root of each family of words: dancer --> danc
    
    ## END CODE
    return ' '.join(line)

In [13]:
def rank(query, tf_idf, vocabulary):
    N = len(tf_idf.keys())
    terms_query = build_terms(query).split()
    terms_q = list(set(terms_query))
    tf_idf_q = dict()
    
    for term_q in terms_q:
        f_iq = terms_query.count(term_q)
        if term_q not in vocabulary:
            continue
        df_i = len([doc for doc in vocabulary[term_q] if doc in tf_idf.keys()])
        if df_i>0:
            tf_idf_q[term_q] = (1+np.log(f_iq))*np.log(N/df_i)
        else:
            tf_idf_q[term_q]=0
    q_norm = np.linalg.norm(np.array(list(tf_idf_q.values())))
    doc2score = dict()
    
    doc2norm = find_doc2norm(tf_idf)
    
    for doc, dix in tf_idf.items():
        dot_product = 0
        for term, value in dix.items():
            if term in tf_idf_q.keys():
                dot_product += value * tf_idf_q[term]
        doc2score[doc] = dot_product / (doc2norm[doc] * q_norm)
    doc2score = {t: v for t, v in sorted(doc2score.items(), key=lambda item: item[1], reverse=True)}
    return doc2score

In [14]:
doc2score_Q1 = rank(Q1, tf_idf, vocabulary)
top = 10
print(f"Top {top} docs for query 1 ('{Q1}'):")
for i in range(top):
    doc, score = list(doc2score_Q1.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 1 ('florida hurrican'):
	 1) doc_640 with a score of 0.5015566349200697 and with the following information:
DocID                                                 doc_640
Tweet                               hurrican ian hurricaneian
Username                                         kadenfields8
Date                           Fri Sep 30 18:07:50 +0000 2022
Hashtags                                     ['HurricaneIan']
Likes                                                       2
Retweets                                                    0
Url         https://twitter.com/kadenfields8/status/157591...
Name: 639, dtype: object
	 2) doc_733 with a score of 0.3285217958369891 and with the following information:
DocID                                                 doc_733
Tweet       experi hurrican check hurricaneian bless hurri...
Username                                         daniel_bautz
Date                           Fri Sep 30 18:04:05 +0000 2022
Hashtags    ['hur

In [15]:
doc2score_Q2 = rank(Q2, tf_idf, vocabulary)
top = 10
print(f"Top {top} docs for query 1 ('{Q2}'):")
for i in range(top):
    doc, score = list(doc2score_Q2.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 1 ('help people in florida'):
	 1) doc_3682 with a score of 0.3702338713469479 and with the following information:
DocID                                                doc_3682
Tweet       good morn love yesterday employ sent email don...
Username                                         KandiSamples
Date                           Fri Sep 30 14:49:25 +0000 2022
Hashtags                                     ['HurricaneIan']
Likes                                                       3
Retweets                                                    0
Url         https://twitter.com/KandiSamples/status/157586...
Name: 3681, dtype: object
	 2) doc_812 with a score of 0.3457411681394173 and with the following information:
DocID                                                 doc_812
Tweet       look organ provid help hurricaneian victim cen...
Username                                         ChristieZizo
Date                           Fri Sep 30 18:00:53 +0000 2022
Hashtags 

In [16]:
doc2score_Q3 = rank(Q3, tf_idf, vocabulary)
top = 10
print(f"Top {top} docs for query 1 ('{Q3}'):")
for i in range(top):
    doc, score = list(doc2score_Q3.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 1 ('hurrican ian major damages'):
	 1) doc_1083 with a score of 0.3880462691741326 and with the following information:
DocID                                                doc_1083
Tweet                flood major concern gardenc hurricaneian
Username                                      Annette_Peagler
Date                           Fri Sep 30 17:46:24 +0000 2022
Hashtags                       ['GardenCity', 'HurricaneIan']
Likes                                                       3
Retweets                                                    2
Url         https://twitter.com/Annette_Peagler/status/157...
Name: 1082, dtype: object
	 2) doc_640 with a score of 0.3843417729584876 and with the following information:
DocID                                                 doc_640
Tweet                               hurrican ian hurricaneian
Username                                         kadenfields8
Date                           Fri Sep 30 18:07:50 +0000 2022
Hasht

In [17]:
doc2score_Q4 = rank(Q4, tf_idf, vocabulary)
top = 10
print(f"Top {top} docs for query 1 ('{Q4}'):")
for i in range(top):
    doc, score = list(doc2score_Q4.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 1 ('storm impact in Florida'):
	 1) doc_2045 with a score of 0.4365974215379948 and with the following information:
DocID                                                doc_2045
Tweet       hurrican ian left devast impact mani part flor...
Username                                        RCMHealthcare
Date                           Fri Sep 30 16:22:18 +0000 2022
Hashtags          ['HurricaneIan', 'HealthCareProfessionals']
Likes                                                       0
Retweets                                                    0
Url         https://twitter.com/RCMHealthcare/status/15758...
Name: 2044, dtype: object
	 2) doc_1065 with a score of 0.35386970430881287 and with the following information:
DocID                                                doc_1065
Tweet       thought neighbor florida impact devast hurrica...
Username                                         launitedways
Date                           Fri Sep 30 17:48:03 +0000 2022
Hashta

In [18]:
doc2score_Q5 = rank(Q5, tf_idf, vocabulary)
top = 10
print(f"Top {top} docs for query 1 ('{Q5}'):")
for i in range(top):
    doc, score = list(doc2score_Q5.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 1 ('floodings in the south'):
	 1) doc_254 with a score of 0.5127787107942562 and with the following information:
DocID                                                 doc_254
Tweet                             south carolina hurricaneian
Username                                             webgyrl2
Date                           Fri Sep 30 18:26:13 +0000 2022
Hashtags                                     ['HurricaneIan']
Likes                                                       0
Retweets                                                    0
Url         https://twitter.com/webgyrl2/status/1575914929...
Name: 253, dtype: object
	 2) doc_174 with a score of 0.4146748622439571 and with the following information:
DocID                                                 doc_174
Tweet                          south carolina hurricaneian go
Username                                      TheAstuteGaloot
Date                           Fri Sep 30 18:30:20 +0000 2022
Hashtags   

### Take into account that for future queries, the final output must return (when
present) the following information for each of the selected documents: Tweet |
Username | Date | Hashtags | Likes | Retweets | Url (here the “Url” means the
tweet link).

# b) Own score + cosine similarity

Here the task is to create a new score, and it’s up to you to create a new one.


In [44]:
doc2extra_score = dict()

for index, row in df.iterrows():
    extra_score = 0.075 * (row["Likes"] + 2 * row["Retweets"]) / len(row["Hashtags"].split(","))
    doc2extra_score[row["DocID"]] = extra_score

In [45]:
def new_rank(query, tf_idf, vocabulary, doc2extra_score):
    N = len(tf_idf.keys())
    terms_query = build_terms(query).split()
    terms_q = list(set(terms_query))
    tf_idf_q = dict()
    
    for term_q in terms_q:
        f_iq = terms_query.count(term_q)
        if term_q not in vocabulary:
            continue
        df_i = len([doc for doc in vocabulary[term_q] if doc in tf_idf.keys()])
        if df_i>0:
            tf_idf_q[term_q] = (1+np.log(f_iq))*np.log(N/df_i)
        else:
            tf_idf_q[term_q]=0
    q_norm = np.linalg.norm(np.array(list(tf_idf_q.values())))
    doc2score = dict()
    
    doc2norm = find_doc2norm(tf_idf)
    
    for doc, dix in tf_idf.items():
        dot_product = 0
        for term, value in dix.items():
            if term in tf_idf_q.keys():
                dot_product += value * tf_idf_q[term]
        doc2score[doc] = dot_product / (doc2norm[doc] * q_norm) + doc2extra_score[doc] * 0.5
    doc2score = {t: v for t, v in sorted(doc2score.items(), key=lambda item: item[1], reverse=True)}
    return doc2score

In [46]:
doc2score_Q1 = new_rank(Q1, tf_idf, vocabulary, doc2extra_score)
top = 10
print(f"Top {top} docs for query 1 ('{Q1}'):")
for i in range(top):
    doc, score = list(doc2score_Q1.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 1 ('florida hurrican'):
	 1) doc_838 with a score of 73.575 and with the following information:
DocID                                                 doc_838
Tweet       search rescu team work around clock patrol are...
Username                                       GovRonDeSantis
Date                           Fri Sep 30 18:00:17 +0000 2022
Hashtags                                     ['HurricaneIan']
Likes                                                    1326
Retweets                                                  318
Url         https://twitter.com/GovRonDeSantis/status/1575...
Name: 837, dtype: object
	 2) doc_970 with a score of 56.40056599620156 and with the following information:
DocID                                                 doc_970
Tweet       201 republican vote keep govern open disast re...
Username                                             sfpelosi
Date                           Fri Sep 30 17:54:09 +0000 2022
Hashtags                      

In [47]:
doc2score_Q2 = new_rank(Q2, tf_idf, vocabulary, doc2extra_score)
top = 10
print(f"Top {top} docs for query 2 ('{Q2}'):")
for i in range(top):
    doc, score = list(doc2score_Q2.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 2 ('help people in florida'):
	 1) doc_838 with a score of 73.575 and with the following information:
DocID                                                 doc_838
Tweet       search rescu team work around clock patrol are...
Username                                       GovRonDeSantis
Date                           Fri Sep 30 18:00:17 +0000 2022
Hashtags                                     ['HurricaneIan']
Likes                                                    1326
Retweets                                                  318
Url         https://twitter.com/GovRonDeSantis/status/1575...
Name: 837, dtype: object
	 2) doc_970 with a score of 56.43292307906224 and with the following information:
DocID                                                 doc_970
Tweet       201 republican vote keep govern open disast re...
Username                                             sfpelosi
Date                           Fri Sep 30 17:54:09 +0000 2022
Hashtags                

In [48]:
doc2score_Q3 = new_rank(Q3, tf_idf, vocabulary, doc2extra_score)
top = 10
print(f"Top {top} docs for query 3 ('{Q3}'):")
for i in range(top):
    doc, score = list(doc2score_Q3.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 3 ('hurrican ian major damages'):
	 1) doc_838 with a score of 73.575 and with the following information:
DocID                                                 doc_838
Tweet       search rescu team work around clock patrol are...
Username                                       GovRonDeSantis
Date                           Fri Sep 30 18:00:17 +0000 2022
Hashtags                                     ['HurricaneIan']
Likes                                                    1326
Retweets                                                  318
Url         https://twitter.com/GovRonDeSantis/status/1575...
Name: 837, dtype: object
	 2) doc_970 with a score of 56.3625 and with the following information:
DocID                                                 doc_970
Tweet       201 republican vote keep govern open disast re...
Username                                             sfpelosi
Date                           Fri Sep 30 17:54:09 +0000 2022
Hashtags                      

In [49]:
doc2score_Q4 = new_rank(Q4, tf_idf, vocabulary, doc2extra_score)
top = 10
print(f"Top {top} docs for query 4 ('{Q4}'):")
for i in range(top):
    doc, score = list(doc2score_Q4.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 4 ('storm impact in Florida'):
	 1) doc_838 with a score of 73.575 and with the following information:
DocID                                                 doc_838
Tweet       search rescu team work around clock patrol are...
Username                                       GovRonDeSantis
Date                           Fri Sep 30 18:00:17 +0000 2022
Hashtags                                     ['HurricaneIan']
Likes                                                    1326
Retweets                                                  318
Url         https://twitter.com/GovRonDeSantis/status/1575...
Name: 837, dtype: object
	 2) doc_970 with a score of 56.38332018745341 and with the following information:
DocID                                                 doc_970
Tweet       201 republican vote keep govern open disast re...
Username                                             sfpelosi
Date                           Fri Sep 30 17:54:09 +0000 2022
Hashtags               

In [50]:
doc2score_Q5 = new_rank(Q5, tf_idf, vocabulary, doc2extra_score)
top = 10
print(f"Top {top} docs for query 5 ('{Q5}'):")
for i in range(top):
    doc, score = list(doc2score_Q5.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

Top 10 docs for query 5 ('floodings in the south'):
	 1) doc_838 with a score of 73.575 and with the following information:
DocID                                                 doc_838
Tweet       search rescu team work around clock patrol are...
Username                                       GovRonDeSantis
Date                           Fri Sep 30 18:00:17 +0000 2022
Hashtags                                     ['HurricaneIan']
Likes                                                    1326
Retweets                                                  318
Url         https://twitter.com/GovRonDeSantis/status/1575...
Name: 837, dtype: object
	 2) doc_970 with a score of 56.3625 and with the following information:
DocID                                                 doc_970
Tweet       201 republican vote keep govern open disast re...
Username                                             sfpelosi
Date                           Fri Sep 30 17:54:09 +0000 2022
Hashtags                          

# c) BM25

In [51]:
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
a = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}
print(a)

{3: 4, 4: 3, 1: 2, 2: 1, 0: 0}


In [52]:
def BM25(query, vocabulary, k1, b, L_ave, doc_contents):
    '''
    vocabulary = inverted index, dictionary with keys = terms an values = list of doc_ids
    k1 = value to regulate xx
    b = value to regulate yy
    L_ave = average length of docs in words
    doc_contents = dictionary where keys = doc_ids and values = list of terms (after text processing)
    '''
    
    RSV = dict()
    N = len(doc_contents)
    terms_q = list(set(build_terms(query).split()))
    idf = dict()
    
    # calculate idf for each term in the query 
    for t in terms_q:
        f_tq = terms_q.count(t)
        if t not in vocabulary:
            continue
        df_t = len(vocabulary[t])
        idf[t] = np.log(N/df_t)
        
    # calculate RSVd for each document  
    for doc in doc_contents.keys():
        RSV[doc] = 0
        Ld = len(doc_contents[doc])
        for t in idf.keys():
            tf_t_d = doc_contents[doc].count(t)

            second_term = ((k1+1)*tf_t_d) / (k1*((1-b)+b*(Ld/L_ave))+tf_t_d)
            RSV[doc] += idf[t]*second_term
            
    return {k: v for k, v in sorted(RSV.items(), key=lambda item: item[1], reverse=True)}
            

In [53]:
k1 = 1
b = 1
L_ave = np.mean([len(x.split()) for x in df.Tweet])
print(L_ave)
dictionary_doc = df.copy().drop(columns=['Username', 'Date', 'Hashtags', 'Likes', 'Retweets', 'Url'])
dictionary_doc = dictionary_doc.set_index('DocID').T.to_dict('list')
dictionary_doc = {k: x[0].split() for k, x in dictionary_doc.items()}
doc2score_Q5 = BM25(Q5, vocabulary, k1, b, L_ave, dictionary_doc)
top = 10
print(f"Top {top} docs for query ('{Q5}'):")
for i in range(top):
    doc, score = list(doc2score_Q5.items())[i]
    print(f"\t {i+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")

15.4405
Top 10 docs for query ('floodings in the south'):
	 1) doc_1289 with a score of 7.0806154188175645 and with the following information:
DocID                                                doc_1289
Tweet       flood garden citi south carolina gardenc south...
Username                                           skyflyer81
Date                           Fri Sep 30 17:33:51 +0000 2022
Hashtags      ['GardenCity', 'SouthCarolina', 'HurricaneIan']
Likes                                                       1
Retweets                                                    0
Url         https://twitter.com/skyflyer81/status/15759017...
Name: 1288, dtype: object
	 2) doc_2484 with a score of 6.79090713057397 and with the following information:
DocID                                                doc_2484
Tweet       flood trap mani florida ian head south carolin...
Username                                           bettycjung
Date                           Fri Sep 30 15:51:21 +0000 2022
Hash

# 2. Word2vec + cosine similarity

In [54]:
import gensim
from gensim.models import Word2Vec
from numpy import dot
from numpy.linalg import norm

In [55]:
# storing doc words in a list of lists 
docs_as_words = []
for k in dictionary_doc.keys():
    docs_as_words.append(dictionary_doc[k])

In [56]:
# training model for word2vec
model1 = gensim.models.Word2Vec(docs_as_words, min_count = 1,
                              vector_size = 100, window = 5)

In [57]:
# function to convert any text to a vector
def text2vec(text):

    #compute a vector for each word in the text  
    vectors = []
    for w in text:
        vector = model1.wv[w]
        vectors.append(vector)

    # sum all vectors previously computed
    doc_to_vec = np.zeros(len(vectors[0]))
    for v in vectors: 
        doc_to_vec += v

    # take average of previous sum
    text_2_vec = doc_to_vec/len(content) 
        
    return text_2_vec

In [58]:
# converting all tweets to vectors
tweets_to_vecs = {}
for doc, content in dictionary_doc.items():
    tweets_to_vecs[doc] = text2vec(content)

In [59]:
# processing queries
queries = [Q1, Q2, Q3, Q4, Q5]
processed_queries = [build_terms(q) for q in queries]

# removing words that don't appear in our vocabulary 
clean_queries = [[w for w in q.split(' ') if w in vocabulary.keys()] for q in queries]

# converting all queries to vectors
queries_to_vecs = {}
i=0
for q in clean_queries: 
    queries_to_vecs[f'Q{i}'] = text2vec(q)
    i+=1

In [60]:
def cosine_similarity(vec1, vec2):
    cos_sim = dot(vec1, vec2) / (norm(vec1) * norm(vec2))
    return cos_sim

In [61]:
# compute cos similarities for each (query, doc) pair and sort them 
cos_similarities = {}
for query, query_vec in queries_to_vecs.items():
    tweet_queries_sim = {}
    for tweet, tweet_vec in tweets_to_vecs.items():
        tweet_queries_sim[tweet] = cosine_similarity(tweet_vec, query_vec)
        
    tweet_queries_sim = {k: v for k, v in sorted(tweet_queries_sim.items(), key=lambda item: item[1], reverse=True)}
    cos_similarities[query] = tweet_queries_sim

In [62]:
top_20_docs = {}
for query, docs in cos_similarities.items(): 
    i=0
    top_20_docs[query] = []
    for doc in docs.keys():
        if i<20:
            top_20_docs[query].append(doc)
            i+=1
        else: 
            break 
    

In [63]:
print(top_20_docs)

{'Q0': ['doc_1855', 'doc_2317', 'doc_3572', 'doc_733', 'doc_1509', 'doc_485', 'doc_1373', 'doc_111', 'doc_850', 'doc_2409', 'doc_2769', 'doc_134', 'doc_1333', 'doc_2793', 'doc_1891', 'doc_1907', 'doc_464', 'doc_1844', 'doc_1544', 'doc_3493'], 'Q1': ['doc_2817', 'doc_2072', 'doc_3952', 'doc_3619', 'doc_3072', 'doc_263', 'doc_2238', 'doc_1572', 'doc_936', 'doc_1676', 'doc_887', 'doc_2778', 'doc_3964', 'doc_948', 'doc_2276', 'doc_2398', 'doc_2400', 'doc_2403', 'doc_2405', 'doc_2407'], 'Q2': ['doc_1217', 'doc_2140', 'doc_144', 'doc_1963', 'doc_634', 'doc_807', 'doc_640', 'doc_1907', 'doc_1716', 'doc_1933', 'doc_33', 'doc_63', 'doc_205', 'doc_495', 'doc_746', 'doc_1722', 'doc_2786', 'doc_464', 'doc_1719', 'doc_2637'], 'Q3': ['doc_446', 'doc_2045', 'doc_2070', 'doc_611', 'doc_3458', 'doc_2449', 'doc_3061', 'doc_680', 'doc_2326', 'doc_2304', 'doc_2342', 'doc_666', 'doc_3417', 'doc_3461', 'doc_3258', 'doc_923', 'doc_2373', 'doc_1088', 'doc_3459', 'doc_1812'], 'Q4': ['doc_1365', 'doc_3937', 'do

In [64]:
top = 20

i=0
for q in queries:
    print(f"Top {top} docs for query ('{q}'):")
    j=0
    for doc in top_20_docs[f'Q{i}']:
        score = cos_similarities[f'Q{i}'][doc]
        print(f"\t {j+1}) {doc} with a score of {score} and with the following information:\n{df[df.DocID == doc].iloc[0]}")
        j+=1
    i+=1

Top 20 docs for query ('florida hurrican'):
	 1) doc_1855 with a score of 0.999968676279338 and with the following information:
DocID                                                doc_1855
Tweet       hurricaneiancorps florida cemeteri expos caske...
Username                                         TwitWit00000
Date                           Fri Sep 30 16:39:46 +0000 2022
Hashtags                          ['hurricaneian', 'Florida']
Likes                                                       0
Retweets                                                    0
Url         https://twitter.com/TwitWit00000/status/157588...
Name: 1854, dtype: object
	 2) doc_2317 with a score of 0.9999665892411115 and with the following information:
DocID                                                doc_2317
Tweet       raw sewag swirl florida floodwat hurrican ian ...
Username                                                Canoe
Date                           Fri Sep 30 16:00:44 +0000 2022
Hashtags         