In [1]:
# mount the drive for latter importing the datasets
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#**1. Import dictionaries**

In [2]:
# download nltk and the stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# import the dictionaries
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import json
import re
import csv

#**2. Load the datasets**

In [4]:
docs_path = 'drive/MyDrive/IRWA/Part_1:Text_Processing/Hurricane_Ian_Corpus/data/tw_hurricane_data.json'
tweets = []
# open the JSON file
with open(docs_path) as fp:
    for jsonObj in fp:
        tweetsDict = json.loads(jsonObj)
        tweets.append(tweetsDict) # add the tweets in our array tweets

In [5]:
docs_path_2 = 'drive/MyDrive/IRWA/Part_1:Text_Processing/Hurricane_Ian_Corpus/data/tweet_document_ids_map.csv'
doc_id = {}
# open the CSV file
with open(docs_path_2, newline='') as csvfile:
  spamreader = csv.reader(csvfile, delimiter=' ', quotechar=' ')
  for row in spamreader:
    doc_id[row[0].split()[1]] = row[0].split()[0] # add the doc number as an entry of our dictionary, having the tweet id as the key of this entry

#**3. Text Processing**

In [6]:
def build_terms(tweet):
    """
    Preprocess the text of the tweet by eliminating the url, the people labelled with the @,
    eliminating the punctuation, separating the words after the hashtag, removing stop words, 
    stemming, transforming in lowercase and returning the tokens of the text.
    
    Argument:
    tweet -- string (text) to be pre-processed
    
    Returns:
    tweet - a list of tokens corresponding to the input text after the pre-processing
    """

    stemmer = PorterStemmer() # stemm the words to get the root of the word and avoid having different words that mean the same
    stop_words = set(stopwords.words("english")) # eliminate all the stop words to make efficient queries and documents
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+') # separate the words without including puntuation marks
    
    tweet = re.sub(r'http\S+', '', tweet) ## delete the url
    tweet = re.sub(r'@\S+', '', tweet) ## delete the word after @ (so the people labelled)
    tweet = " ".join([a for a in re.split('([A-Z][a-z]+)', tweet) if a]) ## separate the hashtags in words according to the capital letters
    tweet = tweet.replace("_", " ") ## eliminate the _ (it is the only punctuation mark that is not deleted with tokenize)
    tweet = tweet.lower() ## transform in lowercase
    tweet = tokenizer.tokenize(tweet) ## tokenize the text to get a list of terms and remove punctuation marks
    tweet=[i for i in tweet if i not in stop_words]  ## eliminate the stopwords
    tweet=[stemmer.stem(i) for i in tweet] ## perform stemming

    return tweet

#**4. Build the inverted index and the ranking of tf-idf**

In [7]:
def create_index_tf_idf(tweets, num_tweets):
  """
  Create the inverted index, the tweets dictionary and compute the tf, df and idf

  Argument:
  tweets -- collection of tweets
  num_tweets -- total number of tweets
  
  Returns:
  index -- the inverted index. Contains the terms as keys and in which tweets (appear as the document number related to the tweet id)
  and in which position inside this tweet appears each term
  tf -- normalized term frequency for each term in each tweet
  df -- number of document each term appear in
  idf -- inverse document frequency of each term
  tweets-index -- the tweet's dictionary. Contains an entry for each tweet which key is the document number related with
  the tweet's id. Each tweet has its text, username, date, hashtags, number of likes, number of retweets and url if they exist 
  """
  index = defaultdict(list) # Create the inverted index
  tweets_index = defaultdict(str) # Create the tweets dictionary
  tf = defaultdict(list)  # Create the term frequency dictionary
  df = defaultdict(int)  # Create the tweet frequency dictionary
  idf = defaultdict(float) # Create the inverse tweet frequency dictionary
  
  for tweet in tweets:
    # for each tweet we create a dictionary containing the text, username, date, hashtags, number of likes, number of retweets and url if they exist
    tweet_dict = {}
    try:
      tweet_dict["text"] = tweet["full_text"]
    except:
      pass
    try:
      tweet_dict["username"] = tweet['user']['screen_name']
    except:
      pass
    try:
      tweet_dict["date"] = tweet["created_at"]
    except:
      pass
    try:
      tweet_dict["hashtags"] = []
      for i in range(0, len(tweet["entities"]["hashtags"])):
        tweet_dict["hashtags"].append(tweet["entities"]["hashtags"][i])
    except:
      pass
    try:
      tweet_dict["likes"] = tweet["favorite_count"]
    except:
      pass
    try:
      tweet_dict["retweets"] = tweet["retweet_count"]
    except:
      pass
    try:
      tweet_dict["url"] = tweet["entities"]["media"][0]["url"]
    except:
      pass

    terms = build_terms(tweet["full_text"]) # call build terms for processing the text of the tweet

    try:
      tweet_dict["normalised_text"] = terms  # save the processed terms in the dictionary
    except:
      pass

    tweets_index[doc_id[str(tweet["id"])]] = tweet_dict # save the tweet in tweets index by the document number related with the tweet id

    current_page_index = {}

    for position, term in enumerate(terms): # loop over all terms
        try:
            # if the term is already in the index for the current page append the position
            current_page_index[term][1].append(position)
        except:
            # else add the new term as dict key and set the document number corresponding to this tweet and the position where the term appears in this tweet
            current_page_index[term]=[doc_id[str(tweet["id"])], array('I',[position])] #'I' indicates unsigned int (int in Python)

    # calculate the denominator to normalize term frequencies for not letting the size of the document matters the ranking
    norm = 0
    for term, posting in current_page_index.items():
        norm += len(posting[1]) ** 2
    norm = math.sqrt(norm)

    # compute the tf (so the frequency of a term in the document) and the df (number of documents containing a certain term)
    for term, posting in current_page_index.items():
        tf[term].append(np.round(len(posting[1])/norm,4))
        df[term] += 1
    
    # compute the idf (for giving more importance to the terms that appear in less documents)
    for term in df:
        idf[term] = np.round(np.log(float(num_tweets/df[term])), 4)

    # merge the current page index with the main index
    for term_page, posting_page in current_page_index.items():
        index[term_page].append(posting_page)

  return index, tf, df, idf, tweets_index

In [8]:
start_time = time.time() # we save the current time for calculating how much time will take to run our create_index_tfidf() function
num_tweets = len(tweets)
index, tf, df, idf, tweets_index = create_index_tf_idf(tweets, num_tweets) # we call the function
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2))) # we calculate the current time less the start time for getting the total time that our function lasted

Total time to create the index: 213.04 seconds


In [9]:
def rank_documents_tf_idf(terms, docs, index, idf, tf, tweets_index):
    """
    Rank the results of a query based on the tf-idf weights that we have previously calculated
    
    Argument:
    terms -- list of terms
    tweets -- list of tweets to rank matching the query
    index -- inverted index
    idf -- inverted document frequencies
    tf -- term frequencies
    tweets_index -- mapping between document number and tweet id
    
    Returns:
    result_docs -- list in order of ranked documents
    """

    # We are only interested on the elements of the docVector corresponding to the query term, due to this, the others will become 0 once that they are multiplied by the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 

    query_norm = la.norm(list(query_terms_count.values())) # compute the norm for the query tf

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf
        query_vector[termIndex]=query_terms_count[term]/query_norm * idf[term]

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):      
            if doc in docs: 
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    # Calculate the score of each doc 
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True) # sort the doc_scores
    
    result_docs = [x[1] for x in doc_scores]

    # print no results found if no results are found, ask to write another query and call again the search_docs() function
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_docs(query, index, "tf_idf")
    #print ('\n'.join(result_docs), '\n')
    return result_docs

#**5. Build the ranking of our own model**

In [10]:
def create_our_own_model(tweets):
  """
  Create the model that we will use as a second ranking

  Argument:
  tweets -- collection of tweets
  
  Returns:
  own_model -- a dictionary with the score based on the number of tweets, retweets and followers that each document has and that will be used for ranking queries
  """
  own_model = {}
  max_value = 0

  for tweet in tweets:
    # calculate the score for this ranking for each document. We have decided to use like and retweet as recommended. Also, we have decided
    # to give less importance to the number of likes, as for a person it is easier to give a like than a retweet
    own_model[doc_id[str(tweet["id"])]] = tweet["retweet_count"] + 0.5 * tweet["favorite_count"]

    if max_value < tweet["retweet_count"] + 1/2 * tweet["favorite_count"]:
      max_value = tweet["retweet_count"] + 1/2 * tweet["favorite_count"] # keep the max value
  
  for doc in own_model:
    own_model[doc] = own_model[doc]/max_value # divide each score by the max value for having a normalised final score between 0 and 1

  return own_model

In [11]:
own_model = create_our_own_model(tweets) # we call the function and receive our weights

In [12]:
def rank_documents_own_model(terms, docs, index, tf, idf, own_model, tweets_index):
    """
    Rank the results of a query based on the weights of our model and of tf-idf that we have previously calculated
    
    Argument:
    terms -- list of terms
    docs -- list of tweets to rank matching the query
    index -- inverted index
    tf -- term frequencies
    idf -- inverted document frequencies
    own_model -- the weights of our model
    tweets_index -- mapping between document number and tweet id
    
    Returns:
    result_docs -- list in order of ranked documents
    """

    # We are only interested on the elements of the docVector corresponding to the query term, due to this, the others will become 0 once that they are multiplied by the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 

    query_norm = la.norm(list(query_terms_count.values())) # compute the norm for the query tf

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf
        query_vector[termIndex]=query_terms_count[term]/query_norm * idf[term]

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):      
            if doc in docs: 
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    # Calculate the score of each doc 
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ] 

    doc_scores.sort(reverse=True) # sort the doc_scores
    
    # add to the tf-idf score the score of our model being a number between 0 and half of the higher value of the tf-idf score (so we are giving more importance to the tf-idf score)
    for i in range(0, len(doc_scores)):
      doc_scores[i][0] += 0.5 * own_model[doc_scores[i][1]] * doc_scores[0][0]

    doc_scores.sort(reverse=True) # sort again the doc_scores
    
    result_docs = [x[1] for x in doc_scores]

    # print no results found if no results are found, ask to write another query and call again the search_docs() function
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_docs(query, index, "own_model")
    #print ('\n'.join(result_docs), '\n')
    return result_docs

#**6. Build the ranking of BM25 model**

In [13]:
counter = 0
average_length = 0

# iterate over all tweets
for doc_name in tweets_index:
  average_length += len(tweets_index[doc_name]["normalised_text"]) # sum the length of each tweet
  counter += 1

average_length = average_length/counter # calculate the average size of all tweets
print("The average length of all tweets is:", average_length)

The average length of all tweets is: 16.94225


In [14]:
def rank_documents_bm25(terms, docs, index, idf, tf, tweets_index, average_length, k1=1.2, b=0.75): # we use k1=1.2 and b=0.75 as experts say that those values work better for k1 and b
    """
    Rank the results of a query based on BM25
    
    Argument:
    terms -- list of terms
    docs -- list of tweets to rank matching the query
    index -- inverted index
    idf -- inverted document frequencies
    tf -- term frequencies
    tweets_index -- mapping between document number and tweet id
    average_length -- average length of all tweets
    k1 --  tuning parameter controlling the document term frequency scaling
    b -- tuning parameter controlling the scaling by document length
    
    Returns:
    result_docs -- list in order of ranked documents
    """

    # create the doc_vectors dictionary
    doc_vectors = {}

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        # generate bm25 for each term of the query
        for doc_index, (doc, postings) in enumerate(index[term]):      
            if doc in docs:
              # we use the short version of bm25 as the tweets are short text
              try:
                # use this if doc_vectors already has a value (so if we have already calculated the bm25 score for at least 1 term of the query)
                doc_vectors[doc] += float(idf[term] * (((k1 + 1) * tf[term][doc_index])/(k1 * ((1 - b) + b * (len(tweets_index[doc]["normalised_text"])/average_length)) + tf[term][doc_index])))
              except:
                # use this if doc_vectors is empty (so if we do not have calculated the bm25 score for any terms of the query yet)
                doc_vectors[doc] = float(idf[term] * (((k1 + 1) * tf[term][doc_index])/(k1 * ((1 - b) + b * (len(tweets_index[doc]["normalised_text"])/average_length)) + tf[term][doc_index])))

    # save the result as the weight and the doc number 
    doc_scores=[[doc_vectors[doc], doc] for doc in doc_vectors ]
    doc_scores.sort(reverse=True) # sort the doc_scores
    
    result_docs = [x[1] for x in doc_scores]

    # print no results found if no results are found, ask to write another query and call again the search_docs() function
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_docs(query, index, "bm25")
    #print ('\n'.join(result_docs), '\n')
    return result_docs

#**7. Build the ranking of Word2Vec model**

In [15]:
def create_word_2_vec_model(tweets_index):
  """
  Create the word2vec model

  Argument:
  tweets_index -- dictionary containing information of tweets
  
  Returns:
  word2vec_model -- word2vec model
  """

  terms_of_tweets = []

  # create an array containing all the tweets text pre-processed
  for i in tweets_index:
    terms_of_tweets.append(tweets_index[i]["normalised_text"])

  word2vec_model = Word2Vec(terms_of_tweets, workers=4, size=100, min_count=50, window=10, sample=1e-3) # create the word2vec model from that array

  return word2vec_model

In [16]:
word2vec_model = create_word_2_vec_model(tweets_index) # call the function and receive the word2vec model 



In [17]:
def rank_documents_word_2_vec(terms, docs, index, word2vec_model, tweets_index):
    """
    Rank the results of a query based on the word2vec weights
    
    Argument:
    terms -- list of terms
    docs -- list of tweets to rank matching the query
    index -- inverted index
    word2vec_model -- word2vec model containing the weights of each term
    tweets_index -- mapping between document number and tweet id
    
    Returns:
    result_docs -- list in order of ranked documents
    """

    # create the doc_vectors dictionary
    doc_vectors = {}

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        # calculate the average of the word2vec model for our query
        try:
          query_vector[0]
          query_vector_2 = word2vec_model.wv[term]

          # if query vector is not empty (so if we have already saved in query_vector the weight for some word2vec value) we sum each value of the vector plus the 
          # value of the same position of the vector that is already in query_vector
          for i in range(0, len(query_vector)):
            query_vector[i] += query_vector_2[i]

        except:
          # otherwise we just save the array in query vector converting it with np.array() as otherwise it cannot be edited
          query_vector = np.array(word2vec_model.wv[term])

        # we repeat the same as with the query for each document
        for doc_index, (doc, postings) in enumerate(index[term]):      
            if doc in docs:
              for normalised_term in tweets_index[doc]["normalised_text"]:
                try:
                  doc_vectors[doc]
                  doc_vectors_2 = word2vec_model.wv[term]

                  for i in range(0, len(doc_vectors)):
                    doc_vectors[doc][i] += doc_vectors_2[i]

                except:
                  doc_vectors[doc] = np.array(word2vec_model.wv[term])

    # normalize each value of the weight by dividing by the total number of terms
    for i in range(0, len(query_vector)):
      query_vector[i] = query_vector[i] / len(terms)

    # normalize each value of the weight by dividing by the total number of terms
    for doc in doc_vectors:
      for j in range(0, len(doc_vectors[doc])):
        doc_vectors[doc][j] = doc_vectors[doc][j] / len(tweets_index[doc]["normalised_text"])
    
    # Calculate the score of each doc by applying the dot product
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()] 

    doc_scores.sort(reverse=True) # sort the doc_scores

    result_docs = [x[1] for x in doc_scores]

    # print no results found if no results are found, ask to write another query and call again the search_docs() function
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_docs(query, index, "word2vec")
    #print ('\n'.join(result_docs), '\n')
    return result_docs

#**8. Create the function for selecting the top 20 docs based on our rankings**

In [18]:
def search_docs(query, index, ranking_method):
    """
    Obtain the list of ranked documents based on the query
    
    Argument:
    query -- list of terms that we want to search
    index -- inverted index
    ranking_method -- ranking method that will be used for getting the top documents based on a query
    
    Returns:
    ranked_docs -- list in order of ranked documents
    """
    query = build_terms(query) # normalize the query
    docs = set() # create an empty set where we will store the ordered docs for each word in the query
    counter = 0 # create a counter for distinguishing between the first term and the rest of them
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain the current term                    
            term_docs=[posting[0] for posting in index[term]]
            
            # keep the documents that have all terms of the query
            if counter == 0:
              # we have to distinguish for the first term, because otherwise, the intersection between a empty set and another set is always an empty set
              docs = docs.union(term_docs)
              counter += 1
            else:
              docs = docs.intersection(term_docs)
        except:
            # pass if the term is not in the index
            pass
    docs = list(docs)

    # use the correspondent ranking method for ranking only the docs that keep in docs, so only the docs that contain all the terms in the query
    if ranking_method == "tf_idf":
      ranked_docs = rank_documents_tf_idf(query, docs, index, idf, tf, tweets_index)
    elif ranking_method == "own_model":
      ranked_docs = rank_documents_own_model(query, docs, index, tf, idf, own_model, tweets_index)
    elif ranking_method == "bm25":
      ranked_docs = rank_documents_bm25(query, docs, index, idf, tf, tweets_index, average_length)
    else:
      ranked_docs = rank_documents_word_2_vec(query, docs, index, word2vec_model, tweets_index)

    return ranked_docs # return the ranked docs

#**9. Select the queries to use**

In [19]:
maximum = 0
maximum_2 = 0
maximum_3 = 0
maximum_4 = 0
max_word_tf = ""
second_max_word_tf = ""
third_max_word_tf = ""
fourth_max_word_tf = ""
# find the four documents with higher tf
for i in tf:
  if i != "hurricaneian": # do not have into account the term hurricaneian
    if sum(tf[i]) > maximum:
      maximum_4 = maximum_3
      fourth_max_word_tf = third_max_word_tf
      maximum_3 = maximum_2
      third_max_word_tf = second_max_word_tf
      maximum_2 = maximum
      second_max_word_tf = max_word_tf
      maximum = sum(tf[i])
      max_word_tf = i

    elif sum(tf[i]) > maximum_2:
      maximum_4 = maximum_3
      fourth_max_word_tf = third_max_word_tf
      maximum_3 = maximum_2
      third_max_word_tf = second_max_word_tf
      maximum_2 = sum(tf[i])
      second_max_word_tf = i

    elif (sum(tf[i]) > maximum_3):
      maximum_4 = maximum_3
      fourth_max_word_tf = third_max_word_tf
      maximum_3 = sum(tf[i])
      third_max_word_tf = i

    elif (sum(tf[i]) > maximum_4):
      maximum_4 = sum(tf[i])
      fourth_max_word_tf = i

# print the results
print(max_word_tf)
print(second_max_word_tf)
print(third_max_word_tf)
print(fourth_max_word_tf)

hurrican
ian
florida
carolina


#**10. Print the top 20 ranked documents for a given query**

In [20]:
rankings = ["tf_idf", "own_model", "bm25", "word2vec"] # ranking models

In [21]:
query = max_word_tf + " " + second_max_word_tf

for ranking in rankings: # for each ranking
  ranked_docs = search_docs(query, index, ranking) # we call search_docs() for obtaining the ranked documents containing all the terms in that query
  top = 20

  # We print the top 20 results for each ranking method
  print("\n======================\nTop {} results out of {} for the searched query {} with the ranking method {}:\n".format(top, len(ranked_docs), query, ranking))
  for d_id in ranked_docs[:top]:
      try:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}, URL: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"], tweets_index[d_id]["url"]))
      except:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"])) # the tweet 1217 does not have URL


Top 20 results out of 3655 for the searched query hurrican ian with the ranking method tf_idf:

doc_id= doc_985, with tweet: Is that you #HurricaneIan? https://t.co/5K6V610Ivf, username: Rozrunsabit, date: Fri Sep 30 17:53:07 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [12, 25]}], likes: 2, retweets: 0, URL: https://t.co/5K6V610Ivf
doc_id= doc_640, with tweet: Hurricane Ian before and after #HurricaneIan https://t.co/XZstkI2pN2, username: kadenfields8, date: Fri Sep 30 18:07:50 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [31, 44]}], likes: 2, retweets: 0, URL: https://t.co/XZstkI2pN2
doc_id= doc_2597, with tweet: That's how it is 😂
#HurricaneIan 🌀 https://t.co/9BSWTb2Bwf, username: ReasonsImBroke, date: Fri Sep 30 15:45:20 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [19, 32]}], likes: 0, retweets: 0, URL: https://t.co/9BSWTb2Bwf
doc_id= doc_1992, with tweet: Before + After. #HurricaneIan https://t.co/kYIGeNTeD2, username: MarsRoverMapper, d

In [22]:
query = max_word_tf + " " + third_max_word_tf
for ranking in rankings: # for each ranking
  ranked_docs = search_docs(query, index, ranking) # we call search_docs() for obtaining the ranked documents containing all the terms in that query
  top = 20

  # We print the top 20 results for each ranking method
  print("\n======================\nTop {} results out of {} for the searched query {} with the ranking method {}:\n".format(top, len(ranked_docs), query, ranking))
  for d_id in ranked_docs[:top]:
      try:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}, URL: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"], tweets_index[d_id]["url"]))
      except:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"])) # the tweet 1217 does not have URL


Top 20 results out of 892 for the searched query hurrican florida with the ranking method tf_idf:

doc_id= doc_3949, with tweet: My thoughts 💭🙏🏻🙏🏻🙏🏻 with #Florida after the Hugs #HurricaneIan ... #PrayingForFlorida https://t.co/8wwPQzZ207, username: GARVEY66, date: Fri Sep 30 14:35:50 +0000 2022, hashtags: [{'text': 'Florida', 'indices': [25, 33]}, {'text': 'HurricaneIan', 'indices': [49, 62]}, {'text': 'PrayingForFlorida', 'indices': [67, 85]}], likes: 0, retweets: 0
doc_id= doc_3296, with tweet: This Florida flamboyance of flamingos is safe!

https://t.co/puQNCkDooz

#HurricaneIan #Florida, username: nataliealund, date: Fri Sep 30 15:10:08 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [73, 86]}, {'text': 'Florida', 'indices': [87, 95]}], likes: 3, retweets: 0
doc_id= doc_3944, with tweet: No force can be stronger than force of nature #HurricaneIan 
Prayers for South-west Florida!
#Florida 
#FloridaStrong 
@FLGuard @ActionNewsJax @FoxNews @wjxt4 @cnnbrk @BBCWorld 
@ManuG

In [23]:
query = max_word_tf + " " + fourth_max_word_tf
for ranking in rankings: # for each ranking
  ranked_docs = search_docs(query, index, ranking) # we call search_docs() for obtaining the ranked documents containing all the terms in that query
  top = 20

  # We print the top 20 results for each ranking method
  print("\n======================\nTop {} results out of {} for the searched query {} with the ranking method {}:\n".format(top, len(ranked_docs), query, ranking))
  for d_id in ranked_docs[:top]:
      try:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}, URL: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"], tweets_index[d_id]["url"]))
      except:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"])) # the tweet 1217 does not have URL


Top 20 results out of 412 for the searched query hurrican carolina with the ranking method tf_idf:

doc_id= doc_1335, with tweet: WATCH: After ravaging Florida, a strengthened Hurricane Ian has made landfall in South Carolina and is predicted to enter North Carolina soon.  #Florida #NorthCarolina #SouthCarolina #USA #HurricaneIan #Environment https://t.co/LIAALDsxeP, username: BNNUS, date: Fri Sep 30 17:31:03 +0000 2022, hashtags: [{'text': 'Florida', 'indices': [144, 152]}, {'text': 'NorthCarolina', 'indices': [153, 167]}, {'text': 'SouthCarolina', 'indices': [168, 182]}, {'text': 'USA', 'indices': [183, 187]}, {'text': 'HurricaneIan', 'indices': [188, 201]}, {'text': 'Environment', 'indices': [202, 214]}], likes: 31, retweets: 21, URL: https://t.co/LIAALDsxeP
doc_id= doc_1444, with tweet: #HurricaneIan nears landfall in South Carolina - Will impact both Carolinas and Virginia, reports Karen Graham. https://t.co/qttgs1Stbf, username: digitaljournal, date: Fri Sep 30 17:20:16 +0000 20

In [24]:
query = second_max_word_tf + " " + third_max_word_tf
for ranking in rankings: # for each ranking
  ranked_docs = search_docs(query, index, ranking) # we call search_docs() for obtaining the ranked documents containing all the terms in that query
  top = 20

  # We print the top 20 results for each ranking method
  print("\n======================\nTop {} results out of {} for the searched query {} with the ranking method {}:\n".format(top, len(ranked_docs), query, ranking))
  for d_id in ranked_docs[:top]:
      try:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}, URL: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"], tweets_index[d_id]["url"]))
      except:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"])) # the tweet 1217 does not have URL


Top 20 results out of 886 for the searched query ian florida with the ranking method tf_idf:

doc_id= doc_3949, with tweet: My thoughts 💭🙏🏻🙏🏻🙏🏻 with #Florida after the Hugs #HurricaneIan ... #PrayingForFlorida https://t.co/8wwPQzZ207, username: GARVEY66, date: Fri Sep 30 14:35:50 +0000 2022, hashtags: [{'text': 'Florida', 'indices': [25, 33]}, {'text': 'HurricaneIan', 'indices': [49, 62]}, {'text': 'PrayingForFlorida', 'indices': [67, 85]}], likes: 0, retweets: 0
doc_id= doc_3296, with tweet: This Florida flamboyance of flamingos is safe!

https://t.co/puQNCkDooz

#HurricaneIan #Florida, username: nataliealund, date: Fri Sep 30 15:10:08 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [73, 86]}, {'text': 'Florida', 'indices': [87, 95]}], likes: 3, retweets: 0
doc_id= doc_3944, with tweet: No force can be stronger than force of nature #HurricaneIan 
Prayers for South-west Florida!
#Florida 
#FloridaStrong 
@FLGuard @ActionNewsJax @FoxNews @wjxt4 @cnnbrk @BBCWorld 
@ManuGulati

In [25]:
query = second_max_word_tf + " " + fourth_max_word_tf
for ranking in rankings: # for each ranking
  ranked_docs = search_docs(query, index, ranking) # we call search_docs() for obtaining the ranked documents containing all the terms in that query
  top = 20

  # We print the top 20 results for each ranking method
  print("\n======================\nTop {} results out of {} for the searched query {} with the ranking method {}:\n".format(top, len(ranked_docs), query, ranking))
  for d_id in ranked_docs[:top]:
      try:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}, URL: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"], tweets_index[d_id]["url"]))
      except:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"])) # the tweet 1217 does not have URL


Top 20 results out of 409 for the searched query ian carolina with the ranking method tf_idf:

doc_id= doc_1335, with tweet: WATCH: After ravaging Florida, a strengthened Hurricane Ian has made landfall in South Carolina and is predicted to enter North Carolina soon.  #Florida #NorthCarolina #SouthCarolina #USA #HurricaneIan #Environment https://t.co/LIAALDsxeP, username: BNNUS, date: Fri Sep 30 17:31:03 +0000 2022, hashtags: [{'text': 'Florida', 'indices': [144, 152]}, {'text': 'NorthCarolina', 'indices': [153, 167]}, {'text': 'SouthCarolina', 'indices': [168, 182]}, {'text': 'USA', 'indices': [183, 187]}, {'text': 'HurricaneIan', 'indices': [188, 201]}, {'text': 'Environment', 'indices': [202, 214]}], likes: 31, retweets: 21, URL: https://t.co/LIAALDsxeP
doc_id= doc_1444, with tweet: #HurricaneIan nears landfall in South Carolina - Will impact both Carolinas and Virginia, reports Karen Graham. https://t.co/qttgs1Stbf, username: digitaljournal, date: Fri Sep 30 17:20:16 +0000 2022, h

In [26]:
query = "Landfall in South Carolina"
for ranking in rankings: # for each ranking
  ranked_docs = search_docs(query, index, ranking) # we call search_docs() for obtaining the ranked documents containing all the terms in that query
  top = 20

  # We print the top 20 results for each ranking method
  print("\n======================\nTop {} results out of {} for the searched query {} with the ranking method {}:\n".format(top, len(ranked_docs), query, ranking))
  for d_id in ranked_docs[:top]:
      try:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}, URL: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"], tweets_index[d_id]["url"]))
      except:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"])) # the tweet 1217 does not have URL


Top 20 results out of 101 for the searched query Landfall in South Carolina with the ranking method tf_idf:

doc_id= doc_2505, with tweet: ⚠️ #HurricaneIan about to make landfall on South Carolina! https://t.co/X61NAwpsx6, username: JoshFitzWx, date: Fri Sep 30 15:50:13 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [3, 16]}], likes: 2, retweets: 1, URL: https://t.co/X61NAwpsx6
doc_id= doc_249, with tweet: #HurricaneIan makes landfall in South Carolina... https://t.co/vKik1OyOgQ, username: sfdb, date: Fri Sep 30 18:26:19 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [0, 13]}], likes: 1, retweets: 0
doc_id= doc_1492, with tweet: #HurricaneIan is now making #landfall into #SouthCarolina Coastline! https://t.co/yoTfc6Hr5U, username: WeatherEastern, date: Fri Sep 30 17:14:38 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [0, 13]}, {'text': 'landfall', 'indices': [28, 37]}, {'text': 'SouthCarolina', 'indices': [43, 57]}], likes: 1, retweets: 1, URL: htt

In [27]:
query = "Help and recovery during the hurricane disaster"
for ranking in rankings: # for each ranking
  ranked_docs = search_docs(query, index, ranking) # we call search_docs() for obtaining the ranked documents containing all the terms in that query
  top = 20

  # We print the top 20 results for each ranking method
  print("\n======================\nTop {} results out of {} for the searched query {} with the ranking method {}:\n".format(top, len(ranked_docs), query, ranking))
  for d_id in ranked_docs[:top]:
      try:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}, URL: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"], tweets_index[d_id]["url"]))
      except:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"])) # the tweet 1217 does not have URL


Top 20 results out of 8 for the searched query Help and recovery during the hurricane disaster with the ranking method tf_idf:

doc_id= doc_3560, with tweet: Join @MLS and @RedCross to help people affected by #HurricaneIan. Your donation will help emergency preparedness, response, and recovery from this disaster: https://t.co/4mohwOmeF0 https://t.co/BUC7TeF5Fq, username: MLSWORKS, date: Fri Sep 30 14:57:38 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [51, 64]}], likes: 7, retweets: 1, URL: https://t.co/BUC7TeF5Fq
doc_id= doc_23, with tweet: .@HouseDemocrats just passed a new funding bill to provide help for #HurricaneIan victims:

💵 Up to $18.8 billion in @fema Disaster Relief Fund authority to help Florida and Puerto Rico recover
💵 $2 billion for the Community Development Block Grant Disaster Recovery program https://t.co/NqPpa1InTq, username: RepDarrenSoto, date: Fri Sep 30 18:37:52 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [68, 81]}], likes: 2, retwee

In [28]:
query = "Floodings in South Carolina"
for ranking in rankings: # for each ranking
  ranked_docs = search_docs(query, index, ranking) # we call search_docs() for obtaining the ranked documents containing all the terms in that query
  top = 20

  # We print the top 20 results for each ranking method
  print("\n======================\nTop {} results out of {} for the searched query {} with the ranking method {}:\n".format(top, len(ranked_docs), query, ranking))
  for d_id in ranked_docs[:top]:
      try:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}, URL: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"], tweets_index[d_id]["url"]))
      except:
        print("doc_id= {}, with tweet: {}, username: {}, date: {}, hashtags: {}, likes: {}, retweets: {}".format(d_id, tweets_index[d_id]["text"], tweets_index[d_id]["username"], tweets_index[d_id]["date"], tweets_index[d_id]["hashtags"], tweets_index[d_id]["likes"], tweets_index[d_id]["retweets"])) # the tweet 1217 does not have URL


Top 20 results out of 27 for the searched query Floodings in South Carolina with the ranking method tf_idf:

doc_id= doc_1289, with tweet: Flooding in Garden City, South Carolina 

#GardenCity #SouthCarolina #HurricaneIan https://t.co/5W328kjWkH, username: skyflyer81, date: Fri Sep 30 17:33:51 +0000 2022, hashtags: [{'text': 'GardenCity', 'indices': [42, 53]}, {'text': 'SouthCarolina', 'indices': [54, 68]}, {'text': 'HurricaneIan', 'indices': [69, 82]}], likes: 1, retweets: 0, URL: https://t.co/5W328kjWkH
doc_id= doc_65, with tweet: South Carolina officials has a message to their citizens. 

Those in the path of #HurricaneIan are to get off and stay off the roads until the storm has past.   Most coastal towns are experiencing flooding.

#Charleston #Georgetown #SouthCarolina #Ian, username: Jasamsdestiny, date: Fri Sep 30 18:35:06 +0000 2022, hashtags: [{'text': 'HurricaneIan', 'indices': [81, 94]}, {'text': 'Charleston', 'indices': [208, 219]}, {'text': 'Georgetown', 'indices': [220,