# Part 3: Ranking

## Imports

In [3]:
import pandas as pd
from collections import defaultdict
import numpy as np
import re
import nltk
import contractions
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot 

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\Josep
[nltk_data]     Alet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Josep
[nltk_data]     Alet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Josep
[nltk_data]     Alet\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
# Load the tweets preprocessed in the earlier parts and discard the index
# column that was written out alongside the data.
df = pd.read_csv('processed_tweets.csv')
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,DocID,Tweet,Username,Date,Hashtags,Likes,Retweets,Url
0,doc_1,keep spin u 7 pmgo away alreadi hurricaneian,suzjdean,Fri Sep 30 18:39:08 +0000 2022,['HurricaneIan'],0,0,https://twitter.com/suzjdean/status/1575918182...
1,doc_2,heart go affect hurricaneian wish everyon road...,lytx,Fri Sep 30 18:39:01 +0000 2022,['HurricaneIan'],0,0,https://twitter.com/lytx/status/15759181518623...
2,doc_3,kissimme neighborhood michigan ave hurricaneian,CHeathWFTV,Fri Sep 30 18:38:58 +0000 2022,['HurricaneIan'],0,0,https://twitter.com/CHeathWFTV/status/15759181...
3,doc_4,one tree backyard scare poltergeist tree storm...,spiralgypsy,Fri Sep 30 18:38:57 +0000 2022,"['scwx', 'HurricaneIan']",0,0,https://twitter.com/spiralgypsy/status/1575918...
4,doc_5,ashleyruizwx stephan89441722 lilmizzheidi mrsn...,Blondie610,Fri Sep 30 18:38:53 +0000 2022,['HurricaneIan'],0,0,https://twitter.com/Blondie610/status/15759181...


### Based on the results of Part 2, we concluded that the following queries could be used:

In [8]:
# Free-text test queries used to exercise the ranking functions below.
Q1 = "florida hurrican"
Q2 = "help people in florida"
Q3 = "hurrican ian major damages"
Q4 = "storm impact in Florida"
Q5 = "floodings in the south"

# a) Scoring using TF-IDF + Cosine Similarity 
Classical scoring, as we have also seen during the practical labs.

In [14]:
def extract_tweet_vocabulary(tweet, docId):
    """Map every distinct space-separated token of `tweet` to `docId`.

    Arguments:
    tweet -- string whose tokens are separated by single spaces
    docId -- identifier stored as the value for every token

    Returns:
    dict with one key per distinct token (in order of first appearance),
    each mapped to `docId`.
    """
    tokens = tweet.split(' ')
    return dict.fromkeys(tokens, docId)

In [15]:
def merge_dicts(dicts):
    """Merge several {term: value} dicts into one {term: [values]} dict.

    Arguments:
    dicts -- iterable of dictionaries (a one-shot iterator such as `map` works)

    Returns:
    A plain dict where each term maps to the list of values it had in the
    input dictionaries, in the order the dictionaries were visited.
    """
    merged = {}
    for single in dicts:
        for term, value in single.items():
            merged.setdefault(term, []).append(value)
    return merged

In [16]:
# Build the inverted index: one small {term: DocID} dict per tweet, merged
# into a single {term: [DocID, ...]} vocabulary.
tweets_dicts = (
    extract_tweet_vocabulary(tweet, doc_id)
    for tweet, doc_id in zip(df['Tweet'], df['DocID'])
)
vocabulary = merge_dicts(tweets_dicts)
print(f"Vocabulary has {len(vocabulary)} words")

Vocabulary has 10671 words


In [17]:
def build_TF_IDF(df, vocabulary):
    """Compute the TF-IDF weight of every (document, term) pair of the index.

    The weight is (1 + ln(tf)) * ln(N / df_i), where tf is the number of
    occurrences of the term in the whitespace-tokenised tweet, N the number
    of rows in `df` and df_i the length of the term's posting list.

    Arguments:
    df -- DataFrame with a `DocID` column and a `Tweet` column of strings
    vocabulary -- inverted index, dict {term: [DocID, ...]}

    Returns:
    dict {DocID: {term: weight}}. Every DocID of `df` is present (possibly
    with an empty dict). A term listed for a document in which it does not
    occur after tokenisation gets an explicit weight of 0.

    Raises:
    KeyError -- if a posting list references a DocID that is not in `df`.
    """
    N = len(df.DocID)

    # Index the raw tweets by DocID once, instead of filtering the whole
    # DataFrame for every (term, doc) pair. The first row wins when a DocID
    # is repeated.
    tweet_by_doc = {}
    for doc, tweet in zip(df.DocID, df.Tweet):
        tweet_by_doc.setdefault(doc, tweet)

    tokens_by_doc = {}  # filled lazily: only referenced documents are tokenised
    tf_idf = {doc: {} for doc in df.DocID}

    for term, postings in vocabulary.items():
        if not postings:
            continue  # nothing to score, and avoids dividing by df_i == 0
        idf = np.log(N / len(postings))  # invariant across the posting list
        for doc in postings:
            if doc not in tokens_by_doc:
                tokens_by_doc[doc] = tweet_by_doc[doc].split()
            tf = tokens_by_doc[doc].count(term)
            tf_idf[doc][term] = (1 + np.log(tf)) * idf if tf > 0 else 0
    return tf_idf

In [18]:
def find_doc2norm(tf_idf):
    """Return the Euclidean norm of each document's TF-IDF vector.

    Arguments:
    tf_idf -- dict {doc_id: {term: weight}}

    Returns:
    dict {doc_id: norm of that document's weights}; a document with no
    weights has norm 0.0.
    """
    return {
        doc_id: np.linalg.norm(np.array(list(weights.values())))
        for doc_id, weights in tf_idf.items()
    }

In [19]:
def find_term2sum(tf_idf, terms=None):
    """Sum the TF-IDF weight of every term over all documents.

    Arguments:
    tf_idf -- dict {doc_id: {term: weight}}
    terms -- optional iterable of terms that must appear in the result even
             if no document carries them (they get a sum of 0). When omitted,
             the terms are taken from `tf_idf` itself, so the function does
             not depend on any notebook-global variable.

    Returns:
    dict {term: summed weight}, ordered from the largest sum to the smallest.
    """
    term2sum = {term: 0 for term in terms} if terms is not None else {}
    for weights in tf_idf.values():
        for term, value in weights.items():
            # .get() keeps this safe for terms missing from the seed set.
            term2sum[term] = term2sum.get(term, 0) + value
    return dict(sorted(term2sum.items(), key=lambda item: item[1], reverse=True))

In [21]:
# Build the {DocID: {term: weight}} TF-IDF table for the whole collection.
# This must run before any cell that calls rank(..., tf_idf, ...).
tf_idf = build_TF_IDF(df, vocabulary)

In [10]:
# Per-document vector norms (rank() recomputes these itself from tf_idf).
doc2norm = find_doc2norm(tf_idf)
# Per-term TF-IDF totals over all documents, sorted in descending order.
term2sum = find_term2sum(tf_idf)

In [1]:
def build_terms(line):
    """
    Normalise a piece of text (tweet or query) into space-separated stems.

    Steps, in order: lowercase; drop every character that is not a letter,
    digit, space or apostrophe; expand contractions; drop apostrophes;
    remove empty tokens, English stop words and tokens starting with
    'https'; lemmatize and then stem each remaining token.

    Argument:
    line -- string to be preprocessed

    Returns:
    A single string with the processed tokens joined by one space
    (call .split() on it to obtain the token list).
    """
    cleaned = re.sub(r"[^A-Za-z 0-9 ']+", '', line.lower())  # strip emojis / special characters
    english_stop_words = set(stopwords.words("english"))

    # Expand verb abbreviations word by word (i'll -> i will), then drop the
    # apostrophes that are left over.
    expanded = ' '.join(contractions.fix(word) for word in cleaned.split(' '))
    expanded = re.sub("'", '', expanded)

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    terms = []
    for word in expanded.split(' '):
        if not word or word in english_stop_words:
            continue
        if word[0:5] == 'https':  # links
            continue
        # Singular form first (feet -> foot), then the family root (dancer -> danc).
        terms.append(stemmer.stem(lemmatizer.lemmatize(word)))

    return ' '.join(terms)

In [2]:
def rank(query, tf_idf, vocabulary):
    """Rank all documents against `query` by TF-IDF cosine similarity.

    Arguments:
    query -- raw query string; it is preprocessed with build_terms
    tf_idf -- dict {doc_id: {term: weight}} for the collection
    vocabulary -- inverted index, dict {term: [doc_id, ...]}

    Returns:
    dict {doc_id: cosine score}, ordered from the highest score to the
    lowest. A document scores 0.0 when the cosine is undefined, i.e. when
    the query has no weighted term or the document vector has zero norm.
    """
    N = len(tf_idf)
    terms_query = build_terms(query).split()

    # Query vector: same weighting scheme as the documents, restricted to the
    # terms known to the index.
    tf_idf_q = {}
    for term_q in set(terms_query):
        if term_q not in vocabulary:
            continue
        f_iq = terms_query.count(term_q)
        df_i = sum(1 for doc in vocabulary[term_q] if doc in tf_idf)
        if df_i > 0:
            tf_idf_q[term_q] = (1 + np.log(f_iq)) * np.log(N / df_i)
        else:
            tf_idf_q[term_q] = 0
    q_norm = np.linalg.norm(np.array(list(tf_idf_q.values())))

    doc2norm = find_doc2norm(tf_idf)

    doc2score = {}
    for doc, weights in tf_idf.items():
        denominator = doc2norm[doc] * q_norm
        if denominator == 0:
            # Dividing here would produce NaN, which also corrupts the sort.
            doc2score[doc] = 0.0
            continue
        dot_product = sum(
            value * tf_idf_q[term]
            for term, value in weights.items()
            if term in tf_idf_q
        )
        doc2score[doc] = dot_product / denominator
    return dict(sorted(doc2score.items(), key=lambda item: item[1], reverse=True))

In [9]:
# Requires `tf_idf` and `vocabulary` from the cells above: run the notebook
# top to bottom, otherwise this cell fails with a NameError.
doc2score_Q1 = rank(Q1, tf_idf, vocabulary)
top = 10
print(f"Top {top} docs for query 1:")
# Slice once: this avoids rebuilding the list on every iteration and does not
# fail when fewer than `top` documents were scored.
for doc, score in list(doc2score_Q1.items())[:top]:
    print(f"\t{doc} -> {score} -> {df[df.DocID == doc].Tweet.iloc[0]}")

NameError: name 'tf_idf' is not defined

# b) Own score + cosine similarity

Here the task is to define a new score of your own design and use it, together with cosine similarity, to rank the documents.


# c) BM25

In [9]:
# Quick check: rebuild a dict ordered by value, largest value first.
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
a = dict(sorted(x.items(), key=lambda pair: pair[1], reverse=True))
print(a)

{3: 4, 4: 3, 1: 2, 2: 1, 0: 0}


In [10]:
def BM25(query, vocabulary, k1, b, L_ave, doc_contents):
    '''
    Score every document against `query` with the BM25 retrieval status
    value (RSV):

        RSV_d = sum over query terms t of
                idf_t * ((k1 + 1) * tf_td) / (k1 * ((1 - b) + b * Ld / L_ave) + tf_td)

    with idf_t = ln(N / df_t).

    query = raw query string (preprocessed with build_terms)
    vocabulary = inverted index, dictionary with keys = terms and values = list of doc_ids
    k1 = term-frequency scaling: the larger it is, the more repeated
         occurrences of a term keep adding to the score
    b = document-length normalisation strength (0 = none, 1 = full)
    L_ave = average length of docs in words
    doc_contents = dictionary where keys = doc_ids and values = list of terms (after text processing)

    Returns a dictionary {doc_id: RSV} ordered from the highest score to the
    lowest. Query terms that are not in `vocabulary` are ignored.
    '''
    N = len(doc_contents)
    terms_q = set(build_terms(query).split())

    # calculate idf for each query term known to the index
    idf = dict()
    for t in terms_q:
        if t not in vocabulary:
            continue
        df_t = len(vocabulary[t])
        if df_t == 0:
            continue  # empty posting list: no document can match
        idf[t] = np.log(N / df_t)

    # calculate RSVd for each document
    RSV = dict()
    for doc, doc_terms in doc_contents.items():
        score = 0
        Ld = len(doc_terms)
        for t, idf_t in idf.items():
            tf_t_d = doc_terms.count(t)
            if tf_t_d == 0:
                # An absent term contributes nothing; skipping it also avoids
                # a 0/0 when k1 == 0 and any division on an empty collection.
                continue
            second_term = ((k1 + 1) * tf_t_d) / (k1 * ((1 - b) + b * (Ld / L_ave)) + tf_t_d)
            score += idf_t * second_term
        RSV[doc] = score

    return dict(sorted(RSV.items(), key=lambda item: item[1], reverse=True))
            