In [1]:
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/nasar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
#from numpy import ndarray as nda
import time
import pandas as pd
import json

In [3]:
docs_path = 'data/tweet_document_ids_map.csv'
with open(docs_path) as fp:
    lines = fp.readlines()
print("Total number of tweet documents in the corpus: {}".format(len(lines)))

Total number of tweet documents in the corpus: 4000


In [4]:
tweets_path = 'data/tw_hurricane_data.json'
with open(tweets_path) as fp:
    tweets = fp.readlines()
tweets = [t  for t in tweets if t != "\n"]
print("Total number of tweets in the corpus: {}".format(len(tweets)))

Total number of tweets in the corpus: 4000


In [5]:
tweets_json=[]
for t in tweets:
    tweets_json.append(json.loads(t))

In [6]:
lengths = set()
for c in tweets_json:
    entities = c["entities"]
    c["hashtags"] = []
    for hashtag in entities["hashtags"]:
        c["hashtags"].append(hashtag["text"])
    
    c["urls"] = [] # TODO:Where is the URL ???
    #c["urls"] = entities["urls"]
    c["username"] = c["user"]["name"]
    
columns = {
    "full_text":"Tweet",
    "username":"Username",
    "created_at":"Date",
    "hashtags":"Hashtags",
    "favorite_count":"Likes",
    "retweet_count":"Retweets",
    "urls":"Url"
}

dt_tweets = pd.DataFrame.from_dict(data=tweets_json)[columns.keys()].rename(columns=columns)
dt_tweets.head()

Unnamed: 0,Tweet,Username,Date,Hashtags,Likes,Retweets,Url
0,So this will keep spinning over us until 7 pm‚Ä¶...,Suzüëª,Fri Sep 30 18:39:08 +0000 2022,[HurricaneIan],0,0,[]
1,Our hearts go out to all those affected by #Hu...,Lytx,Fri Sep 30 18:39:01 +0000 2022,[HurricaneIan],0,0,[]
2,Kissimmee neighborhood off of Michigan Ave. \n...,Christopher Heath,Fri Sep 30 18:38:58 +0000 2022,[HurricaneIan],0,0,[]
3,I have this one tree in my backyard that scare...,alex ‚ú®,Fri Sep 30 18:38:57 +0000 2022,"[scwx, HurricaneIan]",0,0,[]
4,@AshleyRuizWx @Stephan89441722 @lilmizzheidi @...,Tess üíã,Fri Sep 30 18:38:53 +0000 2022,[HurricaneIan],0,0,[]


## HINT
Take into account that for future queries, the final output must return (when
present) the following information for each of the selected documents: Tweet |
Username | Date | Hashtags | Likes | Retweets | Url (here the ‚ÄúUrl‚Äù means the
tweet link).

## Marta: el url no he sapigut detectar quin era aix√≠ q de moment ho he deixat buit... I els hastags no estic del tot segura el q volem...

## Nasar: els hashtags he fet una llista de tots els hashtags, i la URL ni idea de on √©s.

### OJO: hay algo mal porque mas de la mitad de tweets (2412) no tienen su url...

In [7]:
def build_terms(line):
    """
    Preprocess the article text (title + body) removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    ## START CODE
    line = line.lower() ## Transform in lowercase
    line = line.replace('@', '')
    line = line.replace('#', '')
    line = line.replace('.', '')
    line = line.split() ## Tokenize the text to get a list of terms
    line = [x for x in line if x not in stop_words]  ##eliminate the stopwords (HINT: use List Comprehension)
    line = filter(lambda x:x[0:5]!='https', line)
    line = [stemmer.stem(x) for x in line] ## perform stemming (HINT: use List Comprehension)
    ## END CODE
    return line

In [8]:
tweets_processed = dt_tweets.copy()
for index, row in tweets_processed.iterrows():
    tweets_processed["Tweet"][index] = build_terms(row['Tweet'])
tweets_processed.head()

Unnamed: 0,Tweet,Username,Date,Hashtags,Likes,Retweets,Url
0,"[keep, spin, us, 7, pm‚Ä¶go, away, alreadi, hurr...",Suzüëª,Fri Sep 30 18:39:08 +0000 2022,[HurricaneIan],0,0,[]
1,"[heart, go, affect, hurricaneian, wish, everyo...",Lytx,Fri Sep 30 18:39:01 +0000 2022,[HurricaneIan],0,0,[]
2,"[kissimme, neighborhood, michigan, ave, hurric...",Christopher Heath,Fri Sep 30 18:38:58 +0000 2022,[HurricaneIan],0,0,[]
3,"[one, tree, backyard, scare, poltergeist, tree...",alex ‚ú®,Fri Sep 30 18:38:57 +0000 2022,"[scwx, HurricaneIan]",0,0,[]
4,"[ashleyruizwx, stephan89441722, lilmizzheidi, ...",Tess üíã,Fri Sep 30 18:38:53 +0000 2022,[HurricaneIan],0,0,[]


In [9]:
def create_index(lines):
    """
    Implement the inverted index
    
    Argument:
    lines -- collection of Wikipedia articles
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of documents where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)
    title_index = {}  # dictionary to map page titles to page ids
    for line in lines:  # Remember, lines contain all documents: article-id | article-title | article-body
        #line_arr = line.split("\")
        #print(line.split("\t"))
        line_arr=line.split("\t")[1]
        page_id = int(line_arr)
        terms = build_terms(''.join(line_arr[1:])) # page_title + page_text
        title = line_arr[1]
        title_index[page_id]=title  ## we do not need to apply get terms to title because it used only to print titles and not in the index
        
        ## ===============================================================        
        ## create the index for the current page and store it in current_page_index (current_page_index)
        ## current_page_index ==> { ‚Äòterm1‚Äô: [current_doc, [list of positions]], ...,‚Äòterm_n‚Äô: [current_doc, [list of positions]]}

        ## Example: if the curr_doc has id 1 and his text is 
        ##"web retrieval information retrieval":

        ## current_page_index ==> { ‚Äòweb‚Äô: [1, [0]], ‚Äòretrieval‚Äô: [1, [1,3]], ‚Äòinformation‚Äô: [1, [2]]}

        ## the term ‚Äòweb‚Äô appears in document 1 in positions 0, 
        ## the term ‚Äòretrieval‚Äô appears in document 1 in positions 1 and 4
        ## ===============================================================

        current_page_index = {}

        for position, term in enumerate(terms): # terms contains page_title + page_text. Loop over all terms
            try:
                # if the term is already in the index for the current page (current_page_index)
                # append the position to the corresponding list
                
        ## START CODE
                current_page_index[term][1].append(position)  
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term]=[page_id, array('I',[position])] #'I' indicates unsigned int (int in Python)
            
        #merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)
        
        ## END CODE                    
                    
    return index, title_index

In [10]:
start_time = time.time()
index, title_index = create_index(lines)
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2)))

Total time to create the index: 0.61 seconds
