In [1]:
# Mount Google Drive so that the datasets stored there can be loaded later
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#**1. Import libraries**

In [2]:
# Import NLTK and download its stop-word corpus (read later through nltk.corpus.stopwords)
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# import the rest of the libraries
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time

# We have added these libraries to the ones that were included in lab 1
import json
import re
import csv

#**2. Load the datasets**

In [4]:
docs_path = 'drive/MyDrive/IRWA/Part_1:Text_Processing/Hurricane_Ian_Corpus/data/tw_hurricane_data.json'
# The file is read line by line: every non-empty line is parsed as one JSON document (one tweet).
# The encoding is stated explicitly because the tweet texts contain emoji and typographic
# punctuation; relying on the platform default could mis-decode them outside Colab.
with open(docs_path, encoding='utf-8') as fp:
    # blank lines are skipped because json.loads() rejects an empty document
    tweets = [json.loads(line) for line in fp if line.strip()]

In [5]:
docs_path_2 = 'drive/MyDrive/IRWA/Part_1:Text_Processing/Hurricane_Ian_Corpus/data/tweet_document_ids_map.csv'
doc_id = {}  # maps tweet id (string) -> document number
# Every non-empty line holds two whitespace-separated fields: the document number first,
# the tweet id second. The line is split on whitespace directly rather than through
# csv.reader: the fields are not quoted, and a csv dialect that uses the same character as
# delimiter and quotechar is rejected by newer Python releases.
with open(docs_path_2, newline='', encoding='utf-8') as csvfile:
  for line in csvfile:
    fields = line.split()
    if not fields:
      continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
    # a line with a single field still raises IndexError, so malformed rows stay visible
    doc_number, tweet_id = fields[0], fields[1]
    doc_id[tweet_id] = doc_number  # the tweet id is the key, the document number the value

#**3. Text Processing**

In [6]:
_TERM_TOOLS = {}  # lazily-filled cache of the NLTK helpers shared by every build_terms() call


def _get_term_tools():
    """
    Return the (stemmer, stop_words, tokenizer) triple used by build_terms.

    The objects are created on the first call only and reused afterwards, so the stop-word
    list is not reloaded and the stemmer/tokenizer are not rebuilt for every single tweet.
    """
    if not _TERM_TOOLS:
        # the dict literal is fully built before the cache is touched, so a failure here
        # (e.g. stop-word corpus not downloaded) never leaves a half-filled cache behind
        _TERM_TOOLS.update({
            "stemmer": PorterStemmer(),
            "stop_words": set(stopwords.words("english")),
            "tokenizer": nltk.tokenize.RegexpTokenizer(r'\w+'),
        })
    return _TERM_TOOLS["stemmer"], _TERM_TOOLS["stop_words"], _TERM_TOOLS["tokenizer"]


def build_terms(tweet):
    """
    Preprocess the text of a tweet: remove URLs and @mentions, split capitalised runs
    (so that CamelCase hashtags become separate words), replace underscores, lowercase,
    tokenize on word characters, remove English stop words and stem the remaining tokens.

    Argument:
    tweet -- string (text) to be pre-processed; an empty string or None yields no terms

    Returns:
    a list of stemmed tokens, in the order in which they appear in the text
    """
    if not tweet:
        return []

    stemmer, stop_words, tokenizer = _get_term_tools()

    tweet = re.sub(r'http\S+', '', tweet)  # delete the urls
    tweet = re.sub(r'@\S+', '', tweet)  # delete the word after @ (so the people labelled)
    # put a space around every run "capital letter + lowercase letters"; this separates
    # the words of a CamelCase hashtag, but note that it is applied to the whole text,
    # not only to the hashtags
    tweet = " ".join([a for a in re.split('([A-Z][a-z]+)', tweet) if a])
    # "_" is matched by \w, so the tokenizer would keep it inside a token; turn it into a space
    tweet = tweet.replace("_", " ")
    tweet = tweet.lower()  # transform to lowercase
    tokens = tokenizer.tokenize(tweet)  # list of terms without punctuation marks

    # drop the stop words first and stem only the tokens that are kept
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

In [7]:
_MISSING = object()  # sentinel: tells "field absent" apart from a field whose value is None


def _dig(container, *path):
  """Follow `path` (a sequence of keys / positions) into nested containers; return _MISSING if any step fails."""
  current = container
  for step in path:
    try:
      current = current[step]
    except (KeyError, IndexError, TypeError):
      # the key is absent, the list is too short, or the intermediate value is not subscriptable
      return _MISSING
  return current


def _tweet_metadata(tweet):
  """Collect the display fields of one raw tweet; optional fields that are absent are simply left out."""
  metadata = {}

  def put(key, *path):
    value = _dig(tweet, *path)
    if value is not _MISSING:
      metadata[key] = value

  put("text", "full_text")
  put("username", "user", "screen_name")
  put("date", "created_at")
  # "hashtags" is always present: an empty list when the tweet carries none
  hashtags = _dig(tweet, "entities", "hashtags")
  metadata["hashtags"] = list(hashtags) if isinstance(hashtags, (list, tuple)) else []
  put("likes", "favorite_count")
  put("retweets", "retweet_count")
  put("url", "entities", "media", 0, "url")  # url of the first media item, if there is one
  return metadata


def create_index(tweets, preview_count=11):
  """
  Create the inverted index and the tweets dictionary

  Relies on two module-level names: `doc_id` (maps the tweet id, as a string, to its
  document number) and `build_terms` (turns the tweet text into a list of terms).

  Arguments:
  tweets -- collection of tweets (each one a dictionary with at least the key "id")
  preview_count -- number of leading tweets whose original text and resulting terms are
  printed as a visual check; the default (11) keeps the previous behaviour, 0 prints nothing

  Returns:
  index -- the inverted index. Maps each term to a list of postings, one per tweet that
  contains the term; a posting is [document number, array('I') with the positions of the
  term inside that tweet]
  tweets_index -- the tweets dictionary. Maps each document number to a dictionary with the
  text, username, date, hashtags, number of likes, number of retweets and url of the tweet
  (a field is omitted when the tweet does not have it, except "hashtags", which is then [])

  Raises:
  KeyError -- if a tweet has no "id" or its id is not a key of `doc_id`
  """
  index = defaultdict(list) # We create the inverted index
  tweets_index = {} # We create the tweets dictionary

  for tweet_number, tweet in enumerate(tweets):
    # the document number is looked up once per tweet and reused for every posting
    doc_number = doc_id[str(tweet["id"])]

    metadata = _tweet_metadata(tweet)
    tweets_index[doc_number] = metadata # save the tweet by the document number related with the tweet id

    # a tweet without text contributes no terms instead of aborting the whole run
    text = metadata.get("text") or ""
    terms = build_terms(text) # call build terms for processing the text of the tweet

    if tweet_number < preview_count:
      # print the tweet text and its terms for checking that the result is okay
      print("Original full text of the tweet': \n{}".format(text))
      print("Terms after processing the text': \n{}".format(terms))

    current_page_index = {}

    for position, term in enumerate(terms): # loop over all terms
      if term not in current_page_index:
        # first occurrence in this tweet: start a posting for it
        current_page_index[term] = [doc_number, array('I')] #'I' indicates unsigned int (int in Python)
      current_page_index[term][1].append(position)

    # merge the current page index with the main index
    for term_page, posting_page in current_page_index.items():
      index[term_page].append(posting_page)

  return index, tweets_index

#**4. Check the results**

In [8]:
# Build the inverted index and the tweets index, and report how long the construction took
start_time = time.time()
index, tweets_index = create_index(tweets)
elapsed_seconds = np.round(time.time() - start_time, 2)  # rounded to two decimals
print("Total time to create the index: {} seconds".format(elapsed_seconds))

Original full text of the tweet': 
So this will keep spinning over us until 7 pm…go away already. #HurricaneIan https://t.co/VROTxNS9rz
Terms after processing the text': 
['keep', 'spin', 'us', '7', 'pm', 'go', 'away', 'alreadi', 'hurrican', 'ian']
Original full text of the tweet': 
Our hearts go out to all those affected by #HurricaneIan. We wish everyone on the roads currently braving the conditions safe travels. 💙
Terms after processing the text': 
['heart', 'go', 'affect', 'hurrican', 'ian', 'wish', 'everyon', 'road', 'current', 'brave', 'condit', 'safe', 'travel']
Original full text of the tweet': 
Kissimmee neighborhood off of Michigan Ave. 
#HurricaneIan https://t.co/jf7zseg0Fe
Terms after processing the text': 
['kissimme', 'neighborhood', 'michigan', 'ave', 'hurrican', 'ian']
Original full text of the tweet': 
I have this one tree in my backyard that scares me more than the poltergeist tree when it’s storming and windy like this. #scwx #HurricaneIan
Terms after processing the 

In [9]:
# check the first index results for a term
# `index` is a defaultdict: reading index[term] for an unseen term (such as the unstemmed
# 'hurricane') would silently add an empty posting list to the index, so .get() is used here
print("Index results for the term 'hurricane': {}\n".format(index.get('hurricane', [])))
print("First 10 Index results for the term 'hurrican': \n{}".format(index.get('hurrican', [])[:10]))

Index results for the term 'hurricane': []

First 10 Index results for the term 'hurrican': 
[['doc_1', array('I', [8])], ['doc_2', array('I', [3])], ['doc_3', array('I', [4])], ['doc_4', array('I', [10])], ['doc_5', array('I', [3])], ['doc_6', array('I', [6, 9, 23])], ['doc_7', array('I', [5])], ['doc_8', array('I', [7, 11])], ['doc_9', array('I', [3])], ['doc_10', array('I', [3])]]


In [10]:
# check the tweets index result for one tweet (the entry stored under the document number "doc_4")
print(tweets_index["doc_4"])

{'text': 'I have this one tree in my backyard that scares me more than the poltergeist tree when it’s storming and windy like this. #scwx #HurricaneIan', 'username': 'spiralgypsy', 'date': 'Fri Sep 30 18:38:57 +0000 2022', 'hashtags': [{'text': 'scwx', 'indices': [122, 127]}, {'text': 'HurricaneIan', 'indices': [128, 141]}], 'likes': 0, 'retweets': 0}
