In [1]:
import warnings
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this is a blanket filter, so library warnings (deprecations,
# pandas assignment warnings, ...) are hidden as well -- consider narrowing it
# to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")
import nltk

In [2]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
#from numpy import ndarray as nda
import time
import pandas as pd
import json
import contractions
import re
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used further down (a package that is already
# up to date is reported as such instead of being downloaded again):
#   stopwords -> stopwords.words("english") inside build_terms
#   wordnet   -> data behind WordNetLemmatizer
#   omw-1.4   -> presumably the multilingual WordNet data that recent NLTK
#                releases expect next to wordnet -- TODO confirm it is needed
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/nasar/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/nasar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nasar/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# 1) Reading the tweets to doc mapping 

In [3]:
docs_path = 'data/tweet_document_ids_map.csv'

# Load the mapping file; every non-empty line is one "<docID>\t<tweetID>" record.
with open(docs_path) as fp:
    raw_rows = fp.read().split("\n")
lines = [row for row in raw_rows if row != ""]  # discard empty lines
print("Total number of tweet documents in the corpus: {}".format(len(lines)))

# Two lookup tables so the mapping can be followed in either direction.
tweet2doc = {}
doc2tweet = {}

for docID, tweetID in (row.split("\t") for row in lines):
    tweet2doc[tweetID] = docID
    doc2tweet[docID] = tweetID

Total number of tweet documents in the corpus: 4000


# 2) Read tw_hurricane_data.json into a pandas Dataframe, keeping relevant fields

In [4]:
tweets_path = 'data/tw_hurricane_data.json'

# The file holds one serialized tweet (a JSON document) per line.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding, which is not UTF-8 on every system.
with open(tweets_path, encoding="utf-8") as fp:
    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear inside a JSON string.
    tweets = fp.read().split("\n")  # each tweet is a new line
# Drop empty and whitespace-only lines; they are not tweets and could not be
# parsed as JSON afterwards.
tweets = [t for t in tweets if t.strip()]
print("Total number of tweets in the corpus: {}".format(len(tweets)))

Total number of tweets in the corpus: 4000


In [5]:
# Decode every raw line into a Python object, keeping the order of the file.
tweets_json = [json.loads(raw_tweet) for raw_tweet in tweets]

In [6]:
# Flatten the nested fields we need onto each tweet dict (modified in place).
for tweet in tweets_json:
    tweet["hashtags"] = [tag["text"] for tag in tweet["entities"]["hashtags"]]
    tweet["username"] = tweet["user"]["screen_name"]
    # Look up the document id through the tweet -> doc mapping
    # (dict.get yields None for a tweet id that is not in the mapping).
    tweet["docID"] = tweet2doc.get(tweet["id_str"])
    tweet["url"] = f"https://twitter.com/{tweet['username']}/status/{tweet['id_str']}"

# Fields kept in the final table: source key -> column title, in display order.
columns = {
    "docID": "DocID",
    "full_text": "Tweet",
    "username": "Username",
    "created_at": "Date",
    "hashtags": "Hashtags",
    "favorite_count": "Likes",
    "retweet_count": "Retweets",
    "url": "Url",
}

# One row per tweet, restricted to the selected fields and renamed.
dt_tweets = (
    pd.DataFrame(tweets_json)[list(columns)]
    .rename(columns=columns)
)
dt_tweets.head()

Unnamed: 0,DocID,Tweet,Username,Date,Hashtags,Likes,Retweets,Url
0,doc_1,So this will keep spinning over us until 7 pm…...,suzjdean,Fri Sep 30 18:39:08 +0000 2022,[HurricaneIan],0,0,https://twitter.com/suzjdean/status/1575918182...
1,doc_2,Our hearts go out to all those affected by #Hu...,lytx,Fri Sep 30 18:39:01 +0000 2022,[HurricaneIan],0,0,https://twitter.com/lytx/status/15759181518623...
2,doc_3,Kissimmee neighborhood off of Michigan Ave. \n...,CHeathWFTV,Fri Sep 30 18:38:58 +0000 2022,[HurricaneIan],0,0,https://twitter.com/CHeathWFTV/status/15759181...
3,doc_4,I have this one tree in my backyard that scare...,spiralgypsy,Fri Sep 30 18:38:57 +0000 2022,"[scwx, HurricaneIan]",0,0,https://twitter.com/spiralgypsy/status/1575918...
4,doc_5,@AshleyRuizWx @Stephan89441722 @lilmizzheidi @...,Blondie610,Fri Sep 30 18:38:53 +0000 2022,[HurricaneIan],0,0,https://twitter.com/Blondie610/status/15759181...


# 3) Processing the tweets content to keep a useful structure for future index creation

In [7]:
def build_terms(line: str) -> str:
    """
    Preprocess a tweet and return its normalized terms as a single string.

    Steps, in order:
      1. lowercase the text;
      2. remove links (both http:// and https://);
      3. turn typographic apostrophes into "'" so contractions are still
         recognised, and collapse every run of whitespace (newlines, tabs,
         ...) into one space so words on different lines are not glued
         together;
      4. delete every character that is not a letter, digit, space or
         apostrophe (emojis, '#', '@', punctuation, ...);
      5. expand contractions (i'll -> i will), then drop the apostrophes left;
      6. remove English stop words;
      7. lemmatize, then stem each remaining word.

    Argument:
    line -- string (tweet) to be preprocessed

    Returns:
    str -- the remaining terms joined by single spaces ('' when nothing is left)
    """
    line = line.lower()
    # Links have to go while "://" is still there: once punctuation is
    # stripped a URL is indistinguishable from an ordinary long word.
    line = re.sub(r"https?://\S+", " ", line)
    # Tweets often use curly quotes; map them to "'" before the clean-up
    # below would delete them and hide the contraction (i’m -> im).
    line = re.sub(r"[\u2018\u2019]", "'", line)
    # Newlines/tabs become a plain space instead of being deleted, which
    # used to fuse the words around them.
    line = re.sub(r"\s+", " ", line)
    line = re.sub(r"[^a-z0-9 ']+", '', line)  # remove emojis and any other special character
    stop_words = set(stopwords.words("english"))
    # expanding verb abbreviations word by word: i'll -> i will
    line = ' '.join(contractions.fix(x) for x in line.split(' '))
    line = line.replace("'", '')  # leftover apostrophes (possessives, quotes)
    tokens = [x for x in line.split(' ') if x and x not in stop_words]
    # Kept from the previous implementation: discard link remnants that had
    # no "://" and therefore escaped the regular expression above.
    tokens = [x for x in tokens if not x.startswith('https')]
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(x) for x in tokens]  # singular form of each noun: feet --> foot
    tokens = [ps.stem(x) for x in tokens]  # root of each family of words: dancer --> danc
    return ' '.join(tokens)

In [8]:
# Quick check on a hand-written sentence that mixes an irregular plural,
# an emoji, a contraction, possessives and a link.
example_sentence = "Where are his feet pointing at, alex ✨? I'm  going to buy the gardener's cats' food from this link https://hola.com"
build_terms(example_sentence)

'foot point alex go buy garden cat food link'

In [9]:
# Show one raw tweet from the corpus, followed by its preprocessed form.
sample_tweet = dt_tweets.iloc[3]['Tweet']
print(sample_tweet)
build_terms(sample_tweet)

I have this one tree in my backyard that scares me more than the poltergeist tree when it’s storming and windy like this. #scwx #HurricaneIan


'one tree backyard scare poltergeist tree storm windi like scwx hurricaneian'

In [10]:
# Work on a copy so dt_tweets keeps the original tweet text.
tweets_processed = dt_tweets.copy()
# Replace the whole column in one assignment. Writing through chained indexing
# (frame["Tweet"][index] = ...) goes via an intermediate Series: pandas flags
# it with SettingWithCopyWarning, and with Copy-on-Write enabled the frame
# would not be updated at all.
tweets_processed["Tweet"] = tweets_processed["Tweet"].map(build_terms)
tweets_processed.head()

Unnamed: 0,DocID,Tweet,Username,Date,Hashtags,Likes,Retweets,Url
0,doc_1,keep spin u 7 pmgo away alreadi hurricaneian,suzjdean,Fri Sep 30 18:39:08 +0000 2022,[HurricaneIan],0,0,https://twitter.com/suzjdean/status/1575918182...
1,doc_2,heart go affect hurricaneian wish everyon road...,lytx,Fri Sep 30 18:39:01 +0000 2022,[HurricaneIan],0,0,https://twitter.com/lytx/status/15759181518623...
2,doc_3,kissimme neighborhood michigan ave hurricaneian,CHeathWFTV,Fri Sep 30 18:38:58 +0000 2022,[HurricaneIan],0,0,https://twitter.com/CHeathWFTV/status/15759181...
3,doc_4,one tree backyard scare poltergeist tree storm...,spiralgypsy,Fri Sep 30 18:38:57 +0000 2022,"[scwx, HurricaneIan]",0,0,https://twitter.com/spiralgypsy/status/1575918...
4,doc_5,ashleyruizwx stephan89441722 lilmizzheidi mrsn...,Blondie610,Fri Sep 30 18:38:53 +0000 2022,[HurricaneIan],0,0,https://twitter.com/Blondie610/status/15759181...


# 4) Exporting processed tweets to a csv file 

In [11]:
from pathlib import Path

# Destination of the export, relative to the current working directory.
filepath = Path('processed_tweets.csv')
# Make sure the destination folder exists (nothing happens when it already does).
output_dir = filepath.parent
output_dir.mkdir(parents=True, exist_ok=True)
# Write the processed tweets with to_csv's default options.
tweets_processed.to_csv(filepath)