# HW02 - Mourning Tweets

## Import Data

In [1]:
# Import Libraries
import nltk
import csv
import string
import numpy as np
import pandas as pd

from IPython.display import clear_output

In [2]:
# Make sure the NLTK stop-word corpus is installed before it is needed.
# The lookup is local, so the download only happens when the corpus is missing
# (a blanket interactive nltk.download() is not reproducible on a fresh kernel).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Import fnmourning dataset
raw_df = pd.read_csv('./data/fnmourning.csv', sep = ',')

# Glimpse of the raw dataset
raw_df.head()

Unnamed: 0,text,tag,lang,emoticon
0,"Hoy uno de mis tíos falleció por COVID-19, no ...",mourning,es,0
1,"Hoy falleció mi abuela. Y eso, sin velorio, si...",mourning,es,0
2,Muere el primer médico en activo por Covid-19 ...,mourning,es,0
3,Historia de una victoria. Historia de los últi...,mourning,es,1
4,He visto dos personas a las que les tomé cariñ...,mourning,es,0


In [3]:
# Encode the string labels as binary tags: 'mourning' -> 1, 'no mourning' -> 0
raw_df['tag'] = (
    raw_df['tag']
    .replace('mourning', 1)
    .replace('no mourning', 0)
)

# One dataframe per tweet language
es_df = raw_df.loc[raw_df['lang'] == "es"]
en_df = raw_df.loc[raw_df['lang'] == "en"]

## Lexicons

In [4]:
# Tweet tokenizer 
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

def process_tweet(tokenizer, tweet, stop_words):
    """Lower-case a tweet, strip ASCII punctuation, tokenize it and drop stop words.

    Args:
        tokenizer: object exposing a ``tokenize(str)`` method that returns an
            iterable of tokens.
        tweet (str): raw tweet text to preprocess.
        stop_words (iterable of str): tokens to discard after tokenization.

    Returns:
        list: remaining tokens, in the order produced by the tokenizer.

    """

    # Drop every character listed in string.punctuation and lower-case the rest.
    # Lower-casing is done per character, exactly as the characters are kept.
    # NOTE(review): '@' and '#' are part of string.punctuation, so they are gone
    # before the tokenizer runs; tokenizer options that depend on them (e.g.
    # handle stripping) presumably never trigger -- confirm this is intended.
    tweet_nopunct = "".join(
        char.lower() for char in tweet if char not in string.punctuation
    )

    # Hash-based lookup: stop-word lists are scanned once per tweet instead of
    # once per token.
    if isinstance(stop_words, (set, frozenset)):
        excluded = stop_words
    else:
        excluded = frozenset(stop_words)

    # Tokenize and keep only the tokens that are not stop words
    return [word for word in tokenizer.tokenize(tweet_nopunct) if word not in excluded]
    
# Single tweet tokenizer shared by both languages
tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)

# ES dataset: Spanish stop words, one token list per tweet
es_stop_words = stopwords.words('spanish')
es_tweets = [process_tweet(tt, text, es_stop_words) for text in es_df.text]

# EN dataset: English stop words, one token list per tweet
en_stop_words = stopwords.words('english')
en_tweets = [process_tweet(tt, text, en_stop_words) for text in en_df.text]

In [5]:
# Build dictionaries
from gensim import corpora

# Build the ES dictionary from the tokenized ES tweets
es_dict = corpora.Dictionary(es_tweets)

# Build the EN dictionary from the tokenized EN tweets
en_dict = corpora.Dictionary(en_tweets)

# Glimpse of both dictionaries (one item per line, blank line in between)
print('ES Dictionary:', es_dict, '', 'EN Dictionary:', en_dict, sep='\n')

ES Dictionary:
Dictionary(38653 unique tokens: ['19', 'conciencia', 'condolencias', 'covid', 'cuenta']...)

EN Dictionary:
Dictionary(32643 unique tokens: ['19', 'away', 'burton', 'clinicians', 'continue']...)


In [6]:
# Bag-of-words representation of every tweet, using its language's dictionary
es_corpus = [es_dict.doc2bow(tokens) for tokens in es_tweets]
en_corpus = [en_dict.doc2bow(tokens) for tokens in en_tweets]

In [7]:
def create_lexicons(dictionary, tweet_corpus, tags):
    """ Create lexicon scores for mourning tweets.

    Term counts are accumulated separately over tweets whose tag is truthy
    (mourning) and falsy (no mourning). Each count vector is then normalised
    so it sums to 1, and the overall score is the difference between the two.

    Args:
        dictionary: Sized collection of terms (e.g. a gensim Dictionary); only
            its length is used, to size the score arrays.
        tweet_corpus: Iterable of tweets in BOW form, each a sequence of
            (term_id, count) pairs.
        tags (numpy.array): Mourning tags indexed by tweet position
            (1=Mourning, 0=No mourning).

    Returns:
        pos (numpy.array): Positive (mourning) score of each term.
        neg (numpy.array): Negative (no mourning) score of each term.
        overall (numpy.array): pos - neg for each term.

    """

    # Per-term count accumulators, indexed by term id
    pos = np.zeros(len(dictionary))
    neg = np.zeros(len(dictionary))

    # Tweet corpus loop
    for i, tweet in enumerate(tweet_corpus):

        # All terms of a tweet go to the same accumulator, chosen by its tag
        target = pos if tags[i] else neg

        # Term loop
        for term_id, count in tweet:
            target[term_id] += count

    # Convert counts to scores. A class with no counted terms keeps all-zero
    # scores instead of turning into NaN through a division by zero.
    pos_total = pos.sum()
    neg_total = neg.sum()
    if pos_total:
        pos = pos / pos_total
    if neg_total:
        neg = neg / neg_total
    overall = pos - neg

    # Return pos/neg/overall score of terms
    return pos, neg, overall

In [8]:
# Score the terms of each language; tags are passed as plain arrays so they are
# indexed by tweet position
es_pos, es_neg, es_overall = create_lexicons(
    dictionary=es_dict,
    tweet_corpus=es_corpus,
    tags=es_df['tag'].values,
)
en_pos, en_neg, en_overall = create_lexicons(
    dictionary=en_dict,
    tweet_corpus=en_corpus,
    tags=en_df['tag'].values,
)

In [9]:
def build_lexicon_frame(dictionary, pos, neg, overall):
    """Build a term table with the positive, negative and overall score of each term.

    Args:
        dictionary: object exposing a ``token2id`` mapping (term -> integer id).
        pos (numpy.array): positive score per term, indexed by term id.
        neg (numpy.array): negative score per term, indexed by term id.
        overall (numpy.array): overall score per term, indexed by term id.

    Returns:
        pandas.DataFrame: columns Term, Id, PosScore, NegScore, Score.
    """
    # Initial dataframe with terms
    frame = pd.DataFrame(list(dictionary.token2id.items()), columns = ['Term','Id'])

    # Look scores up by term id instead of by row position, so the result does
    # not depend on the iteration order of token2id.
    ids = frame['Id'].astype(int).to_numpy()
    frame['PosScore'] = np.asarray(pos)[ids]
    frame['NegScore'] = np.asarray(neg)[ids]
    frame['Score'] = np.asarray(overall)[ids]
    return frame

# One lexicon table per language
es_lexicons = build_lexicon_frame(es_dict, es_pos, es_neg, es_overall)
en_lexicons = build_lexicon_frame(en_dict, en_pos, en_neg, en_overall)

In [10]:
# Print top 30 ES Lexicons for mourning
# Use the fully qualified option key: the bare "max_rows" pattern matches more
# than one option on recent pandas versions and raises OptionError.
pd.set_option("display.max_rows", 30)
es_lexicons.sort_values(by='Score', ascending=False).head(30)

Unnamed: 0,Term,Id,PosScore,NegScore,Score
2,condolencias,2,0.013976,0.000247,0.013729
84,qepd,84,0.013887,0.000589,0.013298
364,pesar,364,0.011567,5.9e-05,0.011508
538,pena,538,0.009469,0.00033,0.009139
36,familia,36,0.009247,0.001143,0.008104
635,dolor,635,0.006894,0.000306,0.006587
67,🙏,67,0.005872,0.000789,0.005083
466,dep,466,0.005162,0.000966,0.004196
47,pésame,47,0.004296,0.000318,0.003978
489,fallecimiento,489,0.003508,0.000177,0.003331


In [11]:
# Show the 30 EN terms with the highest mourning score
en_lexicons.sort_values('Score', ascending=False).iloc[:30]

Unnamed: 0,Term,Id,PosScore,NegScore,Score
13,rip,13,0.0214,0.002825,0.018575
101,rest,101,0.00954,0.000837,0.008703
56,condolences,56,0.009457,0.000793,0.008664
1188,sorry,1188,0.007863,0.000123,0.00774
241,🙏,241,0.009409,0.001675,0.007735
183,family,183,0.008922,0.001362,0.00756
79,lost,79,0.007661,0.000837,0.006823
363,loss,363,0.007042,0.000335,0.006707
100,peace,100,0.010016,0.00335,0.006666
186,may,186,0.006864,0.001943,0.004921


## Classifiers

## Feature Importance