# HW02 - Mourning Tweets

## Import Data

In [1]:
# Import Libraries
import nltk
import csv
import string
import numpy as np
import pandas as pd

from IPython.display import clear_output

In [2]:
# The NLTK corpora used later must already be available locally;
# uncomment the next line once to open the NLTK downloader.
#nltk.download()

# Load the fnmourning tweets from the CSV file
raw_df = pd.read_csv(filepath_or_buffer='./data/fnmourning.csv', sep=',')

# Preview the first rows of the raw dataset
raw_df.head()

Unnamed: 0,text,tag,lang,emoticon
0,"Hoy uno de mis tíos falleció por COVID-19, no ...",mourning,es,0
1,"Hoy falleció mi abuela. Y eso, sin velorio, si...",mourning,es,0
2,Muere el primer médico en activo por Covid-19 ...,mourning,es,0
3,Historia de una victoria. Historia de los últi...,mourning,es,1
4,He visto dos personas a las que les tomé cariñ...,mourning,es,0


In [3]:
# Encode the string tags as binary labels (1 = mourning, 0 = no mourning)
raw_df['tag'] = raw_df['tag'].replace({'mourning': 1, 'no mourning': 0})

# One sub-frame per language
es_df = raw_df.loc[raw_df['lang'] == "es"]
en_df = raw_df.loc[raw_df['lang'] == "en"]

## Mourning Lexicons

### Process Tweets

In [4]:
# Tweet tokenizer 
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

def process_tweet(tokenizer, tweet, stop_words):
    """ Applies standard pre-processing to given tweet.

    The tweet is lower-cased and every character found in
    ``string.punctuation`` is dropped *before* the text is handed to the
    tokenizer; tokens that appear in ``stop_words`` are then discarded.

    Args:
        tokenizer: object exposing a ``tokenize(str)`` method that returns
            the tokens of the given text (e.g. nltk's ``TweetTokenizer``).
        tweet (str): tweet to preprocess.
        stop_words (iterable of str): words to be removed.

    Returns:
        list: remaining tokens, in their original order.

    """

    # Remove punctuations and convert characters to lower case
    # NOTE(review): '@' and '#' belong to string.punctuation, so they are gone
    # before the tokenizer runs; tokenizer options that key on those characters
    # presumably never trigger -- confirm this ordering is intended.
    tweet_nopunct = "".join(
        char.lower() for char in tweet if char not in string.punctuation
    )

    # Tokenize words
    tk_tweet = tokenizer.tokenize(tweet_nopunct)

    # Materialise the stop words once: constant-time lookups per token, and a
    # one-shot iterable is no longer exhausted by the first membership test
    stop_set = frozenset(stop_words)

    # Returns processed text (tokens that are not stop words)
    return [word for word in tk_tweet if word not in stop_set]
    
# Single tokenizer instance shared by both languages
tt = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

# ES dataset: one token list per tweet, Spanish stop words removed
es_stop_words = stopwords.words('spanish')
es_tweets = [process_tweet(tt, text, es_stop_words) for text in es_df.text]

# EN dataset: one token list per tweet, English stop words removed
en_stop_words = stopwords.words('english')
en_tweets = [process_tweet(tt, text, en_stop_words) for text in en_df.text]

In [5]:
# Build dictionaries
from gensim import corpora

# One gensim dictionary per language, built from the tokenized tweets
es_dict = corpora.Dictionary(es_tweets)   # ES tweets
en_dict = corpora.Dictionary(en_tweets)   # EN tweets

# Glimpse to dictionaries (same five lines as printing them one by one)
print('ES Dictionary:', es_dict, '', 'EN Dictionary:', en_dict, sep='\n')

ES Dictionary:
Dictionary(38653 unique tokens: ['19', 'conciencia', 'condolencias', 'covid', 'cuenta']...)

EN Dictionary:
Dictionary(32643 unique tokens: ['19', 'away', 'burton', 'clinicians', 'continue']...)


In [6]:
# Bag-of-words representation of every tweet, via each language's dictionary
es_corpus = [es_dict.doc2bow(tokens) for tokens in es_tweets]   # ES tweets
en_corpus = [en_dict.doc2bow(tokens) for tokens in en_tweets]   # EN tweets

### Build Lexicons

In [7]:
def create_lexicons(dictionary, tweet_corpus, tags):
    """ Create lexicons for mourning tweets.

    For every term, the occurrences are accumulated separately over the
    tweets tagged as mourning and over the remaining tweets. Each count
    vector is then divided by its own total, so a score is the share of the
    class' term occurrences that belong to that term.

    Args:
        dictionary: Sized collection of terms (``len(dictionary)`` gives the
            number of term ids), e.g. a gensim ``Dictionary``.
        tweet_corpus: Iterable of tweets in BOW form, i.e. each tweet is an
            iterable of ``(term_id, count)`` pairs.
        tags (numpy.array): Mourning tag of each tweet, aligned with
            ``tweet_corpus`` (1=Mourning, 0=No mourning).

    Returns:
        pos (numpy.array): Positive (mourning) score of each term.
        neg (numpy.array): Negative (no mourning) score of each term.
        A class without any term occurrence gets an all-zero score vector
        instead of NaNs.

    """

    # Create pos/neg arrays, one slot per term id
    n_terms = len(dictionary)
    pos_count = np.zeros(n_terms)
    neg_count = np.zeros(n_terms)

    # Tweet corpus loop
    for i, tweet in enumerate(tweet_corpus):

        # Pick the accumulator once per tweet according to its tag
        target = pos_count if tags[i] else neg_count

        # Term loop
        for term_id, count in tweet:
            target[term_id] += count

    # Convert counts to score (scaled likelihood); when a class has no
    # occurrences its counts are already all zero, so return them untouched
    # rather than dividing by zero
    pos_total = pos_count.sum()
    neg_total = neg_count.sum()
    pos_score = pos_count / pos_total if pos_total else pos_count
    neg_score = neg_count / neg_total if neg_total else neg_count

    # Return pos/neg score of terms
    return pos_score, neg_score

In [8]:
# Positive (mourning) / negative (no mourning) term scores, per language
es_pos, es_neg = create_lexicons(
    dictionary=es_dict,
    tweet_corpus=es_corpus,
    tags=es_df.tag.values,
)
en_pos, en_neg = create_lexicons(
    dictionary=en_dict,
    tweet_corpus=en_corpus,
    tags=en_df.tag.values,
)

In [9]:
# Initial dataframe with terms (one row per dictionary entry: term and its id)
es_lexicons = pd.DataFrame(list(es_dict.token2id.items()), columns=['Term', 'Id'])
en_lexicons = pd.DataFrame(list(en_dict.token2id.items()), columns=['Term', 'Id'])

# Add pos/neg Scores.
# The score arrays are indexed by term id, so each row's score is looked up
# through its Id column instead of relying on the rows happening to be in
# id order.
es_ids = es_lexicons['Id'].to_numpy()
es_lexicons['PosScore'] = es_pos[es_ids]
es_lexicons['NegScore'] = es_neg[es_ids]

en_ids = en_lexicons['Id'].to_numpy()
en_lexicons['PosScore'] = en_pos[en_ids]
en_lexicons['NegScore'] = en_neg[en_ids]

In [10]:
# Print top 30 ES Lexicons for mourning
# Use the fully qualified option name: the abbreviated "max_rows" is treated
# as a pattern and fails when it matches more than one pandas option.
pd.set_option("display.max_rows", 30)
es_lexicons.sort_values(by='PosScore', ascending=False).head(30)

Unnamed: 0,Term,Id,PosScore,NegScore
2,condolencias,2,0.013976,0.000247
84,qepd,84,0.013887,0.000589
364,pesar,364,0.011567,5.9e-05
3,covid,3,0.010335,0.018806
0,19,0,0.010202,0.017333
538,pena,538,0.009469,0.00033
21,cuarentena,21,0.009269,0.01295
36,familia,36,0.009247,0.001143
72,coronavirus,72,0.007515,0.015955
635,dolor,635,0.006894,0.000306


In [11]:
# Top 30 EN terms ranked by their mourning (positive) score
(
    en_lexicons
    .sort_values(by='PosScore', ascending=False)
    .head(30)
)

Unnamed: 0,Term,Id,PosScore,NegScore
5,covid,5,0.024279,0.03161
13,rip,13,0.0214,0.002825
0,19,0,0.020889,0.029534
71,’,71,0.013228,0.013723
29,coronavirus,29,0.01205,0.020277
100,peace,100,0.010016,0.00335
101,rest,101,0.00954,0.000837
56,condolences,56,0.009457,0.000793
241,🙏,241,0.009409,0.001675
183,family,183,0.008922,0.001362


## Classifiers

In [12]:
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

import copy

### Feature representation

In [13]:
def build_feature_representation(dictionary, corpus, remove_emojis=False, emoji_list=None):
    """ Build BOW Model Matrix representation for tweets in corpus.

    Args:
        dictionary: Dictionary for the tweet corpus. Must support ``len()``
            and expose a ``token2id`` mapping from token to column id
            (e.g. a gensim ``Dictionary``).
        corpus: Tweet corpus with tweets in BOW Model, i.e. each tweet is an
            iterable of ``(term_id, count)`` pairs.
        remove_emojis (boolean): Whether to remove or not to remove emoji columns from the matrix.
        emoji_list (list): Emojis whose columns are removed (only used if
            remove_emojis = True). ``None`` is treated as an empty list.

    Returns:
        X (numpy.array): Boolean BOW Model Matrix for tweets in corpus, with
            one row per tweet and one column per kept dictionary term.

    """
    # Empty Matrix with BOW Model for each corpus
    X = np.zeros((len(corpus), len(dictionary)), dtype=np.bool_)

    # Fill Input Matrix: only the presence of a term matters, not its count
    for i, tweet in enumerate(corpus):
        for term in tweet:
            X[i, term[0]] = True

    # Remove emojis
    if remove_emojis:
        # Get Emoji IDs for Dictionary; emojis absent from the dictionary
        # simply have no column to drop
        token2id = dictionary.token2id
        emoji_ids = [token2id[emoji] for emoji in (emoji_list or []) if emoji in token2id]

        # Delete Emoji Columns on X Matrix
        if emoji_ids:
            X = np.delete(X, emoji_ids, 1)

    return X

In [14]:
# Boolean BOW matrices that keep every dictionary term (emojis included)
X_es = build_feature_representation(dictionary=es_dict, corpus=es_corpus)
X_en = build_feature_representation(dictionary=en_dict, corpus=en_corpus)

In [15]:
# Import emoji list (Download emoji Module)
# NOTE(review): UNICODE_EMOJI appears to exist only in older releases of the
# emoji package -- confirm/pin the installed version.
from emoji import UNICODE_EMOJI

# Merge ES and EN Emoji List
emoji_list = list({**UNICODE_EMOJI['en'], **UNICODE_EMOJI['es']}.keys())

# Build Boolean BOW Model WITHOUT Emojis
X_es_no_emojis = build_feature_representation(es_dict, es_corpus, remove_emojis=True, emoji_list=emoji_list)
# The EN matrix must come from the EN dictionary and corpus (it was previously
# built from the ES ones, duplicating X_es_no_emojis)
X_en_no_emojis = build_feature_representation(en_dict, en_corpus, remove_emojis=True, emoji_list=emoji_list)

In [21]:
# Shape of the ES matrix built with every dictionary term: (tweets, terms)
X_es.shape

(13168, 38653)

In [23]:
# Shape of the ES matrix after the emoji columns were dropped: (tweets, remaining terms)
X_es_no_emojis.shape

(13168, 38148)

### Training models

In [16]:
# Split Data


In [17]:
# Naive Bayes


In [18]:
# Logistic Regression


In [19]:
# Decision Tree


In [20]:
# Random Forest


### Testing models

## Feature Importance