# Libraries import

In [None]:
# Data manipulation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from google.colab import files

import random
import time
import re
import multiprocessing

# Fetch and label data
!pip install whatthelang &> /dev/null
!pip install vaderSentiment &> /dev/null
!pip install afinn &> /dev/null
!pip install twint &> /dev/null
!pip install aiohttp==3.7.0 &> /dev/null

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from afinn import Afinn 
import twint

import nest_asyncio
nest_asyncio.apply()

# Data cleaning
from whatthelang import WhatTheLang
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec

# Feature scaling
from sklearn.preprocessing import MinMaxScaler

# Machine Learning
from sklearn.model_selection import GridSearchCV, \
                                    train_test_split
from sklearn.pipeline import Pipeline 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Model evaluation
from sklearn.metrics import ConfusionMatrixDisplay, \
                            confusion_matrix, \
                            classification_report

# Data fetching

The opinion lexicon is a work of [Bing Liu](https://www.cs.uic.edu/~liub/) in the area of sentiment analysis and opinion mining from social media that assembles words classified as `negative` and `positive`, and much more.

The focus of this work is to use both the positive/negative words to address a search with the Twitter API for Tweets that contains this words, therefore are most probable to address a positive/negative feeling.

Since only using a lexicon is not enough to classify an entire sentence as expressing such feeling, other tools will be used to ensure such estimation for the Twitter extracted dataset labels.

In [None]:
# https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
!wget -N http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar &> /dev/null
!unrar e opinion-lexicon-English.rar -y &> /dev/null

negativeWords = pd.read_csv('negative-words.txt', skiprows=30, encoding='ISO8859', names=['negative_words'], squeeze=True)
positiveWords = pd.read_csv('positive-words.txt', skiprows=30, encoding='ISO8859', names=['positive_words'], squeeze=True)

print('Opinion lexicon samples:\n\n' + 
      'Negative words:\n' + 
      str(negativeWords.sample(5)) +'\n\n'+ 
      'Positive words:\n' + 
      str(positiveWords.sample(5)) +'\n')
print('#negative_words = ' + str(len(negativeWords)))
print('#positive_words = ' + str(len(positiveWords)))

Opinion lexicon samples:

Negative words:
3252    perfunctory
4616          venom
3007          naïve
1812          fussy
4767           wrip
Name: negative_words, dtype: object

Positive words:
1516    reverently
370        courage
1974          wise
623     excellence
1149     masterful
Name: positive_words, dtype: object

#negative_words = 4783
#positive_words = 2006


There are many tools for fetch data from Twitter, those are:
- [TwitterSearch](https://twittersearch.readthedocs.io/);
- [Tweepy](https://docs.tweepy.org/);
- [TWINT](https://github.com/twintproject/twint);

`TwitterSearch` share similarities with `Tweepy`, but both need to use the Twitter API. `TWINT` extracts the data in a different manner with a workaround to not use the Twitter API. 

This work will use the `TWINT` for the ease of use that comes no authentication requirements. So below is shown the process of importing the Tweets based in defined keywords as a demosntration. After this test the batch of tweets will e retrieved. 

`TWINT` allows to search on multiple keywords at a time just concatenating the string as `"health OR juice"` the tweets scraped will have `health` or `juice` or both keyword at the time. Using the opinion lexicon this could be achieved using the concatenation of the entire dataset as:

```
negativeWords.str.cat(sep=' OR ')
positiveWords.str.cat(sep=' OR ')
```

But this would crash due to the size of the string. So the strategy of choice is to iterate over each word in the lexicon and scrape a fixed number of tweets based in each word.

For this work each of the `6789` words in the lexicon will return `20` tweets, resulting in a total of `135780`. Further data cleaning will reduce this number.

In [None]:
config = twint.Config()
config.Pandas = True # use pandas integration
config.Search = 'depression' # keyword to search
config.Lang = 'en' # language
config.Limit = 20 # config max. number of tweets returned (min. 20)
config.Since = '2022-01-21 00:00:00' # fetch since data
config.Until = '2022-01-28 00:00:00' # fetch until data
#config.Hide_output = True # hide output of tweets when running
#config.Store_csv = True # allow saving to csv
#config.Output = "custom_out.csv" # save to filename
twint.run.Search(config) 

1486851509358940168 2022-01-27 23:59:58 +0000 <93418> @DVPdirect It's like people minimizing depression with nonsense like "Have you tried just being happy?" and "You've got great things going on, you shouldn't be sad."
1486851498520858627 2022-01-27 23:59:56 +0000 <Richard53116745> @dahuggasystem @elonmusk And cancer, depression etc etc. With Covid or from Covid, what age? Shove it, we are done with this nonsense.
1486851489423413253 2022-01-27 23:59:54 +0000 <cillic> @solarpunkgirl As someone who has fought this feeling going on 38 years now, I need you to know that this is your depression LYING TO YOU.   It will do anything to win. You just have to recognize the lies for what they are.
1486851484260179971 2022-01-27 23:59:52 +0000 <YesutorQ> @El_SahdGustavo @Mhiestroburns ....𝘁𝗵𝗲𝗻 𝗶 𝗰𝗮𝗻 𝘀𝗲𝗲 𝗵𝗼𝘄 𝗱𝘂𝗺𝗽 𝘁𝗼𝘂 𝗹𝗼𝗼𝗸 𝗿𝘆𝘁 𝗻𝗼𝘄..Kep up the ignorance and pass it to your generation okay 😃😃😃 3b3 bua wo paaa.... prove point aa ,ɔsi awaiting judgement..   Stop exhibiting Symptoms of hunger and depre

It is important to see that the line of code that ensures the output to a `.csv` file is commented. If wish so, this line can be uncommented and the `.csv` saved right away. But the intent of this works is only to save the Tweets column and no other information. All the columns are shown below:

In [None]:
twint.output.panda.Tweets_df.columns

Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
       'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
       'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
       'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
       'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')

So it can be filtered only the `tweet` column to a pandas dataframe and exported to `.csv`. Also the library `whatthelang` is used to remove tweets that are not in english.

In [None]:
def detect_lang(text):
    try: 
        return wtl.predict_lang(text)
    except Exception:
        return 'exp'

wtl = WhatTheLang()

df = pd.DataFrame(twint.output.panda.Tweets_df['tweet'], columns=['tweet'])
df['lang'] = df['tweet'].map(lambda t: detect_lang(t)) # detect language for each tweet
df = df[df['lang'] == 'en'] # filter only english tweets
df = df.drop(columns=['lang'])

pd.set_option('display.max_colwidth', None)
df.head(3)

Unnamed: 0,tweet
0,"@DVPdirect It's like people minimizing depression with nonsense like ""Have you tried just being happy?"" and ""You've got great things going on, you shouldn't be sad."""
1,"@dahuggasystem @elonmusk And cancer, depression etc etc. With Covid or from Covid, what age? Shove it, we are done with this nonsense."
2,"@solarpunkgirl As someone who has fought this feeling going on 38 years now, I need you to know that this is your depression LYING TO YOU. It will do anything to win. You just have to recognize the lies for what they are."


Then applying for the entirety  of the opinion lexicon:

```{python}
def detect_lang(text):
    try: 
        return wtl.predict_lang(text)
    except Exception:
        return 'exp'
def twint_config():
  config = twint.Config()
  config.Pandas = True # use pandas integration
  config.Lang = 'en' # language
  config.Limit = 20 # config max. number of tweets returned (min. 20)
  config.Since = '2022-01-21 00:00:00' # fetch since data
  config.Until = '2022-01-28 00:00:00' # fetch until data
  config.Hide_output = True # hide output of tweets when running
  return config

keywords = pd.concat([positiveWords, negativeWords], axis=0).tolist() # keywords to iterate
df = pd.DataFrame([], columns=['tweet']) # base dataframe to save data
wtl = WhatTheLang() # what the language instance
config = twint_config() # twint configuration 
missed_keywords = [] # list of keywords that return error
n = len(keywords) # total number of keywords
count = 0 # counting of progress
save_ratio = 100 # save dataset in every #save_ratio scrapes

for keyword in keywords:
  try:
    config.Search = keyword # keyword to search
    twint.run.Search(config) # scape tweets with given keyword

    temp = pd.DataFrame(twint.output.panda.Tweets_df['tweet'], columns=['tweet']) # get dataframe of scraped data
    temp['lang'] = temp['tweet'].map(lambda t: detect_lang(t)) # detect language for each tweet
    temp = temp[temp['lang'] == 'en'] # filter only english tweets
    temp = temp.drop(columns=['lang']) # remove language column

    df = pd.concat([df,temp], axis=0) # concatenate new tweets with previous batch
    if((count%save_ratio)==(save_ratio-1)): 
      print("Progress: {:.0%}/100%".format(count/n))
      df.to_csv('dataset.csv')
  except: 
    print('Error: the index is %d, and the missed keyword is %s' % (count, keywords[count]))
    missed_keywords.append(keywords[count])
  count +=1
df.to_csv('dataset.csv'
files.download('dataset.csv')
```

This code snippet was executed in a host machine since Google Colab free quota does not support long time running. 

Some keywords did not returned tweets and other returned errors. Those were repeated two more times. Since the first run had a 1 week scope, the second run had 1 month scope and the last run had a 2 years scope. After this, the remaining keywords will not be repeated since after those 3 iterations there is enough to data to analyze.

After generating the data in a host machine, the final step is to load the dataset that was stored in the GitHub. The non-english tweets removal with the `WhatTheLang` was not made in this case, so it should be done now.

In [None]:
def detect_lang(text):
    try: 
        return wtl.predict_lang(text)
    except Exception:
        return 'exp'

df = pd.read_csv('https://raw.githubusercontent.com/Lwao/awesome-ai/main/ufrn-ai/datasets/tweets_dataset.csv', usecols=['tweet'])
print('Length of dataset BEFORE removing non-english tweets: ' + str(len(df.index))) 

wtl = WhatTheLang()
df['lang'] = df['tweet'].map(lambda t: detect_lang(t)) # detect language for each tweet
df = df[df['lang'] == 'en'] # filter only english tweets
df = df.drop(columns=['lang']) # drop language column
df = df.reset_index()
print('Length of dataset AFTER removing non-english tweets: ' + str(len(df.index)))

print(df.head())

Length of dataset BEFORE removing non-english tweets: 97869
Length of dataset AFTER removing non-english tweets: 87043
   index                                              tweet
0      1  @tvtalker1 For a long ass time...  https://t.c...
1      2  if you’ve never sobbed in your car while liste...
2      3  @frankamoctezuma @SenatorSoules If you had an ...
3      4  when you actually have hair , you don’t always...
4      5  @ShebayaSarai @SaunderHart @MaidQuitNoChef @Th...


Less than 10 thousand tweets were in a language different from english. The remaining dataset still large enough for good traning.

# Alternate data fetching

After initial tests, this works choose to base the data fetching in the keyword "depression" and its 43 synonyms trying to identify tweets related to mental health. The dataset is load below and removed tweets in non-english.

The code used in the hsot machine is:

```
import twint
import nest_asyncio
import pandas as pd

nest_asyncio.apply()

def twint_config():
  config = twint.Config()
  config.Pandas = True # use pandas integration
  config.Lang = 'en' # language
  config.Limit = 100 # config max. number of tweets returned (min. 20)
  config.Since = '2020-01-01 00:00:00' # fetch since data
  config.Until = '2022-01-01 00:00:00' # fetch until data
  
  config.Hide_output = True # hide output of tweets when running
  return config

keywords = ['depression','abasement','abjection','blahs','bleakness','bummer','cheerlessness','dejection','desolation','desperation','despondency','discouragement','dispiritedness','distress','dole','dolefulness','dolor','downheartedness','dreariness','dullness','dumps','ennui','gloom','gloominess','heavyheartedness','hopelessness','lowness','melancholia','melancholy','misery','mortification','qualm','sadness','sorrow','trouble','unhappiness','vapors','woefulness','worry','abjectness','blue' 'funk','disconsolation','heaviness of heart','lugubriosity']
config = twint_config() # twint configuration 
df = pd.DataFrame([], columns=['tweet']) # base dataframe to save data

for keyword in keywords:
  try:
    config.Search = keyword # keyword to search
    twint.run.Search(config) # scape tweets with given keyword
    temp = pd.DataFrame(twint.output.panda.Tweets_df['tweet'], columns=['tweet'])
    df = pd.concat([df,temp], axis=0) # concatenate new tweets with previous batch
  except: df.to_csv('security_backup.csv')
df.to_csv('tweet_depression_dataset_epanded.csv')
```

The data is taken from januray the 1th of 2020 to janury the 1th of 2022 parsing the period during the Covid-19 pandemic.

Again the data was fetch in a host machine and then save in the web. Duplicates are removed prior to language detection.

In [None]:
def detect_lang(text):
    try: 
        return wtl.predict_lang(text)
    except Exception:
        return 'exp'

df = pd.read_csv('https://raw.githubusercontent.com/Lwao/awesome-ai/main/ufrn-ai/datasets/tweets_drepression_dataset_expanded.csv', usecols=['tweet'])
print('Length of dataset BEFORE removing non-english tweets: ' + str(len(df.index))) 

wtl = WhatTheLang()
df['lang'] = df['tweet'].map(lambda t: detect_lang(t)) # detect language for each tweet
df = df[df['lang'] == 'en'] # filter only english tweets
df = df.drop(columns=['lang']) # drop language column
df = df.reset_index()
print('Length of dataset AFTER removing non-english tweets: ' + str(len(df.index)))

print(df.head())

Length of dataset BEFORE removing non-english tweets: 3654
Length of dataset AFTER removing non-english tweets: 3178
   index                                              tweet
0      0  @BarrennessBlack @Animalsdonthate @TwiceDonald...
1      2     I have depression this is the best Iâve got.
2      3  @cn0bles *sometimes* people who are hypersexua...
3      4  I wish I had depression, but I'm falling and t...
4      5     *depression cured fr*  https://t.co/9WB9KE8o24


In [None]:
df.to_csv('tweets_drepression_dataset_nonlabeled.csv')
files.download('tweets_drepression_dataset_nonlabeled.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Data labeling

Once the data is extracted based in the opinion lexicon, the following step is to apply data labeling to address meaningful and iformative sentiment analysis labels for each Tweet, thus providng context so that the machine learning models can learn from it. 

The tool of choice is the Valence Aware Dictionary and Sentiment Reasoner ([VADER](https://github.com/cjhutto/vaderSentiment)), a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media. 

When provided a sentence VADER  is capable of addressing the rate of which represents a `positive`, `neutral` or `negative` sentiment in a scale that sums up to 1. There is an additional metric callend `compound` that varies from -1 up to 1 and i is basically an unidimensional measure of sentiment. Some useful thresholds of this metric are:

1. positive sentiment: `compound` $\geq 0.05$
2. neutral sentiment: `compound` $> -0.05$ and `compound` $< 0.05$
3. negative sentiment: `compound` $\leq -0.05$

The NLTK library also implements VADER, but this work choose to use the stand alone library for VADER.

There are also others [sentiments lexicons](https://www.tidytextmining.com/sentiment.html#comparing-the-three-sentiment-dictionaries) that can be used to generate the classification labels and are based in single words (unigrams), such as:

- [NRC](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) from Saif Mohammad and Peter Turney that categorizes words in a binary fashion likely a bag-of-words into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
- [BING](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html) from Bing Liu and collaborators that categorizes words in a binary fashion into positive and negative categories, despite being simple, this was fitted the purpose to fetch the necessary data with the opinion lexicon;
- [AFINN](https://github.com/fnielsen/afinn/blob/master/afinn/data/AFINN-en-165.txt) that assigns words with a score that between -5 (most negative sentiment) and 5 (most positive sentiment).

In the context of this application both the NRC and BING are good choices to help data fetching and the BING was choose because its simplicity.

Regarding data labeling, both AFINN and VADER are good choices. The choice for VADER was defined because its higher complexity and the fact that is attuned for social media feed.




Below is shown an example using AFINN and since it computes the score based in the sum of each keyword weight, some undesirable situation can occur.

In [None]:
sentences = [r"I wish I can say I am happy", r"I've never been disappointed in my life"]
afinn = Afinn(language='en', emoticons=True)
for sentence in sentences: print("'"+sentence+"' score: "+ str(afinn.score(sentence)))

'I wish I can say I am happy' score: 4.0
'I've never been disappointed in my life' score: -2.0


The first sentence `I wish I can say I am happy` should be classified as `negative`, but since it has the name "happy", this contributes to a `positive` classification. The same way the second sentence `I've never been disappointed in my life` was misclassified as `positive`

Parsing the same sentences with VADER the result are shown below.

In [None]:
sentences = [r"I wish I can say I am happy", r"I've never been disappointed in my life"]
analyzer = SentimentIntensityAnalyzer()
for sentence in sentences: print("'"+sentence+"' score: "+ str(analyzer.polarity_scores(sentence)))

'I wish I can say I am happy' score: {'neg': 0.0, 'neu': 0.484, 'pos': 0.516, 'compound': 0.7506}
'I've never been disappointed in my life' score: {'neg': 0.0, 'neu': 0.701, 'pos': 0.299, 'compound': 0.3724}


Even the first sentence being `negative`, the lexicon did not show any percentage of this sentiment, but the other metric such as `neutral`, `positive` and `compound` presents a degree of freedom to work with such sentences. The second sentence pperformed better, with no `negative` rate, but not entirely `positive`, thus preferind a `neutral` approach.

Clearly those sentences are confusing and without a context they are useless. But the brief analysis have shown that VADER has more metrics, thus giving more degrees of freedom for labeling the dataset.

The `compound` metric will address the labels for the dataset that will be used in this work. Since it varies from -1 to +1, i.e. from extreme negative to extreme positive, the approach is to multiply the `compound` metric by 10 and round for the nearest integer, so allowing to have 21 different classes of sentiment classification with the following sample space $\text{label} = [-10,10] \; \forall l \in \mathbb{Z}$

Applying the VADER sentiment lexicon for the in usage dataset and adjusting the label range, the following is obtained:

In [None]:
analyzer = SentimentIntensityAnalyzer()
labels = []
for sentence in df['tweet'].tolist(): 
  labels.append(np.round(analyzer.polarity_scores(sentence)['compound']*10))
df['target'] = pd.Series(labels)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87043 entries, 0 to 87042
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   index   87043 non-null  int64  
 1   tweet   87043 non-null  object 
 2   target  87043 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.0+ MB


This step marks the end of the data labeling. This dataset is now ready for further cleaning, pre-processing and analysis. One more time, this milestone will be saved sothe next steps can progress from here and on.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Alternate data labeling

In this section the alternate dataset is loaded for labeling and then saved.

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/Lwao/awesome-ai/main/ufrn-ai/datasets/tweets_drepression_dataset_nonlabeled.csv', usecols=['tweet'])
analyzer = SentimentIntensityAnalyzer()
labels = []
for sentence in df['tweet'].tolist(): 
  labels.append(np.round(analyzer.polarity_scores(sentence)['compound']*10))
df['target'] = pd.Series(labels)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3178 entries, 0 to 3177
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tweet   3178 non-null   object 
 1   target  3178 non-null   float64
dtypes: float64(1), object(1)
memory usage: 49.8+ KB


In [None]:
df.to_csv('tweets_drepression_dataset.csv')
files.download('tweets_drepression_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Data cleaning

From the previous milestone, the dataset can be loaded.

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/Lwao/awesome-ai/main/ufrn-ai/datasets/tweets_drepression_dataset.csv', usecols=['tweet','target'])
pd.set_option('display.max_colwidth', None)
df.sample(10).head()

Unnamed: 0,tweet,target
1387,"Guess what time it is for me? That's right, fatality sickening depression time. Ã°ÂÂÂÃ¢ÂÂ Ã¯Â¸Â",-9.0
2954,"âYeah, we are arriving on 23rd so we can unpack, make ourselves at home, decorate everythingâ¦â ák said pretending to be oblivious to the way ámâs eyes flickered with gloominess.",-3.0
1184,That day after Christmas sadness is reaaal,-4.0
2602,it's one step closer to 1995 tetsuya's hair. omg the day my hair is that length is the day my depression will disappear,-7.0
1357,"the world loves letting me know that it does not work for me, but actually works as a well-paid professional against me #depression #anxiety #ugh helpppppp .Ã¯Â½Â¡Ã¯Â½Â¥Ã¯Â¾ÂÃ¯Â¾ÂÃ¯Â½Â¥(Ã¯Â½Â¡Ã¢ÂÂ¢ÃÂÃ¯Â¸Â¿Ã¢ÂÂ¢ÃÂÃ¯Â½Â¡)Ã¯Â½Â¥Ã¯Â¾ÂÃ¯Â¾ÂÃ¯Â½Â¥Ã¯Â½Â¡.",-9.0


Next the NLTK library will be used for some cleaning in the textual data, such as:

- Remove unwanted characters and links;
- Remove stopwords that the sole purpose is to connect nouns and others meaningful words;
- Stemming to reduce words to its radical, i.e. short format;
- Lemmatization to reduce inflection words to a word that represents its motto;


Some resources from NLTK must be downloaded before applying the data cleaning.

In [None]:
nltk.download('stopwords')
nltk.download('rslp')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

Below a test sentence extracted from the dataset is used to test if each cleaning process is indeed useful or otherwise spoil the data.

In [None]:
# https://minerandodados.com.br/analise-de-sentimentos-utilizando-dados-do-twitter/

stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.RSLPStemmer()
lemmatizer = WordNetLemmatizer()

test_sentence = "@DeWalle80 @kingbooh_ @BarcaNaija @onovoxvii Pls snare my questions https://t.co/QdipCF4MCY"
print('Original test sentence: ' + test_sentence)

def RemoveUnwantedChars(instance): # remove links, dots, comas, etc.
  instance = re.sub(r"http\S+", "", instance).lower().replace('.','').replace(';','').replace('-','').replace(':','').replace(')','')
  return (instance)
test_sentence = RemoveUnwantedChars(test_sentence)
print('Remove unwanted chars result: ' + test_sentence)

def RemoveStopWords(instance, stopwords):
  words = [i for i in instance.split() if not i in stopwords]
  return (" ".join(words))
test_sentence = RemoveStopWords(test_sentence, stopwords)
print('Remove stopwords result: ' + test_sentence)

def Stemming(instance, stemmer):
  words = []
  for w in instance.split():
      words.append(stemmer.stem(w))
  return (" ".join(words))
test_sentence = Stemming(test_sentence, stemmer)
print('Stemming result: ' + test_sentence)

def Lemmatization(instance, lemmatizer):
  words = []
  for w in instance.split():
    words.append(lemmatizer.lemmatize(w))
  return (" ".join(words))
test_sentence = Lemmatization(test_sentence, lemmatizer)
print('Lemmatization result: ' + test_sentence)

Original test sentence: @DeWalle80 @kingbooh_ @BarcaNaija @onovoxvii Pls snare my questions https://t.co/QdipCF4MCY
Remove unwanted chars result: @dewalle80 @kingbooh_ @barcanaija @onovoxvii pls snare my questions 
Remove stopwords result: @dewalle80 @kingbooh_ @barcanaija @onovoxvii pls snare questions
Stemming result: @dewalle80 @kingbooh_ @barcanaij @onovoxvi pl sn questiom
Lemmatization result: @dewalle80 @kingbooh_ @barcanaij @onovoxvi pl sn questiom


As a final step, all those modifiers are applied to the entirety of the dataset in the following order:

- Remove unwanted characters;
- Remove stopwords;
- Stemming;
- Lemmatization;

In [None]:
def BatchPreprocessing(instance, stopwords=None, stemmer=None, lemmateizer=None):
  instance = RemoveUnwantedChars(instance)
  if(stopwords!=None): instance = RemoveStopWords(instance, stopwords)
  if(stemmer!=None): instance = Stemming(instance, stemmer)
  if(lemmatizer!=None): instance = Lemmatization(instance, lemmatizer)
  return instance

stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.RSLPStemmer()
lemmatizer = WordNetLemmatizer()

preprocessed_tweets = [BatchPreprocessing(tweet, stopwords, stemmer, lemmatizer) for tweet in df['tweet']]
df_preprocessed = df.copy()
df_preprocessed['tweet'] = pd.Series(preprocessed_tweets)
df_preprocessed.head()

Unnamed: 0,tweet,target
0,@barrennessblack @animalsdonthat @twicedonald @ksorb @twittersupport feel depression mil away i'm sorry hop find lov desperately need,1.0
1,depression best iâv got,1.0
2,@cn0bl *sometimes* peopl hypersex depression â¦ thatâ everyon tbh,-3.0
3,"wish depression, i'm falling end",-6.0
4,*depression cured fr*,-6.0


In [None]:
df.to_csv('tweets_depression_dataset_preprocessed.csv')
files.download('tweets_depression_dataset_preprocessed.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Feature extraction

From the previous milestone, the dataset can be loaded.

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/Lwao/awesome-ai/main/ufrn-ai/datasets/tweets_depression_dataset_preprocessed.csv', usecols=['tweet','target'])
pd.set_option('display.max_colwidth', None)
df.sample(10).head()

Unnamed: 0,tweet,target
842,@shehackspurple Oh yes a No-Op or NOP. I have seen a few used in malware over the years. IPS usually detects them after a few iterations but I have also seen different NOPs strung together with obfuscation to avoid detection. It seems like good fun for programming,9.0
915,@TylerFeldmanTV @Lin_Manuel just.....that man reeeeeally delights in threading the needle between 'delightful evocative art' and 'negligent infliction of emotional distress.',6.0
2328,"@lorenxco14 @btrflyclips your acting like their whole entire family was murdered and they are experiencing horrible depression and self crippling anxiety from it. they will be fine, they'll live. they'll get over it.",-9.0
3083,"good night, good night parting is such sweet sorrow, that I shall say good night till it be morrow &lt;3",8.0
1104,Where her medical team took any salvageable organs and declared her dead. She was 29. She had a 4 year old daughter. She battled addictions and homelessness for years.,-8.0


This work will use multiple [text vectorization/feature extraction](https://towardsdatascience.com/getting-started-with-text-vectorization-2f2efbec6685) methods, they are:

- Binary Term Frequency (One-Hot-Encoder);
- Bag of Words (BoW) Term Frequency;
- (L1) Normalized Term Frequency;
- (L2) Normalized TF-IDF;
- Word2Vec.

The first four can be implemented using TF-IDF from `Sklearn` library with some adjusts and the last will be used with the help of the `Gensim` library.

1. The One-Hot-Encoder is achieved seting the `TfidfVectorizer` binary parameter to true and norm to false;
2. To obtain the Bag-of-Word the binary parameter in changed to false so it can show the term frequency and norm to none;
3. To obtain the TF the binary parameter still false, but the norm changes to L1;
4. To obtain the TF-IDF the binary parameter still fasle, but the norm is changed to L2 and the parameters of using IDF and smooth IDF are change to true.

In [None]:
def get_text_vectorizer(model='ohe'):
  if(model=='ohe'):
    tv = TfidfVectorizer(
        binary=True, norm=False, use_idf=False, smooth_idf=False,
        lowercase=True, stop_words='english', min_df=1, max_df=1.0,
        ngram_range=(1,1), max_features=1000
    )
  elif(model=='bow'):
    tv = TfidfVectorizer(
        binary=False, norm=None, use_idf=False, smooth_idf=False,
        lowercase=True, stop_words='english', min_df=1, max_df=1.0,
        ngram_range=(1,1), max_features=1000
    )
  elif(model=='tf'):
    tv = TfidfVectorizer(
        binary=False, norm='l1', use_idf=False, smooth_idf=False,
        lowercase=True, stop_words='english', min_df=1, max_df=1.0,
        ngram_range=(1,1), max_features=1000
    )
  elif(model=='tf-idf'):
    tv = TfidfVectorizer(
        binary=False, norm='l2', use_idf=True, smooth_idf=True,
        lowercase=True, stop_words='english', min_df=1, max_df=1.0,
        ngram_range=(1,1), max_features=1000
    )
  return tv

Considering the `Word2Vec` implementation, it was used with vectors of size 1000 for each word in the dataset. Following the trainined `Word2Vec` model on top of the dataset vocabulary was iterated with every word en each sentence of the dataset. To get the vector representation of each sentence, it was chosen the strategy of getting the mean value of all vector of each word in the sentence.

In [138]:
# Create Word2Vec (https://gist.github.com/giuseppebonaccorso/061fca8d0dfc6873619efd8f364bfe89, https://machinelearningmastery.com/develop-word-embeddings-python-gensim/)
tokenized_data = [str_.split() for str_ in df['tweet'].tolist()] # tokenize data
model = Word2Vec(sentences=tokenized_data, size=1000, min_count=1, workers=multiprocessing.cpu_count()) # train word2vec model
print('Word2Vec model summary:')
print(model)

# apply mean value of vector in each word of a given sentence for all tweets
m = len(tokenized_data)
matrix_ = np.zeros((m,1000))
for i in range(m):
  n = len(tokenized_data[i][:])
  for j in range(n):
    matrix_[i,:] += (model.wv[tokenized_data[i][j]]/n)

Word2Vec model summary:
Word2Vec(vocab=20114, size=1000, alpha=0.025)


Using a little toy set of data all text vectorizers can be tested before applying to the entire dataset. `Word2Vec` was ot tested since its content depends on the data.

In [None]:
test_sentences = ['dogs bark','cats meow','dogs bark, cats meow','dogs bark, cats dont bark','cats meow, dogs dont meow']
print('Sample of One-Hot-Encoder:')
tv = get_text_vectorizer(model='ohe')
print(pd.DataFrame(tv.fit_transform(test_sentences).toarray(), columns=tv.get_feature_names_out()))
print('Sample of Bag-of-Words:')
tv = get_text_vectorizer(model='bow')
print(pd.DataFrame(tv.fit_transform(test_sentences).toarray(), columns=tv.get_feature_names_out()))
print('Sample of TF:')
tv = get_text_vectorizer(model='tf')
print(pd.DataFrame(tv.fit_transform(test_sentences).toarray(), columns=tv.get_feature_names_out()))
print('Sample of TF-IDF:')
tv = get_text_vectorizer(model='tf-idf')
print(pd.DataFrame(tv.fit_transform(test_sentences).toarray(), columns=tv.get_feature_names_out()))

Sample of One-Hot-Encoder:
   bark  cats  dogs  dont  meow
0   1.0   0.0   1.0   0.0   0.0
1   0.0   1.0   0.0   0.0   1.0
2   1.0   1.0   1.0   0.0   1.0
3   1.0   1.0   1.0   1.0   0.0
4   0.0   1.0   1.0   1.0   1.0
Sample of Bag-of-Words:
   bark  cats  dogs  dont  meow
0   1.0   0.0   1.0   0.0   0.0
1   0.0   1.0   0.0   0.0   1.0
2   1.0   1.0   1.0   0.0   1.0
3   2.0   1.0   1.0   1.0   0.0
4   0.0   1.0   1.0   1.0   2.0
Sample of TF:
   bark  cats  dogs  dont  meow
0  0.50  0.00  0.50   0.0  0.00
1  0.00  0.50  0.00   0.0  0.50
2  0.25  0.25  0.25   0.0  0.25
3  0.40  0.20  0.20   0.2  0.00
4  0.00  0.20  0.20   0.2  0.40
Sample of TF-IDF:
       bark      cats      dogs     dont      meow
0  0.765241  0.000000  0.643744  0.00000  0.000000
1  0.000000  0.643744  0.000000  0.00000  0.765241
2  0.541107  0.455196  0.455196  0.00000  0.541107
3  0.763236  0.321029  0.321029  0.45973  0.000000
4  0.000000  0.321029  0.321029  0.45973  0.763236


It is easy to spot the difference between each vectorizer.

Each vectorizer will be applied to the dataset and the results will be stored in a dictionary of dataframes for further training.

In [144]:
df_dict = {'ohe':pd.DataFrame(), 'bow':pd.DataFrame(), 'tf':pd.DataFrame(), 'tf-idf':pd.DataFrame(), 'word2vec':pd.DataFrame()}
keys = list(df_dict.keys())
for key in keys[:-1]:
  tv = get_text_vectorizer(model=key)
  df_dict[key] = pd.DataFrame(tv.fit_transform(df['tweet']).toarray())
df_dict['word2vec'] = pd.DataFrame(matrix_)

# Machine learning training

This is the final section once all machine learning models will be applied and the results analyzed.

The tests will use 4 different ML models:

- KNN;
- Multinomial Naive Bayes ([this subset of naive Bayes is more adequate for textual data](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB));
- Decision tree;
- Random forest;

Each model will run with a grid search in a default set of hyperparameters. The features will be scaled with a min-max scaler. Each model will run for each text vectorizer method and the different vectorizers will be compared inside the same ML model.

In [None]:
def run(data_dict, y, keys, train_per=0.9, clf='knn'):
  # Build search space for the model 
  steps = []
  steps.append(('scaler', MinMaxScaler()))
  
  if(clf=='knn'):
    steps.append(('clf', KNeighborsClassifier(algorithm='auto', n_jobs=-1, leaf_size=100)))
    param_grid = {
      'clf__n_neighbors': [1, 3, 5, 7],
      'clf__weights': ['uniform', 'distance'], 
      'clf__metric': ['euclidean', 'minkowski']
    }
  elif(clf=='naive_bayes'):
    steps.append(('clf', MultinomialNB()))
    param_grid = {
      'clf__alpha': [1e-3, 1e-2, 1e-1, 1e0]
    }
  elif(clf=='decision_tree'):
    steps.append(('clf', DecisionTreeClassifier()))
    param_grid = {
      'clf__criterion': ['gini', 'entropy'],
      'clf__max_depth': [None, 5, 10, 15],
    }
  elif(clf=='random_forest'): 
    steps.append(('clf', RandomForestClassifier()))  
    param_grid = {
      'clf__n_estimators': [10, 50, 100, 200, 300],
      'clf__criterion': ['gini', 'entropy'],
      'clf__max_depth': [None, 5, 10, 15]
    }

  y = y.ravel()
  for key in keys:
    X = data_dict[key]
    # Holdout 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_per, random_state=42, shuffle=True)

    # Grid search
    start = time.process_time()
    pipe = Pipeline(steps)
    model = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=False)
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Grid search duration: ' + str(time.process_time()-start) + ' seconds')

    # Metrics
    print('Model metrics:')
    print('='*30)
    print('Vectorizer: ' + key)
    print('Best hyperparameters: ' + str(model.best_params_))
    print('Best score: ' + str(model.score(X_test, y_test)))
    print('\n')

Considering all vectorizer options:

In [147]:
print(keys)

['ohe', 'bow', 'tf', 'tf-idf', 'word2vec']


Training for KNN:

In [148]:
run(df_dict, df['target'], keys, train_per=0.9, clf='knn')

Grid search duration: 1.2705906329999834 seconds
Model metrics:
Vectorizer: ohe
Best hyperparameters: {'clf__metric': 'euclidean', 'clf__n_neighbors': 3, 'clf__weights': 'distance'}
Best score: 0.27672955974842767


Grid search duration: 1.0486953299999868 seconds
Model metrics:
Vectorizer: bow
Best hyperparameters: {'clf__metric': 'euclidean', 'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Best score: 0.2578616352201258


Grid search duration: 1.1731510629999775 seconds
Model metrics:
Vectorizer: tf
Best hyperparameters: {'clf__metric': 'euclidean', 'clf__n_neighbors': 1, 'clf__weights': 'uniform'}
Best score: 0.18867924528301888


Grid search duration: 1.0328542510000034 seconds
Model metrics:
Vectorizer: tf-idf
Best hyperparameters: {'clf__metric': 'euclidean', 'clf__n_neighbors': 3, 'clf__weights': 'distance'}
Best score: 0.2169811320754717


Grid search duration: 1.107474959000001 seconds
Model metrics:
Vectorizer: word2vec
Best hyperparameters: {'clf__metric': 'euclidean', 'c

Training for naive Bayes:

In [149]:
run(df_dict, df['target'], keys, train_per=0.9, clf='naive_bayes')

Grid search duration: 0.29030486199997085 seconds
Model metrics:
Vectorizer: ohe
Best hyperparameters: {'clf__alpha': 1.0}
Best score: 0.24842767295597484


Grid search duration: 0.3510722580000447 seconds
Model metrics:
Vectorizer: bow
Best hyperparameters: {'clf__alpha': 1.0}
Best score: 0.24213836477987422


Grid search duration: 0.33870970500004205 seconds
Model metrics:
Vectorizer: tf
Best hyperparameters: {'clf__alpha': 0.1}
Best score: 0.2389937106918239


Grid search duration: 0.3227986189999683 seconds
Model metrics:
Vectorizer: tf-idf
Best hyperparameters: {'clf__alpha': 1.0}
Best score: 0.22327044025157233


Grid search duration: 0.3440397370000028 seconds
Model metrics:
Vectorizer: word2vec
Best hyperparameters: {'clf__alpha': 0.001}
Best score: 0.14779874213836477




Training for decision tree:

In [150]:
run(df_dict, df['target'], keys, train_per=0.9, clf='decision_tree')

Grid search duration: 1.0542255029999978 seconds
Model metrics:
Vectorizer: ohe
Best hyperparameters: {'clf__criterion': 'gini', 'clf__max_depth': None}
Best score: 0.27358490566037735


Grid search duration: 0.6858728889999384 seconds
Model metrics:
Vectorizer: bow
Best hyperparameters: {'clf__criterion': 'entropy', 'clf__max_depth': 15}
Best score: 0.25157232704402516


Grid search duration: 0.7721577710000247 seconds
Model metrics:
Vectorizer: tf
Best hyperparameters: {'clf__criterion': 'entropy', 'clf__max_depth': 15}
Best score: 0.22641509433962265


Grid search duration: 0.6949863600000299 seconds
Model metrics:
Vectorizer: tf-idf
Best hyperparameters: {'clf__criterion': 'gini', 'clf__max_depth': 15}
Best score: 0.23270440251572327


Grid search duration: 4.917319666000026 seconds
Model metrics:
Vectorizer: word2vec
Best hyperparameters: {'clf__criterion': 'gini', 'clf__max_depth': 5}
Best score: 0.16037735849056603




Training for random forest:

In [151]:
run(df_dict, df['target'], keys, train_per=0.9, clf='random_forest')

Grid search duration: 4.5750468610000325 seconds
Model metrics:
Vectorizer: ohe
Best hyperparameters: {'clf__criterion': 'gini', 'clf__max_depth': None, 'clf__n_estimators': 50}
Best score: 0.27672955974842767


Grid search duration: 12.26889772000004 seconds
Model metrics:
Vectorizer: bow
Best hyperparameters: {'clf__criterion': 'gini', 'clf__max_depth': None, 'clf__n_estimators': 300}
Best score: 0.2893081761006289


Grid search duration: 12.64024860699999 seconds
Model metrics:
Vectorizer: tf
Best hyperparameters: {'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__n_estimators': 300}
Best score: 0.3113207547169811


Grid search duration: 12.95100331599997 seconds
Model metrics:
Vectorizer: tf-idf
Best hyperparameters: {'clf__criterion': 'entropy', 'clf__max_depth': None, 'clf__n_estimators': 300}
Best score: 0.29559748427672955


Grid search duration: 46.36864012900003 seconds
Model metrics:
Vectorizer: word2vec
Best hyperparameters: {'clf__criterion': 'gini', 'clf__max_dep

Observing the score results for each vectorizer in each ML model, it is clearly to see that there is no right order for the best stituation. Despite presenting low score results overall, the dataset has few peculiarities. Perhaps the data scraping should be better, or better data cleaning, etc.

Looking for the actual results, the Word2Vec vectorizer had the lowest results for this dataset. Perhaps using the mean value of vector for each word in sentence was not good enough. In other way, the One-Hot-Encoder was the more adequate for the dataset of choice in most of the cases. For the random forest model the best vectorizer was the TF.

Further imporvements may come with some research and experimentation, but the present work displayed a design flow methodology from gathering, cleaning and labeling the data, also performing feature extraction with multiples vectorizer and using multiples machine learning models to train.