## Data and Library Imports

In [41]:
import pandas as pd

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import spacy
from textblob import TextBlob
from textblob import Word

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import spacy.cli 
spacy.cli.download("en_core_web_md")

In [2]:
text_df = pd.read_csv('../Data/raw_data/sentiment_tweets3.csv')

## Data Exploration

In [3]:
text_df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [4]:
text_df = text_df.rename(columns={'message to examine': 'message', 'label (depression result)': 'label_d'})

display(text_df.head())
display(text_df.shape)

Unnamed: 0,Index,message,label_d
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


(10314, 3)

In [5]:
display(text_df['label_d'].value_counts())

0    8000
1    2314
Name: label_d, dtype: int64

**NOTE** Does the person have depression? 0 stands for NO and 1 stands for YES

## Text Cleaning

**Process**: replacing new lines with blank spaces -> remove consecutive blanks -> Converting emojis and emoticons -> remove url -> remove HTML tags -> remove symbols(punctuation and numbers) -> remove stop words -> remove common words -> spelling correction -> lower case and leading/trailing whitespace -> lemmentization -> tokenization -> *BOW or TF-IDF* -> *Sentiment Analysis*

In [6]:
# Libraries for Text Cleaning 
import re
import nltk

from emot.emo_unicode import UNICODE_EMO, EMOTICONS
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from collections import Counter

from textblob import TextBlob

In [7]:
temp_df = text_df.copy()
temp_df.head()

Unnamed: 0,Index,message,label_d
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [31]:
# Functions Defined 
# 2
mention_re = re.compile("@\w+")
def remove_mention(text):
    return mention_re.sub(repl=" ", string=text)

# 3
# Converting emojis to words
def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = text.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))
        return text
# Converting emoticons to words    
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
        return text

# 4
def remove_urls(text):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ', text)

# 5
def remove_html(text):
    return BeautifulSoup(text, "html.parser").text

# 6
# removing symbols
symb_re = re.compile(r"""[!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~،؟…«“\":\"…”]""")
def remove_symbols(text: str) -> str:
    return symb_re.sub(repl="", string=text)
# removing numbers
numbers_re = re.compile("\d+")
def remove_numbers(text):
    # TODO: Implement remove numbers
    return numbers_re.sub(repl="", string=text)

# 7
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

# 8 
cnt = Counter()
for text in temp_df["message"].values:
    for word in text.split():
        cnt[word] += 1
# Removing the frequent words
freq = set([w for (w, wc) in cnt.most_common(150)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not 
in freq])

# 9 
rx = re.compile(r'([^\W\d_])\1{2,}')
def remove_repeating_characters(text: str):
    return re.sub(r'[^\W\d_]+', lambda x: Word(rx.sub(r'\1\1', x.group())).correct() \
                  if rx.search(x.group()) else x.group(), text)
# 10
multiple_space_re = re.compile("\s{2,}")
def remove_multiple_whitespace(text):
    return multiple_space_re.sub(repl=" ", string=text)

In [32]:
def text_cleaning(txt):
    txt = txt.replace('\n', ' ')
    txt = remove_mention(txt)
    txt = convert_emojis(txt)
    txt = convert_emoticons(txt)
    txt = remove_urls(txt)
    txt = remove_html(txt)
    txt = remove_symbols(txt)
    txt = remove_numbers(txt)   # might be removed as it can result in loss of information for the textual data
    txt = remove_stopwords(txt)
    txt = remove_freqwords(txt)
    txt = str(TextBlob(txt).correct())
    txt = txt.lower().strip()
    txt = remove_repeating_characters(txt)
    txt = remove_multiple_whitespace(txt)
    txt = txt.replace('[^\w]',' ')
    return txt

In [33]:
temp_df['message_clean'] = temp_df['message'].apply(lambda x: text_cleaning(x))

In [34]:
temp_df.head()

Unnamed: 0,Index,message,label_d,message_clean,message_clean_norm,message_vect,message_clean_processed
0,106,just had a real good moment. i missssssssss hi...,0,real moment miss,real moment miss,"[real, moment, miss]",real moment miss
1,217,is reading manga http://plurk.com/p/mzp1e,0,reading mania,reading mania,"[reading, mania]",reading mania
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0,,comeagainjen,[comeagainjen],comeagainjen
3,288,@lapcat Need to send 'em to my accountant tomo...,0,need send em accountant tomorrow oddly want re...,lascar need send em accountant tomorrow oddly ...,"[lascar, need, send, em, accountant, tomorrow,...",lascar need send em accountant tomorrow oddly ...
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0,add of of myspace myspacecomlookthunder,add of of myspace myspacecomlookthunder,"[add, of, of, myspace, myspacecomlookthunder]",add of of myspace myspacecomlookthunder


## Further Text Preprocessing (Word Normalization, Vectorization, Feature Extraction)

In [35]:
# Word Normalization with Lemmatizer
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV} # Pos tag, used Noun, Verb, Adjective and Adverb
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

temp_df['message_clean_norm'] = temp_df["message_clean"].apply(lemmatize_words)

In [36]:
# Vectorization: Word Vectorizer
temp_df['message_vect'] = temp_df['message_clean_norm'].apply(lambda x: nltk.word_tokenize(x))
temp_df.head()

Unnamed: 0,Index,message,label_d,message_clean,message_clean_norm,message_vect,message_clean_processed
0,106,just had a real good moment. i missssssssss hi...,0,real moment miss,real moment miss,"[real, moment, miss]",real moment miss
1,217,is reading manga http://plurk.com/p/mzp1e,0,reading mania,reading mania,"[reading, mania]",reading mania
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0,,,[],comeagainjen
3,288,@lapcat Need to send 'em to my accountant tomo...,0,need send em accountant tomorrow oddly want re...,need send em accountant tomorrow oddly want re...,"[need, send, em, accountant, tomorrow, oddly, ...",lascar need send em accountant tomorrow oddly ...
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0,add of of myspace myspacecomlookthunder,add of of myspace myspacecomlookthunder,"[add, of, of, myspace, myspacecomlookthunder]",add of of myspace myspacecomlookthunder


In [37]:
temp_df['message_clean_processed'] = temp_df['message_vect'].apply(lambda x: ' '.join(x))
temp_df.head()

Unnamed: 0,Index,message,label_d,message_clean,message_clean_norm,message_vect,message_clean_processed
0,106,just had a real good moment. i missssssssss hi...,0,real moment miss,real moment miss,"[real, moment, miss]",real moment miss
1,217,is reading manga http://plurk.com/p/mzp1e,0,reading mania,reading mania,"[reading, mania]",reading mania
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0,,,[],
3,288,@lapcat Need to send 'em to my accountant tomo...,0,need send em accountant tomorrow oddly want re...,need send em accountant tomorrow oddly want re...,"[need, send, em, accountant, tomorrow, oddly, ...",need send em accountant tomorrow oddly want re...
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0,add of of myspace myspacecomlookthunder,add of of myspace myspacecomlookthunder,"[add, of, of, myspace, myspacecomlookthunder]",add of of myspace myspacecomlookthunder


In [38]:
# Feature Extraction 1 (TF-IDF)
tfidf_vectorizer = TfidfVectorizer()
values = tfidf_vectorizer.fit_transform(temp_df['message_clean_processed'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf_df = pd.DataFrame(values.toarray(), columns = tfidf_feature_names)

# Feature Extraction 2 (BOW)
count_vectorizer = CountVectorizer()
bag_of_words = count_vectorizer.fit_transform(temp_df['message_clean_processed'])
BOW_feature_names = count_vectorizer.get_feature_names()
BOW_df = pd.DataFrame(bag_of_words.toarray(), columns = BOW_feature_names)

tfidf_df['label_d'] = text_df['label_d']
BOW_df['label_d'] = text_df['label_d']



In [39]:
tfidf_df.to_csv('../Data/processed/tfidf_tweets.csv', index=False)
BOW_df.to_csv('../Data/processed/BOW_tweets.csv', index=False)

**references**
- https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d