## Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
import inflect
import contractions
from bs4 import BeautifulSoup
import re, string, unicodedata
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from collections import defaultdict
import matplotlib.pyplot as plt

## Reading the data

In [2]:
# Load the raw tweet/emotion dataset, then show how many tweets carry each label.
df = pd.read_csv('tweet_emotions.csv', sep=',')
df['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

### Reducing the number of classes from 13 to 2

In [3]:
# Original labels that are folded into the 'happiness' class; every other
# label (including missing values) falls into the 'sadness' class.
POSITIVE_SENTIMENTS = frozenset(
    ['happiness', 'relief', 'love', 'surprise', 'fun', 'enthusiasm', 'empty']
)


def class_change(df):
    """Collapse the labels in ``df.sentiment`` into two classes.

    Parameters
    ----------
    df : object exposing a ``sentiment`` attribute that can be iterated
        (the notebook passes the tweets DataFrame).

    Returns
    -------
    list of str
        One entry per value of ``df.sentiment``, in the same order:
        'happiness' when the label is in ``POSITIVE_SENTIMENTS``, otherwise
        'sadness'. ``df`` itself is not modified and no module-level state is
        touched, so calling the function repeatedly is safe.
    """
    return ['happiness' if label in POSITIVE_SENTIMENTS else 'sadness'
            for label in df.sentiment]


# `cat` is kept as a module-level name holding the two-class labels.
cat = class_change(df)
df['sentiment'] = cat
print('After Reducing Classes to 2: \n', df.sentiment.value_counts())

After Reducing Classes to 3: 
 sadness      23874
happiness    16126
Name: sentiment, dtype: int64


# ----------- User defined function for preprocessing "content" column --------------

In [4]:
def text_preprocessing(df, text_col, remove_stopwords=True):
    """Clean the free-text column ``text_col`` of ``df`` and return ``df``.

    Every value of the column goes through these steps, in order:

    1. strip HTML markup and expand contractions (e.g. didn't -> did not);
    2. split into word tokens with NLTK;
    3. drop non-ASCII characters, lowercase, drop punctuation;
    4. replace all-digit tokens with their spelled-out form;
    5. optionally drop English stop words;
    6. lemmatize each token as a verb.

    The remaining tokens are joined back together with single spaces.

    Parameters
    ----------
    df : DataFrame-like
        Container holding the text. NOTE: ``df[text_col]`` is overwritten in
        place; pass ``df.copy()`` if the raw text must be preserved.
    text_col : str
        Name of the column to clean.
    remove_stopwords : bool, default True
        When true, tokens found in NLTK's English stop-word list are dropped;
        when false, they are kept.

    Returns
    -------
    The same ``df`` object that was passed in, with ``text_col`` cleaned.
    """
    # Build the helpers once instead of once per tweet / per token.
    number_engine = inflect.engine()
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests. The corpus is only loaded
    # when stop-word removal was actually requested.
    stop_words = set(stopwords.words('english')) if remove_stopwords else frozenset()

    # ------------------------------ Denoising (works on raw text) ------------------------------
    def denoise_text(text):
        """Strip HTML tags (e.g. <html>, <p>) and expand contractions."""
        text = BeautifulSoup(text, "html.parser").get_text()
        return contractions.fix(text)

    # ------------------------------ Normalization (works on tokens) ----------------------------
    def remove_non_ascii(words):
        """Remove non-ASCII characters from every token."""
        return [
            unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            for word in words
        ]

    def to_lowercase(words):
        """Convert all characters to lowercase."""
        return [word.lower() for word in words]

    def remove_punctuation(words):
        """Remove punctuation; tokens that end up empty are dropped."""
        stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
        return [word for word in stripped if word != '']

    def replace_numbers(words):
        """Replace all integer occurrences with their textual representation."""
        return [
            number_engine.number_to_words(word) if word.isdigit() else word
            for word in words
        ]

    # Deliberately NOT named `remove_stopwords`: a nested function with that
    # name would rebind the boolean parameter to an always-truthy function
    # object, making the flag impossible to turn off.
    def drop_stopwords(words):
        """Remove English stop words."""
        return [word for word in words if word not in stop_words]

    def lemmatize_verbs(words):
        """Lemmatize every token as a verb."""
        return [lemmatizer.lemmatize(word, pos='v') for word in words]

    def normalize_text(words):
        """Run all token-level normalization steps in order."""
        words = remove_non_ascii(words)
        words = to_lowercase(words)
        words = remove_punctuation(words)
        words = replace_numbers(words)
        if remove_stopwords:
            words = drop_stopwords(words)
        return lemmatize_verbs(words)

    # ------------------------------ Full pipeline for one document -----------------------------
    def text_prepare(text):
        """Denoise, tokenize and normalize one document; return the cleaned string."""
        tokens = nltk.word_tokenize(denoise_text(text))
        return ' '.join(normalize_text(tokens))

    # Overwrites the column in place (see the NOTE in the docstring).
    df[text_col] = [text_prepare(text) for text in df[text_col]]
    return df

### Comparison of data before and after preprocessing

In [5]:
# Show the first tweets before and after cleaning.
print("Before Text Preprocessing")
display(df.head()[['content']])
# text_preprocessing overwrites the column of the frame it is given, so hand it
# a copy: `df` keeps the raw tweets and re-running this cell does not clean
# already-cleaned text. Stop-word removal is requested explicitly because it is
# one of the preprocessing steps this notebook reports.
processed_df = text_preprocessing(df.copy(), 'content', remove_stopwords=True)
print("After Text Preprocessing")
display(processed_df.head()[['content']])

Before Text Preprocessing


Unnamed: 0,content
0,@tiffanylue i know i was listenin to bad habi...
1,Layin n bed with a headache ughhhh...waitin o...
2,Funeral ceremony...gloomy friday...
3,wants to hang out with friends SOON!
4,@dannycastillo We want to trade with someone w...


After Text Preprocessing


Unnamed: 0,content
0,tiffanylue know listenin bad habit earlier sta...
1,layin n bed headache ughhhh waitin call
2,funeral ceremony gloomy friday
3,want hang friends soon
4,dannycastillo want trade someone houston ticke...


#### Dimensionality reduction - Removing 'tweet_id' column and dropping Missing values

In [6]:
# Drop the identifier column from the cleaned frame, report the shapes of both
# frames, and save the reduced frame as an Excel workbook.
new = processed_df.drop('tweet_id', axis=1)
print('Shape of data before Preprocessing : ', df.shape)
print('Shape of data after Preprocessing and dimensionality reduction : ', new.shape)
new.to_excel('Emotion-Detection-preprocessing.xlsx')

Shape of data before Preprocessing :  (40000, 3)
Shape of data after Preprocessing and dimensionality reduction :  (40000, 2)


## Information :
    1. Reduced from 13 classes to 2 classes as follows:
        'happiness' : ['happiness', 'relief', 'love', 'surprise', 'fun', 'enthusiasm','empty']
        'sadness' : ['sadness','worry','neutral','hate','anger','boredom']
         
    2. Data Preprocessing:
        > Denoised the text (stripped HTML tags, expanded contractions)
        > Removed non-ASCII characters
        > Converted all letters to lowercase
        > Removed punctuation
        > Replaced numbers with their text representation
        > Removed stopwords
        > Normalized the text (lemmatized verbs)
        
    3. Dimensionality Reduction:
        Dropped "tweet_id" column from preprocessed data, as it is unique and doesn't influence the "sentiment" column.