In [88]:
# Imports and global resources for the preprocessing notebook.
# The flair package must be installed for the embedding imports below.
import pandas as pd
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, FlairEmbeddings
from flair.data import Sentence
import numpy as np
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
import warnings
import emoji

# NOTE(review): Sentence and WordEmbeddings are already imported above;
# these two lines are harmless duplicates.
from flair.data import Sentence
from flair.embeddings import WordEmbeddings

import spacy 
# spaCy pipeline used by tokenize() to lemmatise individual tokens.
# NOTE(review): presumably requires the 'en_core_web_sm' model to be
# installed in the environment beforehand -- confirm.
eng_lemmatizer = spacy.load('en_core_web_sm')

import re # regular expressions, used by clean_text() and tokenize()

# To save data in a pkl file
import pickle as pkl


import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus read by tokenize(); the captured output
# shows this is a no-op when the package is already up to date.
nltk.download('stopwords')
# NOTE(review): `re` is imported a second time here (duplicate of the line above).
import re, string

# Shared flair WordEmbeddings instance used by makefasttextpkl() to embed tokens.
# NOTE(review): 'en' presumably selects flair's English fastText vectors -- confirm.
fasttext = WordEmbeddings('en')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kristi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h3>Preprocessing</h3>

In [89]:
# Load the raw Twitter sentiment dataset from the local data directory.
# NOTE(review): `data_test` is not referenced by any other cell visible in
# this notebook section -- confirm whether it is still needed.
data_test = pd.read_csv("./data/twitter_sentiment_data.csv")

def importpreprocessing(data):
    """Normalise the column layout of a raw tweet DataFrame.

    Removes the ``tweetid`` column (when present) and renames
    ``sentiment`` -> ``label`` and ``message`` -> ``text``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned so callers can reassign it.
    """
    # DataFrame.drop returns a new frame unless inplace=True. The result used
    # to be thrown away, so "tweetid" was never actually removed.
    # errors="ignore" lets the function run on frames without that column.
    data.drop(columns="tweetid", inplace=True, errors="ignore")
    data.rename(columns={'sentiment': 'label', 'message': 'text'}, inplace=True)

    return data

<h4>Clean-up Text</h4>

- Delete "RT"
- Delete HTTPS/HTTP Links
- Delete Double Whitespaces
- Delete @twitteruser

In [98]:
# Patterns are compiled once, at definition time, instead of on every call.
_RE_URL = re.compile(r'http\S+', re.IGNORECASE)  # also covers https
_RE_TWITTERUSERS = re.compile(r'@\S+', re.IGNORECASE)
_RE_TAGS = re.compile(r"<[^>]+>")
_RE_NON_LETTERS = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
# Word boundaries are essential: without them "rt" is also deleted from the
# middle of ordinary words ("start" -> "sta", "report" -> "repo").
_RE_RT = re.compile(r'\bRT\b', re.IGNORECASE)
_RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)
_RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)


def clean_text(text):
    """Strip Twitter-specific noise from a single tweet.

    In order, the function replaces each of the following with a space:
    links (anything starting with ``http``), ``@user`` mentions, HTML-like
    tags, every character that is not a letter or a space, the stand-alone
    retweet marker ``RT``, and single-letter words. Finally, runs of
    whitespace are collapsed into one space.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. It is not stripped, so a single leading and/or
        trailing space may remain.
    """
    # Links and mentions must be removed BEFORE punctuation is stripped.
    # Once "://" and "." are replaced by spaces, the URL falls apart into
    # ordinary-looking words that can no longer be recognised as a link.
    text = _RE_URL.sub(" ", text)
    text = _RE_TWITTERUSERS.sub(" ", text)
    text = _RE_TAGS.sub(" ", text)
    text = _RE_NON_LETTERS.sub(" ", text)
    text = _RE_RT.sub(" ", text)
    text = _RE_SINGLECHAR.sub(" ", text)
    text = _RE_WSPACE.sub(" ", text)

    return text

# Strip emojis from the text (each emoji is replaced with an empty string)
def remove_emoji(text):
    """Return ``text`` with the emojis recognised by the ``emoji`` package removed."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

    

Tokenize and Lemmatize Text

In [99]:
def lemma(data):
    """Add a ``shortData`` column with the tokenised form of ``cleanData``.

    Every entry of ``data['cleanData']`` is passed through ``tokenize`` and
    the resulting list is stored in the new ``shortData`` column. The frame
    is modified in place and also returned.
    """
    cleaned_column = data['cleanData']
    # tokenize() returns the stopword-free, lemmatised tokens of one text
    data["shortData"] = cleaned_column.apply(tokenize)
    return data

In [100]:
# Compiled once: words of at least two word-characters.
_TOKEN_PATTERN = re.compile(r"(?u)\b\w\w+\b")
# Filled lazily on the first tokenize() call that needs it, so the NLTK
# corpus is read once instead of once per text.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopword set, loading it only on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def tokenize(text):
    """Split ``text`` into tokens, drop English stopwords and lemmatise the rest.

    Parameters
    ----------
    text : str
        Text to tokenise. Any non-string value (for example a missing value
        that was passed through unchanged) yields an empty list.

    Returns
    -------
    list of str
        Lemmas of all non-stopword tokens, in order of appearance.
    """
    # Guard against non-string cells; re.findall would raise TypeError on them.
    if not isinstance(text, str):
        return []

    tokens = _TOKEN_PATTERN.findall(text)
    if not tokens:
        return []

    my_stopwords = _english_stopwords()
    lemmas = []
    for item in tokens:
        # The NLTK list is lowercase, so compare case-insensitively;
        # otherwise capitalised stopwords ("The", "This") slip through.
        if item.lower() in my_stopwords:
            continue
        # Each token is lemmatised on its own by the spaCy pipeline.
        doc = eng_lemmatizer(item)
        lemmas.extend(word.lemma_ for word in doc)
    return lemmas

Rename columns and convert label -1 to 3 so the labels can be passed to the label encoder

In [101]:
def minor_changes(data,name):
    """Finalise the preprocessed frame and write it to disk as CSV.

    The ``shortData`` column is renamed to ``tokens``, label ``-1`` is
    recoded as ``3``, and the frame is saved as ``preprocessed_<name>.csv``
    in the current working directory. The frame is modified in place and
    also returned.
    """
    column_mapping = {'shortData': 'tokens'}
    data.rename(columns=column_mapping, inplace=True)

    # recode the label -1 as 3
    recoded_labels = data['label'].replace(-1, 3)
    data['label'] = recoded_labels

    output_path = f'preprocessed_{name}.csv'
    data.to_csv(output_path)

    return data

Methods for creating fasttext pkl

In [102]:
def get_list(doc):
    """Turn the string form of a token list back into a real list.

    A token list that was written to CSV comes back as text such as
    ``"['climate', 'change']"``. This function parses that text into
    ``['climate', 'change']``. Values that are not strings (e.g. an actual
    list) are returned unchanged.

    Parameters
    ----------
    doc : str or any
        String representation of a list, or an already-parsed value.

    Returns
    -------
    list of str, or ``doc`` itself when it is not a string.
    """
    if not isinstance(doc, str):
        return doc

    import ast  # local import keeps this cell self-contained

    # Preferred path: the text is a valid Python literal.
    try:
        parsed = ast.literal_eval(doc)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # Fallback for text that is not a valid literal: drop the first and last
    # character (the brackets), remove quotes and split on commas. Parts are
    # stripped because splitting "a, b" on "," would otherwise yield " b",
    # and empty parts are dropped so that "[]" gives [] rather than [''].
    inner = doc[1:len(doc) - 1].replace('\'', '')
    return [part.strip() for part in inner.split(',') if part.strip()]

def makefasttextpkl(data,name):
    """Attach a mean word-embedding vector to every row and pickle the frame.

    For each entry of ``data['tokens']`` the tokens are embedded with the
    module-level ``fasttext`` embedding and averaged into one vector of
    shape ``(1, embedding_length)``. The vectors are stored in a new
    ``fasttext`` column and the frame is written to
    ``fasttext_preprocessed_<name>.pkl`` in the current working directory.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``tokens`` column holding token lists (or their string
        representation). It is modified in place.
    name : str
        Suffix used in the output file name.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now with the ``fasttext`` column and
        without the helper columns ``Unnamed: 0``, ``text`` and ``cleanData``.
    """
    # Drop the columns that are no longer needed. The result of drop() used
    # to be discarded, so nothing was removed. errors='ignore' avoids a
    # KeyError when a column (e.g. 'Unnamed: 0') is not present.
    data.drop(columns=['Unnamed: 0', 'text', 'cleanData'], inplace=True, errors='ignore')

    # Ask the embedding for its dimensionality instead of hard-coding 300.
    emb_dim = fasttext.embedding_length
    vector_data = []

    for t in data['tokens']:
        word_list = get_list(t)
        joined = " ".join(word_list)

        # Accumulator; it stays all-zero for rows without any token.
        vector = np.zeros((1, emb_dim))
        if joined.strip():
            to_embed = Sentence(joined)
            fasttext.embed(to_embed)
            n_tokens = 0
            for token in to_embed:
                vector += token.embedding.detach().cpu().numpy()
                n_tokens += 1
            # Only average when something was embedded; dividing by zero
            # would fill the vector with NaN.
            if n_tokens:
                vector = vector / n_tokens
        vector_data.append(vector)

    print(len(vector_data))

    data['fasttext'] = vector_data
    data.to_pickle(f'fasttext_preprocessed_{name}.pkl')

    return data

In [107]:
def wrapper(train, test):
    """Run the whole preprocessing pipeline on the train and test frames.

    Both frames go through the same stages, one stage at a time (train
    first, then test): ``importpreprocessing``, text cleaning into a
    ``cleanData`` column, ``lemma``, ``minor_changes`` and
    ``makefasttextpkl``. The last two stages write their output files using
    the names ``"train"`` and ``"test"``.

    Returns
    -------
    tuple
        ``(train, test)`` as returned by the final stage.
    """
    def _clean_if_str(value):
        # Non-string cells are passed through untouched.
        return clean_text(value) if isinstance(value, str) else value

    names = ("train", "test")
    frames = [train, test]

    # Stage 1: import-time preprocessing
    frames = [importpreprocessing(frame) for frame in frames]

    # Stage 2: clean the raw text
    for frame in frames:
        frame["cleanData"] = frame["text"].map(_clean_if_str)

    # Stage 3: tokenise / lemmatise
    frames = [lemma(frame) for frame in frames]

    # Stage 4: final column and label adjustments (also writes the CSV files)
    frames = [minor_changes(frame, name) for frame, name in zip(frames, names)]

    # Stage 5: embed and write the pickle files
    frames = [makefasttextpkl(frame, name) for frame, name in zip(frames, names)]

    train, test = frames
    return train, test

In [None]:
# Load the raw train/test splits and push them through the full pipeline.
train_path = "./data/trainset.csv"
test_path = "./data/testset.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Kept as the cell's last expression so the notebook displays the returned pair.
wrapper(train, test)
