<a href="https://colab.research.google.com/github/LeMikey/FYP-2023/blob/main/FYP_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import torch
import torchtext
from torchtext.datasets import IMDB
from torchtext.data.functional import to_map_style_dataset, sentencepiece_tokenizer
from torch.utils.data import DataLoader, TensorDataset
import nltk
from nltk.corpus import wordnet, stopwords
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the raw tweet dataset and reduce it to clean (text, label) pairs.
#
# Order matters here: missing values must be dropped BEFORE the cast to str.
# Casting first turns NaN/NaT into the literal strings "nan"/"NaT", which
# dropna() no longer recognises, so no row would ever be removed.
UnC_Data = (
    pd.read_csv('./Dataset/Text/4/dataset.csv', low_memory=False)
    # Filter only English tweets
    .query('Language == "en"')
    # Only keep the text and label columns, so that gaps in unrelated
    # columns do not discard otherwise usable rows
    .loc[:, ['Text', 'Label']]
    # Drop rows whose text or label is missing
    .dropna()
    # Convert the remaining columns to string type
    .astype(str)
    # Rename the "Label" column as "Sentiment_Labels"
    .rename(columns={'Label': 'Sentiment_Labels'})
)

# Visualize the distribution of the labels
UnC_Data.groupby(['Sentiment_Labels']).size().plot.bar()

In [None]:
# Load the spaCy English language model ("en_core_web_sm") once at module
# level; preprocess_tweet_spacy() below calls this shared `nlp` object for
# every tweet it processes.
nlp = spacy.load("en_core_web_sm")

# Characters treated as emoji, listed as explicit Unicode blocks.
# A single wide range such as U+24C2..U+1F251 must NOT be used here: it spans
# most of the Basic Multilingual Plane and would also delete ordinary
# non-Latin text (CJK ideographs, Hangul, kana, ...).
# Compiled once at import time so it is not rebuilt on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


# Function to remove emojis
def remove_emoji(string):
    """Return *string* with every run of emoji characters removed.

    Characters outside the emoji blocks listed in ``_EMOJI_PATTERN`` are left
    untouched, including non-Latin letters. Matched runs are deleted, not
    replaced, so the whitespace around them is preserved as-is.
    """
    return _EMOJI_PATTERN.sub(r'', string)

# Function to remove unwanted characters and symbols
def clean_tweet(tweet):
    """Normalise a raw tweet to lowercase ASCII letters, digits and spaces.

    Steps, in order: lowercase; delete apostrophes; delete @mentions; delete
    '#' signs (the hashtag word is kept); delete emojis; delete URLs starting
    with "http"; replace ``( ) ! ?`` and any ``[...]`` span with a space;
    finally replace every remaining character outside ``a-z0-9`` with a space.

    Runs of spaces are not collapsed, so the result may contain leading,
    trailing or repeated spaces.

    Any non-string input (NaN, None, pd.NA, ...) is treated as a missing
    tweet and yields an empty string.
    """
    # Missing values arrive as float NaN, None or pd.NA depending on how the
    # column was loaded; none of them has .lower(), so bail out early.
    if not isinstance(tweet, str):
        return ""
    temp = tweet.lower()
    temp = re.sub(r"'", "", temp)
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"#", "", temp)
    temp = remove_emoji(temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub(r'[()!?]', ' ', temp)
    # Raw string: '\[' in a plain literal is an invalid escape sequence.
    temp = re.sub(r'\[.*?\]', ' ', temp)
    temp = re.sub(r"[^a-z0-9]", " ", temp)
    return temp

# Function to preprocess the dataset with tokenization, lemmatization, and stop words removal using spaCy
def preprocess_tweet_spacy(tweet):
    """Tokenize *tweet* with spaCy and return the lemmas of the kept tokens.

    A token is dropped when its lowercased text is in spaCy's English
    ``STOP_WORDS`` set, or when it consists only of whitespace. The second
    rule matters because the cleaned text can contain runs of spaces, which
    spaCy emits as separate whitespace tokens.

    Returns a list of lemma strings (empty for an empty tweet).
    """
    doc = nlp(tweet)
    return [
        token.lemma_
        for token in doc
        if not token.is_space and token.text.lower() not in STOP_WORDS
    ]

# Clean every tweet in the 'Text' column (clean_tweet is called once per row)
UnC_Data['Text'] = UnC_Data['Text'].apply(clean_tweet)

# Replace each cleaned tweet with its list of lemmas: spaCy handles the
# tokenization, lemmatization, and stop words removal
UnC_Data['Text'] = UnC_Data['Text'].apply(preprocess_tweet_spacy)