In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import re
import ssl

In [13]:
# Workaround for the SSL certificate error hit when downloading NLTK data
# (a known issue on macOS).
# Source: https://github.com/gunthercox/ChatterBot/issues/930#issuecomment-322111087
#
# When this Python build exposes ssl._create_unverified_context, install it as
# the default HTTPS context factory; otherwise leave the default untouched.
# NOTE: this swaps the default HTTPS context factory for an unverified one for
# the rest of the kernel session, not just for the downloads below.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# NLTK resources fetched by this notebook: stop-word lists, the punkt
# tokenizer models, and the WordNet corpus.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michaeld./nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/michaeld./nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michaeld./nltk_data...


True

In [3]:
# Function for text preprocessing

# Patterns are compiled once instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_DIGIT_PATTERN = re.compile(r'\d+')

# Lazily-filled cache for the NLTK resources, so the stop-word list and the
# lemmatizer are built once rather than once per document.
_NLP_RESOURCES = {}


def _nlp_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize a raw text into a space-separated string of lemmatized tokens.

    Steps, in order: strip URLs, strip digits, lowercase, tokenize, drop
    English stop words and non-alphanumeric tokens, lemmatize.

    Args:
        text: The raw text. ``None`` and NaN (as produced by pandas for
            missing values) are treated as an empty document; any other
            non-string value is converted with ``str()``.

    Returns:
        str: The remaining tokens joined by single spaces ('' if nothing
        survives the filtering).
    """
    # Missing values: None, or a float NaN (the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Clean urls first, so digit removal cannot alter a URL before it is matched.
    text = _URL_PATTERN.sub('', text)

    # Remove digits. This must be a regex substitution: str.replace('\d+', '')
    # only removes the literal characters "\d+" and leaves every digit in place.
    text = _DIGIT_PATTERN.sub('', text)

    # Lowercasing
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    stop_words, lemmatizer = _nlp_resources()

    # Removing stopwords and non-alphanumeric tokens, then lemmatizing.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

In [8]:
def remove_punctuation(words):
    """Strip punctuation from every string in ``words``.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is deleted. Items that consist only
    of punctuation become empty strings; they are kept, so the result has
    one entry per input item.

    Args:
        words: An iterable of strings.

    Returns:
        list[str]: The cleaned strings, in input order.
    """
    non_word_chars = re.compile(r'[^\w\s]')
    return [non_word_chars.sub('', item) for item in words]

In [18]:
# Load the preprocessed train / test splits from the working directory.
text_column = 'processed_text_alt'
train_df = pd.read_csv("data_train_preprocessed.csv")
test_df = pd.read_csv("data_test_preprocessed.csv")

# Missing values in the text column are replaced with empty strings so that
# every row holds a str.
for frame in (train_df, test_df):
    frame[text_column] = frame[text_column].fillna('')

# Quick sanity check of the text column.
print(train_df[text_column].head())

0    bernie elizabeth issue matter issue make danke...
1    extending brexit deadline october 31st order e...
2    kwai gkwa 0964 nnevvy applause thai hong kong ...
3    order foce mask protect ogainst fhe corond vir...
4    best candidate ja 2020 joe biden kamala harris...
Name: processed_text_alt, dtype: object


In [None]:
# Collect the training tokens. Each entry of 'processed_text_alt' is one
# string, so it must be split on whitespace first: iterating over the string
# directly yields single characters, which would make the "vocabulary size"
# below the number of distinct characters rather than distinct words.
train_tokens = [
    token
    for text in train_df['processed_text_alt']
    for token in text.split()
]

# NOTE(review): both token lists are built from the same column, so the two
# sizes printed below are always equal. Presumably one of them was meant to
# come from a raw-text column -- confirm which column that is.
preprocessed_train_tokens = list(train_tokens)

vocabulary_size = len(set(train_tokens))
vocabulary_size_preprocessed = len(set(preprocessed_train_tokens))

print(f'Vocabulary Size Preprocessed: {vocabulary_size_preprocessed}')
print(f'Vocabulary Size: {vocabulary_size}')

# Split the dataset into training and validation sets (80% / 20%, fixed seed).
# NOTE: train_df is rebound to the training split, so re-running this cell
# without first re-running the loading cell splits the already-reduced frame.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)