In [18]:
import pandas as pd
import numpy as np
import re
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from textblob import TextBlob

In [19]:
# Fetch the NLTK stop-word corpus (skipped by NLTK when it is already up to date)
nltk.download('stopwords')

# Module-level resources read by preprocess_text: an English Snowball stemmer
# and the English stop-word list held as a set for fast membership tests.
stemmer = SnowballStemmer(language='english')
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dylan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# clean the text
def clean_punctuation(text):
    """Return ``text`` with every character removed that is neither a word
    character (``\\w``: letters, digits, underscore) nor whitespace."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

In [21]:
# correct the spelling
def correct_spelling(text):
    """Run TextBlob's spelling correction over ``text`` and return the result
    as a plain ``str``."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)

# remove the stop words and stem the words
def preprocess_text(text):
    """Drop stop words from ``text`` and stem the remaining words.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The stemmed, non-stop-word tokens joined by single spaces
        (an empty string when nothing survives).

    Relies on the module-level ``stop_words`` set and ``stemmer`` object.
    """
    # Compare the lower-cased word: the stop-word list is lower-case, so a
    # case-sensitive test would let "The" / "IS" through, and the stemmer would
    # then emit them as the stop words "the" / "is".
    kept = [
        stemmer.stem(word)
        for word in text.split()
        if word.lower() not in stop_words
    ]
    return ' '.join(kept)

In [22]:
# load the data
data = pd.read_csv('../dataset/process/en-2020-01-merged-cleaned-without-emoji.tsv', sep='\t')

# Replace missing texts with an empty string BEFORE casting to str. A bare
# astype(str) either turns NaN into the literal token 'nan' (which would then
# be stemmed, tokenised and added to the vocabulary) or leaves NaN in place
# (which makes re.sub raise), depending on the pandas version.
data['text'] = data['text'].fillna('').astype(str)

# Strip punctuation; tqdm.pandas registers progress_apply with the given label.
print('Cleaning the text...')
tqdm.pandas(desc="Cleaning the text")
data['text'] = data['text'].progress_apply(clean_punctuation)

# Spelling correction is currently disabled; uncomment to enable it.
# print('Correcting the spelling...')
# tqdm.pandas(desc="Correcting the spelling")
# data['text'] = data['text'].progress_apply(correct_spelling)

# Remove stop words and stem what is left.
print('Preprocessing the text...')
tqdm.pandas(desc="Preprocessing the text")
data['text'] = data['text'].progress_apply(preprocess_text)


Cleaning the text...


Cleaning the text: 100%|██████████| 129670/129670 [00:00<00:00, 180222.12it/s]


Preprocessing the text...


Preprocessing the text: 100%|██████████| 129670/129670 [00:09<00:00, 13370.55it/s]


In [23]:
# Report the number of missing values in each column
print('Checking for NaN...')
missing_per_column = data.isna().sum()
print(missing_per_column)

# Report the mean number of whitespace-separated words per text
print('Calculating the average length of the text...')
words_per_text = data['text'].map(lambda text: len(text.split()))
print(words_per_text.mean())

Checking for NaN...
tweetid      0
sentiment    0
evidence     0
text         0
dtype: int64
Calculating the average length of the text...
4.892280404102722


In [24]:
# Tokenization
from nltk.tokenize import word_tokenize
from collections import Counter

# word_tokenize needs the 'punkt' tokenizer models
nltk.download('punkt')

# One token list per row
data['tokens'] = data['text'].map(word_tokenize)

# Build the vocabulary: flatten all rows into one token stream, count it, and
# number the distinct tokens from 1 in first-seen order (index 0 is left free).
all_tokens = [tok for row_tokens in data['tokens'] for tok in row_tokens]
counter = Counter(all_tokens)
vocab = {token: index for index, token in enumerate(counter, start=1)}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dylan\AppData\Roaming\nltk_data...


[nltk_data]   Package punkt is already up-to-date!


In [25]:
# load pretrained GloVe model
def load_glove_model(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its vector
    components, separated by single spaces.

    Args:
        glove_file: Path to the GloVe ``.txt`` file (read as UTF-8).

    Returns:
        dict mapping each token to a 1-D ``np.float32`` array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    print("Loading GloVe model...")
    model = {}
    with open(glove_file, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Loading GloVe model"):
            line = line.rstrip("\r\n")
            if not line.strip():
                # Blank line: nothing to parse (indexing [0] would raise).
                continue
            # Split on the literal space delimiter only. A bare split() breaks
            # on ANY Unicode whitespace, so a token that is itself a whitespace
            # character (e.g. U+0085) would vanish and the first number would
            # be mistaken for the word, leaving a vector one component short.
            word, *values = line.split(" ")
            # Ignore empty fields produced by doubled or trailing spaces.
            model[word] = np.array([v for v in values if v], dtype=np.float32)
    print("GloVe model loaded!")
    return model

# Load the Twitter GloVe vectors (the "200d" file) into a word -> vector dict
glove_model = load_glove_model(
    glove_file='../dataset/glove_twitter_27B/glove.twitter.27B.200d.txt'
)

Loading GloVe model...


Loading GloVe model: 0it [00:00, ?it/s]

Loading GloVe model: 1193514it [01:31, 13045.80it/s]

GloVe model loaded!





In [26]:
embedding_dim = 200

# Seeded generator so the random rows below, and therefore the saved
# embedding matrix, are identical on every Restart & Run All.
rng = np.random.default_rng(42)

# One row per vocabulary index; row 0 is never written and stays all-zero
# for the padding token (vocab indices start at 1).
embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))  # +1 for padding token

for word, i in vocab.items():
    embedding_vector = glove_model.get(word)
    if embedding_vector is not None:
        # Token found in GloVe: copy its pretrained vector.
        embedding_matrix[i] = embedding_vector
    else:
        # Token missing from GloVe: randomly initialize (reproducibly).
        embedding_matrix[i] = rng.normal(scale=0.6, size=(embedding_dim,))

# Map every token list to its list of vocabulary ids (0 for an unknown token)
max_len = 100
sequences = [
    [vocab.get(tok, 0) for tok in row_tokens]
    for row_tokens in data['tokens']
]

# Right-pad with zeros / truncate each id list to exactly max_len columns
padded_sequences = np.zeros(shape=(len(sequences), max_len), dtype=int)
for row, ids in enumerate(sequences):
    if not ids:
        # Empty text: the row stays all padding.
        continue
    clipped = ids[:max_len]
    padded_sequences[row, :len(clipped)] = clipped

In [27]:
# Write the three preprocessed artefacts to .npy files in the working directory
for filename, array in (
    ('padded_sequences.npy', padded_sequences),
    ('labels.npy', data['sentiment'].values),
    ('embedding_matrix.npy', embedding_matrix),
):
    np.save(filename, array)