In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import string
import tensorflow_datasets as tfds
import nltk
# Fetch the NLTK resources this notebook relies on: Punkt tokenizer data and the stopword lists.
nltk.download('punkt')
# Look up the Punkt resource right after downloading it; the return value is discarded.
nltk.data.find('tokenizers/punkt')
nltk.download('punkt_tab')
# force=True is passed so the stopwords package is fetched again even when a copy already exists.
nltk.download('stopwords', force=True)


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
def clean_text(text, tokenizer, stopwords):
    """Normalize a raw sentence and return its filtered tokens.

    Processing order: lowercase and strip, drop ``[...]`` spans, collapse
    whitespace, turn ellipses and intra-word hyphens into spaces, delete ASCII
    punctuation, tokenize, then discard stopwords, pure-digit tokens and
    single-character tokens.

    Args:
        text: The sentence to clean. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.
        tokenizer: Callable that takes the cleaned string and returns an
            iterable of tokens.
        stopwords: Container of lowercase tokens to drop (membership is tested
            with ``in``).

    Returns:
        list[str]: The remaining tokens in their original order. An empty list
        is returned for missing input.
    """
    # Missing values would otherwise be stringified into the bogus tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower().strip()

    text = re.sub(r"\[(.*?)\]", "", text)  # Remove bracketed content such as "[sic]"
    text = re.sub(r"\s+", " ", text)  # Collapse runs of whitespace
    # Replace ellipses with a space (not ""), so "wait...what" stays two words.
    text = re.sub(r"\.\.\.|…", " ", text)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Split hyphenated words: "well-made" -> "well made"
    text = text.translate(str.maketrans("", "", string.punctuation))  # Delete remaining ASCII punctuation

    # Single pass: drop stopwords, pure-digit tokens and one-character tokens.
    return [
        token
        for token in tokenizer(text)
        if token not in stopwords and not token.isdigit() and len(token) > 1
    ]


In [16]:
# Mount Google Drive (uses the google.colab helper, so this cell is Colab-specific)
from google.colab import drive
drive.mount('/content/drive')

# Drive folder that file_write() saves the preprocessed CSV files into
data_path = "/content/drive/MyDrive/sentiment-analysis-for-movie-reviews/DataPreprocessing"

def file_write(X_train, X_test, y_train, y_test, output_dir=None):
    """Save the train and test splits as two-column CSV files.

    Each file has a ``sentence`` column and a ``label`` column and is written
    without the DataFrame index.

    Args:
        X_train: Training sentences (one entry per row).
        X_test: Test sentences (one entry per row).
        y_train: Labels aligned with ``X_train``.
        y_test: Labels aligned with ``X_test``.
        output_dir: Folder to write into. Defaults to the notebook-level
            ``data_path`` when omitted, which is the original behavior.

    Returns:
        None. The paths of the written files are printed.

    NOTE(review): when the sentences are token lists, the CSV cells will
    presumably hold their string form and need parsing when read back —
    confirm against the code that loads these files.
    """
    # Fall back to the notebook-level Drive folder when no directory is given.
    target_dir = data_path if output_dir is None else output_dir

    train_path = f"{target_dir}/TrainingSet_orig_eda_embedding_wordnet.csv"
    test_path = f"{target_dir}/TestSet_orig_eda_embedding_wordnet.csv"

    pd.DataFrame({'sentence': X_train, 'label': y_train}).to_csv(train_path, index=False)
    pd.DataFrame({'sentence': X_test, 'label': y_test}).to_csv(test_path, index=False)

    # Report the paths that were actually written (the old message named files that do not exist).
    print(f"Đã lưu file vào {train_path} và {test_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
!pip install datasets
from datasets import load_dataset
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("aayya/sst2-augmented")




In [18]:
# Materialize the augmented training split and the validation split as DataFrames,
# using the library's own conversion instead of the generic DataFrame constructor.
train_data = ds['train_orig_eda_embedding_wordnet'].to_pandas()
test_data = ds['val'].to_pandas()

# Keep the text in independent single-column frames; train_data / test_data stay untouched.
X_train = train_data[['sentence']].copy()
X_test = test_data[['sentence']].copy()

y_train = train_data['label']
y_test = test_data['label']


In [19]:
stop_words = set(stopwords.words("english"))

# Always clean from the raw text kept in train_data / test_data, so re-running this cell
# does not feed already-tokenized lists back through clean_text.
X_train['sentence'] = train_data['sentence'].map(lambda x: clean_text(x, word_tokenize, stop_words))
X_test['sentence'] = test_data['sentence'].map(lambda x: clean_text(x, word_tokenize, stop_words))


In [20]:
# Preview the tokenized training sentences.
X_train

Unnamed: 0,sentence
0,"[hide, new, secretions, parental, units]"
1,"[contains, wit, labored, gags]"
2,"[loves, characters, communicates, something, r..."
3,"[remains, utterly, satisfied, remain, throughout]"
4,"[worst, revenge, nerds, clichés, filmmakers, c..."
...,...
269389,"[delicious, comedy]"
269390,"[anguish, wrath, frustration]"
269391,"[achieving, scant, crowd, pleasing, goals, sets]"
269392,"[patient, onlooker]"


In [21]:
# Preview the training labels.
y_train

Unnamed: 0,label
0,0
1,0
2,1
3,0
4,0
...,...
269389,1
269390,0
269391,1
269392,1


In [22]:
# Preview the tokenized test sentences.
X_test

Unnamed: 0,sentence
0,"[charming, often, affecting, journey]"
1,"[unflinchingly, bleak, desperate]"
2,"[allows, us, hope, nolan, poised, embark, majo..."
3,"[acting, costumes, music, cinematography, soun..."
4,"[slow, slow]"
...,...
867,"[depth, wading, pool]"
868,"[movie, real, anarchic, flair]"
869,"[subject, like, inspire, reaction, audience, p..."
870,"[arthritic, attempt, directing, callie, khouri]"


In [23]:
# Preview the test labels.
y_test

Unnamed: 0,label
0,1
1,0
2,1
3,1
4,0
...,...
867,0
868,1
869,0
870,0


In [24]:
# Write the tokenized splits and their labels to disk, passing each one as a plain Python list.
file_write(
    X_train=X_train['sentence'].tolist(),
    X_test=X_test['sentence'].tolist(),
    y_train=y_train.tolist(),
    y_test=y_test.tolist(),
)


Đã lưu file vào /content/drive/MyDrive/sentiment-analysis-for-movie-reviews/DataPreprocessing/TrainingSet.csv và /content/drive/MyDrive/sentiment-analysis-for-movie-reviews/DataPreprocessing/TestSet.csv
