# LSTM English Model

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import tensorflow as tf
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
import re
import random as rn
import torch
from sklearn.metrics import f1_score

rn.seed(1349)
np.random.seed(1349)
torch.manual_seed(1349)
torch.cuda.manual_seed(1349)

In [3]:
train_df = pd.read_csv("data/english/aclImdb/train.tsv", delimiter='\t', encoding='utf-8')
test_df = pd.read_csv("data/english/aclImdb/train.tsv", delimiter='\t', encoding='utf-8')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

train_df['text_a'] = train_df['text_a'].apply(clean_text)
test_df['text_a'] = test_df['text_a'].apply(clean_text)

X_train = train_df['text_a']
y_train = train_df['label']

X_test = test_df['text_a']
y_test = test_df['label']

In [4]:
X_train = X_train.astype(str).tolist()
X_test = X_test.astype(str).tolist()

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

maxlen = 200
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


In [5]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)

loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m528s[0m 741ms/step - accuracy: 0.5562 - loss: 0.6879 - val_accuracy: 0.0260 - val_loss: 0.9013
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 543ms/step - accuracy: 0.5813 - loss: 0.6802 - val_accuracy: 0.0552 - val_loss: 0.9081
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m420s[0m 597ms/step - accuracy: 0.5720 - loss: 0.6587 - val_accuracy: 0.0552 - val_loss: 0.8578
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 602ms/step - accuracy: 0.6385 - loss: 0.6017 - val_accuracy: 0.8136 - val_loss: 0.5295
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 422ms/step - accuracy: 0.8640 - loss: 0.3404 - val_accuracy: 0.7768 - val_loss: 0.5305
Epoch 6/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 442ms/step - accuracy: 0.9115 - loss: 0.2457 - val_accuracy: 0.7452 - val_loss: 0.6844
Epoc

In [6]:
y_pred = model.predict(X_test)
y_pred_binary = [1 if val > 0.5 else 0 for val in y_pred]

f1 = f1_score(y_test, y_pred_binary)

print("Test F1 Score:", f1)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 181ms/step
Test F1 Score: 0.9656638325703075


In [7]:
from keras.layers import Bidirectional

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)

loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 294ms/step - accuracy: 0.7403 - loss: 0.4985 - val_accuracy: 0.8572 - val_loss: 0.3166
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 355ms/step - accuracy: 0.8938 - loss: 0.2717 - val_accuracy: 0.7740 - val_loss: 0.6986
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 350ms/step - accuracy: 0.8460 - loss: 0.3647 - val_accuracy: 0.7904 - val_loss: 0.4148
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 351ms/step - accuracy: 0.9088 - loss: 0.2364 - val_accuracy: 0.8316 - val_loss: 0.4406
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 344ms/step - accuracy: 0.9292 - loss: 0.1937 - val_accuracy: 0.8248 - val_loss: 0.4232
Epoch 6/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 350ms/step - accuracy: 0.9448 - loss: 0.1558 - val_accuracy: 0.7624 - val_loss: 0.6079
Epoc

In [8]:
y_pred = model.predict(X_test)
y_pred_binary = [1 if val > 0.5 else 0 for val in y_pred]

f1 = f1_score(y_test, y_pred_binary)

print("Test F1 Score:", f1)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 174ms/step
Test F1 Score: 0.9660158483784004


# LSTM French Model

In [9]:
train_df = pd.read_csv("data/french/data/train.tsv", delimiter='\t', encoding='utf-8')
test_df = pd.read_csv("data/french/data/train.tsv", delimiter='\t', encoding='utf-8')
val_df = pd.read_csv("data/french/data/val.tsv", delimiter='\t', encoding='utf-8')

stop_words = set(stopwords.words('french'))

def clean_text(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

train_df['text_a'] = train_df['text_a'].apply(clean_text)
test_df['text_a'] = test_df['text_a'].apply(clean_text)
val_df['text_a'] = val_df['text_a'].apply(clean_text)

In [10]:
X_train = X_train.astype(str).tolist()
X_test = X_test.astype(str).tolist()

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

maxlen = 200
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


In [11]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)

loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 375ms/step - accuracy: 0.5584 - loss: 0.6874 - val_accuracy: 0.0248 - val_loss: 0.8224
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 399ms/step - accuracy: 0.5825 - loss: 0.6781 - val_accuracy: 0.0712 - val_loss: 0.7696
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 404ms/step - accuracy: 0.5822 - loss: 0.6526 - val_accuracy: 0.1128 - val_loss: 0.7653
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 363ms/step - accuracy: 0.6686 - loss: 0.6082 - val_accuracy: 0.8332 - val_loss: 0.5161
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 441ms/step - accuracy: 0.7489 - loss: 0.5161 - val_accuracy: 0.9172 - val_loss: 0.3850
Epoch 6/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 422ms/step - accuracy: 0.7407 - loss: 0.5256 - val_accuracy: 0.8664 - val_loss: 0.3923
Epoc

In [12]:
y_pred = model.predict(X_test)
y_pred_binary = [1 if val > 0.5 else 0 for val in y_pred]

f1 = f1_score(y_test, y_pred_binary)

print("Test F1 Score:", f1)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 327ms/step
Test F1 Score: 0.9546214375993802


In [13]:
from keras.layers import Bidirectional

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100))
model.add(Bidirectional(LSTM(128)))  # Wrapping LSTM layer with Bidirectional
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)

loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m365s[0m 515ms/step - accuracy: 0.7369 - loss: 0.5031 - val_accuracy: 0.9140 - val_loss: 0.2162
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 449ms/step - accuracy: 0.8811 - loss: 0.3157 - val_accuracy: 0.8696 - val_loss: 0.3427
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m433s[0m 616ms/step - accuracy: 0.9203 - loss: 0.2155 - val_accuracy: 0.8740 - val_loss: 0.3065
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m564s[0m 801ms/step - accuracy: 0.9326 - loss: 0.1845 - val_accuracy: 0.8392 - val_loss: 0.4548
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 478ms/step - accuracy: 0.9390 - loss: 0.1638 - val_accuracy: 0.8436 - val_loss: 0.4466
Epoch 6/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 562ms/step - accuracy: 0.9567 - loss: 0.1219 - val_accuracy: 0.8100 - val_loss: 0.6257
Epoc

In [14]:
y_pred = model.predict(X_test)
y_pred_binary = [1 if val > 0.5 else 0 for val in y_pred]

f1 = f1_score(y_test, y_pred_binary)

print("Test F1 Score:", f1)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 173ms/step
Test F1 Score: 0.9763785908188485


# LSTM Chinese Model

In [15]:
import jieba
import pandas as pd


train_df = pd.read_csv("data/chinese/chnsenticorp-main/train.tsv", delimiter='\t', encoding='utf-8')
test_df = pd.read_csv("data/chinese/chnsenticorp-main/test.tsv", delimiter='\t', encoding='utf-8')
val_df = pd.read_csv("data/chinese/chnsenticorp-main/dev.tsv", delimiter='\t', encoding='utf-8')

def load_stop_words(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stop_words = set(file.read().splitlines())
    return stop_words

stop_words = load_stop_words('data/chinese/stopwords-zh.txt')

def clean_text(text):
    words = jieba.cut(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

train_df['text_a'] = train_df['text_a'].apply(clean_text)
test_df['text_a'] = test_df['text_a'].apply(clean_text)
val_df['text_a'] = val_df['text_a'].apply(clean_text)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/8y/pwclkfq17dgfr91dw_mc9flr0000gn/T/jieba.cache
Loading model cost 0.593 seconds.
Prefix dict has been built successfully.


In [16]:
X_train = X_train.astype(str).tolist()
X_test = X_test.astype(str).tolist()

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

maxlen = 200
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [17]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)

loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 212ms/step - accuracy: 0.5562 - loss: 0.6876 - val_accuracy: 0.0252 - val_loss: 0.8719
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 182ms/step - accuracy: 0.5633 - loss: 0.6914 - val_accuracy: 0.0632 - val_loss: 0.7331
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 206ms/step - accuracy: 0.5590 - loss: 0.6796 - val_accuracy: 0.0400 - val_loss: 0.8277
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 170ms/step - accuracy: 0.5792 - loss: 0.6629 - val_accuracy: 0.0476 - val_loss: 0.9793
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 216ms/step - accuracy: 0.5959 - loss: 0.6715 - val_accuracy: 0.9124 - val_loss: 0.6334
Epoch 6/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 223ms/step - accuracy: 0.7709 - loss: 0.4819 - val_accuracy: 0.8104 - val_loss: 0.4525
Epoc

In [18]:
y_pred = model.predict(X_test)
y_pred_binary = [1 if val > 0.5 else 0 for val in y_pred]

f1 = f1_score(y_test, y_pred_binary)

print("Test F1 Score:", f1)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 174ms/step
Test F1 Score: 0.9401017668977122


In [19]:
from keras.layers import Bidirectional

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=100))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)

loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 495ms/step - accuracy: 0.7353 - loss: 0.5222 - val_accuracy: 0.5940 - val_loss: 0.6685
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m375s[0m 533ms/step - accuracy: 0.8565 - loss: 0.3579 - val_accuracy: 0.8164 - val_loss: 0.4156
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 555ms/step - accuracy: 0.9132 - loss: 0.2401 - val_accuracy: 0.8696 - val_loss: 0.3354
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 516ms/step - accuracy: 0.9275 - loss: 0.1978 - val_accuracy: 0.8176 - val_loss: 0.4355
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 347ms/step - accuracy: 0.9321 - loss: 0.1869 - val_accuracy: 0.8056 - val_loss: 0.5097
Epoch 6/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 335ms/step - accuracy: 0.9491 - loss: 0.1443 - val_accuracy: 0.9100 - val_loss: 0.2769
Epoc

In [20]:
y_pred = model.predict(X_test)
y_pred_binary = [1 if val > 0.5 else 0 for val in y_pred]

f1 = f1_score(y_test, y_pred_binary)

print("Test F1 Score:", f1)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 143ms/step
Test F1 Score: 0.9531612181280473
