In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazonreviews/test.ft.txt.bz2
/kaggle/input/amazonreviews/train.ft.txt.bz2


In [3]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 50)

In [4]:
import bz2

def decompress_bz2(file_path, output_path):
    with bz2.open(file_path, 'rt', encoding='utf-8') as file:
        with open(output_path, 'w', encoding='utf-8') as out_file:
            out_file.write(file.read())

# Decompress the files
decompress_bz2('/kaggle/input/amazonreviews/train.ft.txt.bz2', 'train.ft.txt')
decompress_bz2('/kaggle/input/amazonreviews/test.ft.txt.bz2', 'test.ft.txt') 

In [5]:
def parse_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            label, text = line.split(' ', 1)
            label = int(label.replace('__label__', ''))
            data.append((label, text.strip()))
    return pd.DataFrame(data, columns=['label', 'text'])

In [6]:
train = parse_data('train.ft.txt')
test = parse_data('test.ft.txt')

print("Train:")
train

Train:


Unnamed: 0,label,text
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
3599995,1,Don't do it!!: The high chair looks great when...
3599996,1,"Looks nice, low functionality: I have used thi..."
3599997,1,"compact, but hard to clean: We have a small ho..."
3599998,1,what is it saying?: not sure what this book is...


In [7]:
df = train.copy()
df2 = test.copy()

In [8]:
df.label = df.label.map({2:0, 1:1})  # 1 are negatives (targeted class)
df2.label = df2.label.map({2:0, 1:1})  # 1 are negatives (targeted class)

df

Unnamed: 0,label,text
0,0,Stuning even for the non-gamer: This sound tra...
1,0,The best soundtrack ever to anything.: I'm rea...
2,0,Amazing!: This soundtrack is my favorite music...
3,0,Excellent Soundtrack: I truly like this soundt...
4,0,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
3599995,1,Don't do it!!: The high chair looks great when...
3599996,1,"Looks nice, low functionality: I have used thi..."
3599997,1,"compact, but hard to clean: We have a small ho..."
3599998,1,what is it saying?: not sure what this book is...


In [9]:
df.label.value_counts(normalize=True)
df2.label.value_counts(normalize=True)


label
0    0.5
1    0.5
Name: proportion, dtype: float64

In [10]:
df = pd.concat([df, df2], ignore_index=True)

In [11]:
df.info

<bound method DataFrame.info of          label                                               text
0            0  Stuning even for the non-gamer: This sound tra...
1            0  The best soundtrack ever to anything.: I'm rea...
2            0  Amazing!: This soundtrack is my favorite music...
3            0  Excellent Soundtrack: I truly like this soundt...
4            0  Remember, Pull Your Jaw Off The Floor After He...
...        ...                                                ...
3999995      1  Unbelievable- In a Bad Way: We bought this Tho...
3999996      1  Almost Great, Until it Broke...: My son reciev...
3999997      1  Disappointed !!!: I bought this toy for my son...
3999998      0  Classic Jessica Mitford: This is a compilation...
3999999      1  Comedy Scene, and Not Heard: This DVD will be ...

[4000000 rows x 2 columns]>

In [12]:
df.to_csv('amazon_reviews.csv', index=False)

In [13]:
df.head()

Unnamed: 0,label,text
0,0,Stuning even for the non-gamer: This sound tra...
1,0,The best soundtrack ever to anything.: I'm rea...
2,0,Amazing!: This soundtrack is my favorite music...
3,0,Excellent Soundtrack: I truly like this soundt...
4,0,"Remember, Pull Your Jaw Off The Floor After He..."


In [14]:
import re

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing
df['text'] = df['text'].apply(preprocess_text)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Define the feature (X) and target (y) columns
X = df['text']
y = df['label']


# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [16]:
# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_vectorized, y_train)

# Train Naive Bayes model
#rf_model = RandomForestClassifier(n_estimators=100)
#rf_model.fit(X_train_vectorized, y_train)

# Train Naive Bayes model
lg_model = LogisticRegression()
lg_model.fit(X_train_vectorized, y_train)

# Make predictions and evaluate the model
#y_pred = rf_model.predict(X_test_vectorized)
#print("Random Forest Model Performance")
#print("Accuracy:", accuracy_score(y_test, y_pred))
#print(classification_report(y_test, y_pred))

# Make predictions and evaluate the model
y_pred = nb_model.predict(X_test_vectorized)
print("Naive Bayes Model Performance")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Make predictions and evaluate the model
y_pred = lg_model.predict(X_test_vectorized)
print("Logistic Regression Model Performance")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Naive Bayes Model Performance
Accuracy: 0.8481
              precision    recall  f1-score   support

           0       0.85      0.84      0.85    399907
           1       0.84      0.85      0.85    400093

    accuracy                           0.85    800000
   macro avg       0.85      0.85      0.85    800000
weighted avg       0.85      0.85      0.85    800000

Logistic Regression Model Performance
Accuracy: 0.900755
              precision    recall  f1-score   support

           0       0.90      0.90      0.90    399907
           1       0.90      0.90      0.90    400093

    accuracy                           0.90    800000
   macro avg       0.90      0.90      0.90    800000
weighted avg       0.90      0.90      0.90    800000



In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences
max_sequence_length = max(len(seq) for seq in X_train_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

In [18]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

# LSTM modelini oluşturma
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

# Modeli derleme
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping tanımlama
early_stopping = EarlyStopping(
    monitor='val_loss',  # İzlenecek metrik
    patience=3,          # Doğrulama kaybı iyileşmediğinde kaç epoch bekleyeceği
    restore_best_weights=True  # En iyi ağırlıkları geri yükle
)

# Modeli eğitme
history = model.fit(
    X_train_padded, y_train,
    epochs=3,  # Eğitim süresini uzatabiliriz çünkü early stopping ile erken durdurma yapılacak
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping]  # Early stopping'i geri çağırmalara ekleyin
)

# Modeli değerlendirme
loss, accuracy = model.evaluate(X_test_padded, y_test)
print("LSTM Model Performansı")
print("Accuracy:", accuracy)

Epoch 1/3
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m830s[0m 9ms/step - accuracy: 0.9213 - loss: 0.1993 - val_accuracy: 0.9431 - val_loss: 0.1501
Epoch 2/3
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m828s[0m 9ms/step - accuracy: 0.9480 - loss: 0.1392 - val_accuracy: 0.9475 - val_loss: 0.1408
Epoch 3/3
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m829s[0m 9ms/step - accuracy: 0.9521 - loss: 0.1293 - val_accuracy: 0.9481 - val_loss: 0.1390
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 4ms/step - accuracy: 0.9480 - loss: 0.1392
LSTM Model Performansı
Accuracy: 0.9478762745857239


In [19]:
"""from sklearn.metrics import accuracy_score, f1_score

# Tahminler (0 veya 1)
y_pred = (model.predict(X_test_padded) > 0.5).astype("int32")

# Skorlar
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")"""

'from sklearn.metrics import accuracy_score, f1_score\n\n# Tahminler (0 veya 1)\ny_pred = (model.predict(X_test_padded) > 0.5).astype("int32")\n\n# Skorlar\nacc = accuracy_score(y_test, y_pred)\nf1 = f1_score(y_test, y_pred)\n\nprint(f"Accuracy: {acc:.4f}")\nprint(f"F1 Score: {f1:.4f}")'

In [20]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Tahminler (0 veya 1)
y_pred = (model.predict(X_test_padded) > 0.5).astype("int32")

# Skorlar
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy : {acc:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")


[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 4ms/step
Accuracy : 0.9479
F1 Score : 0.9480
Precision: 0.9461
Recall   : 0.9499


In [21]:
# Save the entire model
model.save("lstm_model.h5")  # You can also use .keras extension (e.g., "lstm_model.keras")


In [22]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.callbacks import EarlyStopping

cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length))
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

cnn_history = cnn_model.fit(
    X_train_padded, y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping]
)

loss, accuracy = cnn_model.evaluate(X_test_padded, y_test)
print("📊 CNN Model Performansı")
print("Accuracy:", accuracy)

Epoch 1/5
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 3ms/step - accuracy: 0.9235 - loss: 0.1941 - val_accuracy: 0.9420 - val_loss: 0.1545
Epoch 2/5
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 3ms/step - accuracy: 0.9437 - loss: 0.1502 - val_accuracy: 0.9425 - val_loss: 0.1526
Epoch 3/5
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 3ms/step - accuracy: 0.9499 - loss: 0.1360 - val_accuracy: 0.9410 - val_loss: 0.1577
Epoch 4/5
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 3ms/step - accuracy: 0.9542 - loss: 0.1258 - val_accuracy: 0.9427 - val_loss: 0.1554
Epoch 5/5
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 3ms/step - accuracy: 0.9582 - loss: 0.1167 - val_accuracy: 0.9408 - val_loss: 0.1637
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2ms/step - accuracy: 0.9421 - loss: 0.1529
📊 CNN Model Performansı
Accuracy: 0.942007482051849

In [23]:
from sklearn.metrics import accuracy_score, f1_score

# Tahminler (0 veya 1)
y_pred = (cnn_model.predict(X_test_padded) > 0.5).astype("int32")

# Skorlar
cnn_acc = accuracy_score(y_test, y_pred)
cnn_f1 = f1_score(y_test, y_pred)
cnn_precision = precision_score(y_test, y_pred)
cnn_recall = recall_score(y_test, y_pred)

print(f"Accuracy : {cnn_acc:.4f}")
print(f"F1 Score : {cnn_f1:.4f}")
print(f"Precision: {cnn_precision:.4f}")
print(f"Recall   : {cnn_recall:.4f}")



[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 1ms/step
Accuracy : 0.9420
F1 Score : 0.9414
Precision: 0.9515
Recall   : 0.9315


In [24]:
cnn_model.save("cnn_model.h5")


In [25]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense

gru_model = Sequential()
gru_model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length))
gru_model.add(GRU(64))
gru_model.add(Dense(1, activation='sigmoid'))

gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

gru_history = gru_model.fit(
    X_train_padded, y_train,
    epochs=3,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping]
)

loss, accuracy = gru_model.evaluate(X_test_padded, y_test)
print("📊 GRU Model Performansı")
print("Accuracy:", accuracy)

Epoch 1/3
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m797s[0m 9ms/step - accuracy: 0.9243 - loss: 0.1893 - val_accuracy: 0.9457 - val_loss: 0.1440
Epoch 2/3
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m800s[0m 9ms/step - accuracy: 0.9481 - loss: 0.1383 - val_accuracy: 0.9466 - val_loss: 0.1415
Epoch 3/3
[1m90000/90000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m796s[0m 9ms/step - accuracy: 0.9505 - loss: 0.1327 - val_accuracy: 0.9460 - val_loss: 0.1431
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 4ms/step - accuracy: 0.9462 - loss: 0.1420
📊 GRU Model Performansı
Accuracy: 0.9458462595939636


In [26]:
from sklearn.metrics import accuracy_score, f1_score

# Tahminler (0 veya 1)
y_pred = (gru_model.predict(X_test_padded) > 0.5).astype("int32")

# Skorlar
gru_acc = accuracy_score(y_test, y_pred)
gru_f1 = f1_score(y_test, y_pred)
gru_precision = precision_score(y_test, y_pred)
gru_recall = recall_score(y_test, y_pred)

print(f"Accuracy : {gru_acc:.4f}")
print(f"F1 Score : {gru_f1:.4f}")
print(f"Precision: {gru_precision:.4f}")
print(f"Recall   : {gru_recall:.4f}")


[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 4ms/step
Accuracy : 0.9458
F1 Score : 0.9458
Precision: 0.9468
Recall   : 0.9448


In [27]:
gru_model.save("gru_model.h5")


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import TFAutoModel, AutoTokenizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Load dataset
data = pd.read_csv('amazon_reviews.csv')
X = data['text'].astype(str)
y = data['label']

# Split dataset
X_train_texts, X_test_texts, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization and padding (for CNN & LSTM views)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_texts)
X_train_sequences = tokenizer.texts_to_sequences(X_train_texts)
X_test_sequences = tokenizer.texts_to_sequences(X_test_texts)

max_sequence_length = max(max(len(seq) for seq in X_train_sequences), 100)  # safety max

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Transformer tokenization
transformer_model_name = "bert-base-uncased"
transformer_tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
transformer_model = TFAutoModel.from_pretrained(transformer_model_name)

def encode_transformer(texts, tokenizer, max_len):
    encoding = tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='tf'
    )
    return encoding['input_ids']

X_train_transformer = encode_transformer(X_train_texts, transformer_tokenizer, max_sequence_length)
X_test_transformer = encode_transformer(X_test_texts, transformer_tokenizer, max_sequence_length)

# Define Inputs
cnn_input = Input(shape=(max_sequence_length,))
lstm_input = Input(shape=(max_sequence_length,))
transformer_input = Input(shape=(max_sequence_length,), dtype=tf.int32)

# CNN View
cnn_embedding = Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length)(cnn_input)
cnn_conv = Conv1D(filters=128, kernel_size=5, activation='relu')(cnn_embedding)
cnn_pool = GlobalMaxPooling1D()(cnn_conv)

# LSTM View
lstm_embedding = Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length)(lstm_input)
lstm_layer = LSTM(64)(lstm_embedding)

# Transformer View
transformer_embedding = transformer_model(transformer_input)[0]
transformer_pool = GlobalMaxPooling1D()(transformer_embedding)

# Merge all views
merged = Concatenate()([cnn_pool, lstm_layer, transformer_pool])
merged = Dropout(0.5)(merged)
output = Dense(1, activation='sigmoid')(merged)

# Define and compile model
model = Model(inputs=[cnn_input, lstm_input, transformer_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
history = model.fit(
    [X_train_padded, X_train_padded, X_train_transformer],
    y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=32
)

# Evaluate on test data
y_pred_probs = model.predict([X_test_padded, X_test_padded, X_test_transformer])
y_pred = (y_pred_probs > 0.5).astype(int)

# Print metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w