In [35]:
import pandas as pd
import tensorflow as tf
import numpy as np

# --- Configuration ---------------------------------------------------------
# Values shared by several steps below, named once so that the tokenizer,
# the padding calls and the embedding layer cannot drift apart.
SEED = 42
VOCAB_SIZE = 80000   # num_words for the Tokenizer and input_dim of the Embedding
MAX_LEN = 255        # length every tokenized comment is padded/truncated to
TEXT_ENCODING = "ISO-8859-1"

# Seed NumPy and TensorFlow so repeated runs of this cell are more comparable
# (exact bit-for-bit reproducibility may still depend on hardware/backend).
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --- Load dataset ----------------------------------------------------------
dataset = pd.read_csv('textdata/reddit_200k_train.csv', encoding=TEXT_ENCODING)
# Missing comment bodies are parsed by pandas as NaN; coerce them to empty
# strings so every item handed to the tokenizer is a str.
raw_train_input = dataset['body'].fillna('').astype(str)
raw_train_output = dataset['REMOVED']

# --- Tokenization ----------------------------------------------------------
# The vocabulary is fitted on the training text only; the same tokenizer is
# reused for the test set further down.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(raw_train_input)
word_index = tokenizer.word_index

# --- Preprocess data -------------------------------------------------------
# Texts -> integer id sequences -> fixed-width matrix, zero-padded at the end.
X_train = tokenizer.texts_to_sequences(raw_train_input)
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding='post', maxlen=MAX_LEN)
y_train = np.array(raw_train_output, dtype=np.uint8)

# --- Define model ----------------------------------------------------------
# Embedding -> average over the sequence axis -> small dense head with a
# single sigmoid output for the binary REMOVED label.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 16),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.2),  # Dropout for regularization
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# --- Compile model ---------------------------------------------------------
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# --- Train model -----------------------------------------------------------
model.fit(X_train, y_train, epochs=20, batch_size=512, verbose=1)

# --- Evaluate model --------------------------------------------------------
# The test set goes through exactly the same preprocessing as the training
# set: same fitted tokenizer, same padding side, same MAX_LEN.
test_dataset = pd.read_csv('textdata/reddit_200k_test.csv', encoding=TEXT_ENCODING)
raw_test_input = test_dataset['body'].fillna('').astype(str)
raw_test_output = test_dataset['REMOVED']

X_test = tokenizer.texts_to_sequences(raw_test_input)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding='post', maxlen=MAX_LEN)
y_test = np.array(raw_test_output, dtype=np.uint8)

model.evaluate(X_test, y_test)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Predicted probability: [[0.64587134]]


In [43]:
# Score one hand-written comment with the trained classifier.
sample_text = ["you are a good person"]

# Reuse the fitted tokenizer, then pad with the same settings the training
# data was padded with (post-padding, length 255).
encoded_text = tokenizer.texts_to_sequences(sample_text)
padded_text = tf.keras.preprocessing.sequence.pad_sequences(
    encoded_text,
    maxlen=255,
    padding='post',
)

# The model ends in a single sigmoid unit, so predict() yields one value per
# input row.
result = model.predict(padded_text)
print("Predicted probability:", result)

Predicted probability: [[0.57018346]]


In [40]:
# Evaluate on the held-out test set. The model was compiled with a single
# metric ('accuracy'), so evaluate() gives back the loss and that metric.
loss, accuracy = model.evaluate(
    x=X_test,
    y=y_test,
)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.6996221542358398
