In [1]:
import pandas as pd

In [None]:
# Locations of the training and test CSV files.
train_file_path = 'final_train.csv'
test_file_path = 'final_test.csv'

# Both files are read without a header row (header=None), so the two columns
# are given explicit names.
train_df, test_df = (
    pd.read_csv(csv_path, header=None, names=['text', 'label'])
    for csv_path in (train_file_path, test_file_path)
)

# Show the first ten training rows as a quick sanity check.
print(train_df.head(10))

In [5]:
# Pull the text and label columns out of each frame.
train_text, train_label = train_df['text'], train_df['label']
test_text, test_label = test_df['text'], test_df['label']

In [40]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [41]:
# Upper bound on the number of distinct words kept when texts are converted
# to integer ids; also used as the embedding layer's input dimension.
vocab_size = 50000

# Build the tokenizer through the fully qualified class path instead of the
# imported name `Tokenizer`: this cell rebinds `Tokenizer` to the fitted
# instance (alias below), so calling `Tokenizer(...)` here would raise a
# TypeError the second time the cell is executed.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(train_text)

# Backward-compatible alias: other cells call methods on `Tokenizer`.
Tokenizer = tokenizer

# Mapping from word to integer id learned from the training texts.
word_index = tokenizer.word_index

In [None]:
import matplotlib.pyplot as plt

def _count_words(text):
    """Return how many whitespace-separated tokens `text` contains."""
    return len(text.split())


# Number of whitespace-separated words in each training text.
sequence_lengths = train_df['text'].map(_count_words)

# Histogram of the word counts across the training texts.
fig, ax = plt.subplots()
ax.hist(sequence_lengths, bins=150)
ax.set_title('Distribution of Sequence Lengths')
ax.set_xlabel('Sequence Length')
ax.set_ylabel('Frequency')
plt.show()

In [42]:
# `Tokenizer` is the fitted tokenizer instance here (the tokenizer cell rebinds
# the name). Its vocabulary was already learned with `fit_on_texts`, so no
# further fitting is done: `fit_on_sequences` expects lists of integer ids,
# not raw strings, and has no effect on `texts_to_sequences`.
train_sequences = Tokenizer.texts_to_sequences(train_text)

# Pad or truncate every sequence at the end so each row has exactly 1000 ids.
padded = pad_sequences(train_sequences, maxlen=1000, padding='post', truncating='post')

In [43]:
# Encode the test texts with the tokenizer fitted on the training texts only.
# Nothing is fitted on the test split: `fit_on_sequences` expects lists of
# integer ids, not raw strings, and has no effect on `texts_to_sequences`.
test_sequences = Tokenizer.texts_to_sequences(test_text)

# Same end-padding / end-truncation to 1000 ids as the training inputs.
testing_padded = pad_sequences(test_sequences, maxlen=1000, padding='post', truncating='post')

In [44]:
# Size of each word vector and the fixed length of every input sequence.
embedding_dim = 200
max_length = 1000

# Layers are stacked in order: word embedding -> bidirectional LSTM that
# returns the full sequence -> max over the time axis -> dense ReLU layer with
# dropout -> a single sigmoid unit for the binary label.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.GlobalMaxPooling1D())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [45]:
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Normalise every label to a string so the encoder sees one consistent type.
train_label = list(map(str, train_label))
test_label = list(map(str, test_label))

# The label -> integer mapping is learned from the training labels only and
# then applied unchanged to the test labels; the results are stored as
# float32 arrays.
encoder = LabelEncoder()
train_label = np.asarray(encoder.fit_transform(train_label), dtype='float32')
test_label = np.asarray(encoder.transform(test_label), dtype='float32')

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has gone 3 epochs without improving,
# and roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train for at most 30 epochs.
# NOTE(review): the test split is passed as validation data, so early stopping
# is driven by the test set — confirm this is intended.
history = model.fit(
    padded,
    train_label,
    epochs=30,
    validation_data=(testing_padded, test_label),
    verbose=1,
    callbacks=[early_stopping],
)

In [None]:
# Reuse the tokenizer that was fitted for training. The tokenizer cell binds
# the fitted instance to the name `Tokenizer`; building a fresh tokenizer here
# with a different `num_words` and no OOV token would assign different integer
# ids to the same words than the ones the model was trained on.
tokenizer = Tokenizer

# Hand-written sample texts to classify.
sentences = [
    'Respected sir, this is very heartly and peaceful message that \nI want to convey to you regarding my outstanding fees. I am a student of your school and I\n am in class 10th. I have been studying in your school for\n the last 5 years. I have always been a good student of \nyours',
    'This snippet ensures that TensorFlow \ndynamically allocates GPU memory as needed, preventing it \nfrom occupying the entire GPU memory at once.',
    # Double-quoted so the apostrophe survives; 'you''ll' is implicit string
    # concatenation and silently produced "youll".
    "The more you eat sumeed the more ubaid you'll get",
    'The american army recently invaded iraq and killed many innocent people have also stepped foot to manufacture drugs that might harm innocent people of Palestine.',
    'Our data is not AI generated instead it is human written. BELIEVE ME!'
]

# Convert texts to sequences of integer word ids.
sample_sequences = tokenizer.texts_to_sequences(sentences)

# Pad and truncate exactly like the training inputs: at the end, to max_length.
padding_type = 'post'
sample_padded = pad_sequences(
    sample_sequences, padding=padding_type, truncating='post', maxlen=max_length)

# Predicting: one sigmoid output per sentence.
classes = model.predict(sample_padded)

# NOTE(review): which class an output above 0.5 stands for depends on the
# order the LabelEncoder assigned to the labels — confirm that 1 means
# "AI generated".
for sentence, value in zip(sentences, classes):
    print("\n\nText ", sentence)
    if value > 0.5:
        print("\nAI Generated :Positive with value of ", value)
    else:
        print("\nHuman Written: Negative with value of ", value)

In [None]:
# Write the trained model to 'model.h5'.
# NOTE(review): only the model is saved in this cell; the fitted tokenizer is
# not persisted here — confirm it is stored elsewhere if it is needed later.
model.save(filepath='model.h5')