In [13]:
import pandas as pd
import numpy as np
import keras
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, GlobalAvgPool1D
from keras.preprocessing.text import Tokenizer
# from keras._tf_keras.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
from keras import metrics

In [9]:
'''Step1 Data Processing'''
# Read the data.csv file into data frame
data_df = pd.read_csv("processed_data.csv", encoding='unicode_escape')
data1 = data_df.loc[:, "text"]        # content
data2 = data_df.loc[:, "generated"]   # labels
data_new = pd.concat([data1, data2], axis=1)
# print(data_new)

# Split the input data and label into train and test ones
def split_train_test(input_data, ratio_test):
    np.random.seed(314)
    # Randomly shuffle the order of the data points
    shuffled_indices = np.random.permutation(len(input_data))
    test_size = int(len(input_data) * ratio_test)
    test_indices = shuffled_indices[:test_size]
    train_indices = shuffled_indices[test_size:]
    return input_data.iloc[train_indices], input_data.iloc[test_indices]

# Split the data and labels into training and testing
train_data, test_data = split_train_test(data1, 0.2)
train_label, test_label = split_train_test(data2, 0.2)

# The input data is the text in format of string
texts_train = train_data.astype(str)
texts_test = test_data.astype(str)

In [10]:
# change the text and label into the numerical format
tokenizer_train = Tokenizer()
tokenizer_train.fit_on_texts(texts_train)
sequences_train = tokenizer_train.texts_to_sequences(texts_train)
# print(sequences_train)
tokenizer_test = Tokenizer()
tokenizer_test.fit_on_texts(texts_test)
sequences_test = tokenizer_test.texts_to_sequences(texts_test)
# print(sequences_test)

# padding the sequence into the same length
data_train = pad_sequences(sequences_train)
# print(data_train)
data_test = pad_sequences(sequences_test)
# print(data_test)

In [11]:
'''Step2: Convolutional Neural Network (CNN)'''
# the size of vocabulary list
vocab_size = len(data_train)
# the dimension of words after embedding
embedding_dim = 128
# the maximum number of words of each input line
max_length = 100 # longer part will be thrown, shorter part will be assigned as 0

# initialize the model
# Model structure: 
model = Sequential([
    # Embedding layer, transform the sequence into vectors
    Embedding(vocab_size, embedding_dim,
              input_length=max_length
              ),
    # 1-Dimension Convolution layers
    # Conv1D(filters, kernel_size, padding='valid', activation=None)
    Conv1D(256, 5, padding='same'
           ),
    GlobalMaxPooling1D(),
    # Full-connected layer
    # Dense(units, activation=None)
    # Output layer
    Dense(1,
          activation='sigmoid'
          )
    ])



In [14]:
# compiling the model
model.compile(loss='binary_crossentropy',   # loss function
              optimizer='adam',             # optimizer
              metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])         # assessment criteria

# Train the CNN model
model.fit(data_train, train_label,
          epochs=2,                        # epoch is the training round
          validation_split=0.2  # The ratio of evaluate set in training set
          )
# Get the summary of the CNN model
model.summary()

# test and evaluate the model, accacz = model.evaluate(data_test, test_label)
predict_result = model.predict(data_test)   # The classifying result of test data
# print(type(predict_result[0]))
# print(predict_result)
loss, acc = model.evaluate(data_test, test_label)
print("accuracy: ", acc)
print("loss: ", loss)

length = len(test_label)
# print(len(predict_result) == length)
score = 0
for i in range(length):
    current = 0
    if predict_result[i][0] > 0.5:
        current = 1
    if current == test_label.tolist()[i]:
        score += 1
print('Calculated accuracy: ', score/length)


Epoch 1/2
[1m583/583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 74ms/step - accuracy: 0.8743 - loss: 0.2437 - precision: 0.9213 - recall: 0.7027 - val_accuracy: 0.9863 - val_loss: 0.0429 - val_precision: 0.9787 - val_recall: 0.9871
Epoch 2/2
[1m583/583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 81ms/step - accuracy: 0.9950 - loss: 0.0169 - precision: 0.9936 - recall: 0.9937 - val_accuracy: 0.9863 - val_loss: 0.0447 - val_precision: 0.9711 - val_recall: 0.9952


[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.6387 - loss: 1.1827 - precision: 0.5424 - recall: 0.6915


ValueError: too many values to unpack (expected 2)