In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

In [36]:
text_x_src_path = '../../Dataset/mixed_text_assignment.json'
numb_x_src_path = '../../Dataset/MixedDataSet.json'
y_src_path = '../../DataBook/Mixed_Data_Analyst.xlsx'

In [37]:
text_data = pd.read_json(text_x_src_path)
numb_data = pd.read_json(numb_x_src_path)

In [38]:
df_supervision = pd.read_excel(y_src_path)
plagiarised_array = df_supervision['Plagiarised'].astype(int).values

In [39]:
texts = text_data.iloc[:, :].astype(str).values.tolist()

texts = [[element if element != 'None' else '' for element in sublist] for sublist in texts]

In [40]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>") 
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

max_sequence_len = max([len(seq) for seq in sequences])
data_vectorized = pad_sequences(sequences, maxlen=max_sequence_len)

In [41]:
X_T = data_vectorized
X_N = np.nan_to_num(numb_data.values, nan=0, copy=True).astype(int)
y = plagiarised_array

In [42]:
X_T_train, X_T_test, y_T_train, y_T_test = train_test_split(X_T, y, test_size=0.3, random_state=14)

X_N_train, X_N_test, y_N_train, y_N_test = train_test_split(X_N, y, test_size=0.3, random_state=14)

In [43]:
textP = Perceptron()
textP.fit(X_T_train,y_T_train)

print(f"Text Training data score: {textP.score(X_T_train, y_T_train)}")
print(f"Text Test data score: {textP.score(X_T_test, y_T_test)}")

Text Training data score: 1.0
Text Test data score: 0.85


In [44]:
numbP = Perceptron()
numbP.fit(X_N_train,y_N_train)

print(f"Text Training data score: {numbP.score(X_N_train, y_N_train)}")
print(f"Text Test data score: {numbP.score(X_N_test, y_N_test)}")

Text Training data score: 1.0
Text Test data score: 0.7


In [50]:
text_model = keras.models.Sequential([
    keras.layers.Input(shape=(10315,)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

LOSS_FN = keras.losses.BinaryCrossentropy()

text_model.compile(optimizer='adam', loss=LOSS_FN, metrics=['accuracy'])

text_model.fit(X_T_train, y_T_train, epochs=5, validation_data=(X_T_test, y_T_test))

# threshold = 0.5
# y_pred_text = text_model.predict(X_T_test)
# y_pred_text_binary = np.where(y_pred_text >= threshold, 1, 0)
# y_pred_text_binary_flat = y_pred_text_binary.flatten()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


array([[0.00000000e+00],
       [4.01294005e-27],
       [3.95603440e-34],
       [9.89313781e-01],
       [1.81896024e-18],
       [6.98627354e-14],
       [0.00000000e+00],
       [3.81166282e-16],
       [5.63906311e-10],
       [4.30180904e-08],
       [1.00388963e-04],
       [8.76297560e-31],
       [3.03808036e-36],
       [9.90752444e-27],
       [0.00000000e+00],
       [9.68705565e-02],
       [2.89682448e-01],
       [2.48905952e-12],
       [7.21581877e-24],
       [2.66460338e-05],
       [3.34713830e-25],
       [6.39766684e-09],
       [1.04581655e-26],
       [0.00000000e+00],
       [3.31481195e-18],
       [2.46422197e-12],
       [0.00000000e+00],
       [3.23099535e-23],
       [2.55065570e-05],
       [2.49235634e-27],
       [9.15361627e-04],
       [2.67707524e-32],
       [6.68942847e-14],
       [4.01811398e-36],
       [0.00000000e+00],
       [0.00000000e+00],
       [1.58136443e-34],
       [5.84838532e-32],
       [9.99676228e-01],
       [0.00000000e+00]],

In [58]:
predicted_classes = (text_model.predict(X_T_test).flatten() > 0.5).astype(int)
ac_sc = accuracy_score(y_T_test, predicted_classes)
print(ac_sc)
for text, actual in zip(predicted_classes, y_T_test):
    print(f" Textual: {text} Actual: {actual}")

0.9
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 1 Actual: 1
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 1
 Textual: 0 Actual: 1
 Textual: 0 Actual: 0
 Textual: 0 Actual: 1
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 1
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 0 Actual: 0
 Textual: 1 Actual: 1
 Textual: 0 Actual: 0


In [46]:
numb_model = keras.models.Sequential([
    keras.layers.Input(shape=(75711,)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

LOSS_FN = keras.losses.BinaryCrossentropy()

numb_model.compile(optimizer='adam', loss=LOSS_FN, metrics=['accuracy'])

numb_model.fit(X_N_train, y_N_train, epochs=5, validation_data=(X_N_test, y_N_test))

threshold = 0.5
y_pred_numb = numb_model.predict(X_N_test)
y_pred_numb_binary = np.where(y_pred_numb >= threshold, 1, 0)
y_pred_numb_binary_flat = y_pred_numb_binary.flatten()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [49]:
for numb, text, actual in zip(y_pred_numb_binary_flat, y_pred_numb_binary_flat, y_N_test):
    print(f"Number: {numb}, Textual: {text} Actual: {actual}")

Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 1
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 1
Number: 1, Textual: 1 Actual: 1
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 1
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 1, Textual: 1 Actual: 0
Number: 1, Textual: 1 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 1
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 1, Textual: 1 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 

In [48]:
print(text_model.evaluate(X_T_test,y_T_test)[1])
print(numb_model.evaluate(X_N_test,y_N_test)[1])

0.925000011920929
0.7749999761581421
