In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from keras.models import Model
from keras.layers import *
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

In [20]:
# Input file locations, relative to the notebook's working directory:
#   text_x_src_path - JSON read into `text_data`
#   numb_x_src_path - JSON read into `numb_data`
#   y_src_path      - Excel workbook holding the 'Plagiarised' label column
text_x_src_path, numb_x_src_path, y_src_path = (
    '../../Dataset/mixed_text_assignment.json',
    '../../Dataset/MixedDataSet.json',
    '../../DataBook/Mixed_Data_Analyst.xlsx',
)

In [21]:
# Load the text feature table as-is.
text_data = pd.read_json(text_x_src_path)

# Load the numeric feature table and keep every column except the trailing 59022.
# NOTE(review): the reason for dropping exactly 59022 columns is not visible
# in this notebook - confirm against the dataset layout.
numb_data = pd.read_json(numb_x_src_path).iloc[:, :-59022]

In [22]:
# Read the supervision workbook and pull the 'Plagiarised' column out as an
# integer NumPy array; this becomes the label vector for every model below.
df_supervision = pd.read_excel(y_src_path)
plagiarised_array = df_supervision['Plagiarised'].astype(int).to_numpy()

In [23]:
# One list of cell strings per DataFrame row. Every cell is stringified first,
# and any cell whose string form is exactly 'None' is replaced by ''.
texts = [
    ['' if cell == 'None' else cell for cell in row]
    for row in text_data.astype(str).to_numpy().tolist()
]

In [24]:
# Fit the tokenizer on the row lists and turn each row into a sequence of
# integer ids; tokens outside the fitted vocabulary map to "<OOV>".
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad all sequences to the length of the longest one so they form a matrix.
max_sequence_len = max(map(len, sequences))
data_vectorized = pad_sequences(sequences, maxlen=max_sequence_len)

In [25]:
# Feature matrices and label vector shared by all models below.
X_T = data_vectorized  # padded token-id sequences
# NaNs in the numeric table become 0 before the cast to int.
X_N = np.nan_to_num(numb_data.to_numpy(), nan=0).astype(int)
y = plagiarised_array
# SMOTE oversampling (imblearn) is currently disabled:
# from imblearn.over_sampling import SMOTE
# ros = SMOTE()
# X_resampled, y_resampled = ros.fit_resample(X_T, y)

In [26]:
# Both splits use the same test fraction and seed, so the text and numeric
# matrices are partitioned with identical settings against the same labels.
split_options = dict(test_size=0.2, random_state=32)

X_T_train, X_T_test, y_T_train, y_T_test = train_test_split(X_T, y, **split_options)
X_N_train, X_N_test, y_N_train, y_N_test = train_test_split(X_N, y, **split_options)

In [27]:
# Baseline: a linear Perceptron fitted on the padded token-id sequences.
textP = Perceptron()
textP.fit(X_T_train, y_T_train)

# Mean accuracy on the training split and on the held-out split.
text_train_score = textP.score(X_T_train, y_T_train)
text_test_score = textP.score(X_T_test, y_T_test)
print(f"Text Training data score: {text_train_score}")
print(f"Text Test data score: {text_test_score}")

Text Training data score: 1.0
Text Test data score: 0.8888888888888888


In [28]:
# Baseline: a linear Perceptron fitted on the numeric feature matrix.
numbP = Perceptron()
numbP.fit(X_N_train, y_N_train)

# Hard class predictions for the held-out numeric split.
percepPredict = numbP.predict(X_N_test)

# These scores belong to the numeric model; the labels previously said "Text"
# (copied from the text-Perceptron cell), which made the two reports
# indistinguishable.
print(f"Numeric Training data score: {numbP.score(X_N_train, y_N_train)}")
print(f"Numeric Test data score: {numbP.score(X_N_test, y_N_test)}")

Text Training data score: 0.9528301886792453
Text Test data score: 0.9259259259259259


In [29]:
# Feed-forward binary classifier on the padded token-id sequences.
# The input width is read from the training matrix instead of being hard-coded,
# so the model keeps working when the padded sequence length changes.
text_model = keras.models.Sequential([
    keras.layers.Input(shape=(X_T_train.shape[1],)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')  # probability of the positive class
])

LOSS_FN = keras.losses.BinaryCrossentropy()

text_model.compile(optimizer='adam', loss=LOSS_FN, metrics=['accuracy'])

# The test split doubles as validation data here; it is only reported per
# epoch and does not drive any callback in this cell.
text_model.fit(X_T_train, y_T_train, epochs=5, validation_data=(X_T_test, y_T_test))

# Threshold the sigmoid outputs at 0.5 to obtain flat 0/1 predictions.
y_pred_text_binary_flat = (text_model.predict(X_T_test).flatten() > 0.5).astype(int)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
# Feed-forward binary classifier on the numeric feature matrix.
NPH = 32  # units per hidden layer (also used by the combined model cell)

# The input width is read from the training matrix instead of being hard-coded,
# and the six identical hidden layers are generated rather than repeated.
numb_model = keras.models.Sequential(
    [keras.layers.Input(shape=(X_N_train.shape[1],))]
    + [keras.layers.Dense(NPH, activation='relu') for _ in range(6)]
    + [keras.layers.Dense(1, activation='sigmoid')]  # probability of the positive class
)

LOSS_FN = keras.losses.BinaryCrossentropy()

numb_model.compile(optimizer='adam', loss=LOSS_FN, metrics=['accuracy'])

# The test split doubles as validation data here; it is only reported per
# epoch and does not drive any callback in this cell.
numb_model.fit(X_N_train, y_N_train, epochs=5, validation_data=(X_N_test, y_N_test))

# Threshold the sigmoid outputs at 0.5 to obtain flat 0/1 predictions.
y_pred_numb_binary_flat = (numb_model.predict(X_N_test).flatten() > 0.5).astype(int)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:
# Two-branch binary classifier: one branch per feature matrix, merged before
# the output. Input widths come from the training matrices rather than
# hard-coded numbers, and layers are accessed through `keras.layers` like the
# Sequential models in the cells above.
inputA = keras.layers.Input(shape=(X_T_train.shape[1],))  # text features
inputB = keras.layers.Input(shape=(X_N_train.shape[1],))  # numeric features

# The first branch operates on the first input: six Dense(NPH) ReLU layers.
text_branch = inputA
for _layer in range(6):
    text_branch = keras.layers.Dense(NPH, activation="relu")(text_branch)

# The second branch operates on the second input: six Dense(NPH) ReLU layers.
# It deliberately does NOT reuse the name `y`, which holds the label array
# assigned earlier; overwriting it breaks any re-run of the split cell.
numb_branch = inputB
for _layer in range(6):
    numb_branch = keras.layers.Dense(NPH, activation="relu")(numb_branch)

# Combine the output of the two branches.
combined = keras.layers.concatenate([text_branch, numb_branch])

# Apply a small fully connected layer, then a sigmoid unit that outputs the
# probability of the positive class (binary classification, not regression).
z = keras.layers.Dense(2, activation="relu")(combined)
z1 = keras.layers.Dense(1, activation="sigmoid")(z)

# The model accepts the inputs of the two branches and outputs a single value.
model = keras.Model(inputs=[inputA, inputB], outputs=z1)

LOSS_FN = keras.losses.BinaryCrossentropy()

model.compile(optimizer='adam', loss=LOSS_FN, metrics=['accuracy'])

In [None]:
# Print, row by row, the numeric network's prediction, the text network's
# prediction and the true label for the test split.
for numb_pred, text_pred, true_label in zip(
    y_pred_numb_binary_flat, y_pred_text_binary_flat, y_N_test
):
    print(f"Number: {numb_pred}, Textual: {text_pred} Actual: {true_label}")

Number: 0, Textual: 0 Actual: 0
Number: 1, Textual: 1 Actual: 1
Number: 1, Textual: 0 Actual: 1
Number: 0, Textual: 0 Actual: 1
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 1, Textual: 0 Actual: 1
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 1, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 1, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0
Number: 1, Textual: 1 Actual: 1
Number: 1, Textual: 0 Actual: 0
Number: 0, Textual: 0 Actual: 0


In [None]:
# Report test-set accuracy for the text network, then the numeric network.
# Both models were compiled with metrics=['accuracy'], so element 1 of the
# evaluate() result is the accuracy (element 0 is the loss).
for fitted_model, test_features, test_labels in (
    (text_model, X_T_test, y_T_test),
    (numb_model, X_N_test, y_N_test),
):
    print(fitted_model.evaluate(test_features, test_labels)[1])

0.8888888955116272
0.8518518805503845
