In [49]:
import pandas as pd
import numpy as np
import json
from tensorflow import keras
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt


In [50]:
# Input locations: the JSON file that is loaded as the feature table, and the
# Excel workbook whose 'Plagiarised' column supplies the labels.
x_src_path = '../Dataset/assignment1New.json'
y_src_path = '../DataBook/Assignment1_Data_Analyst.xlsx'

In [51]:
# Load the feature table from the JSON source into a DataFrame.
data = pd.read_json(x_src_path)


In [53]:
# Read the supervision workbook, then extract its 'Plagiarised' column as an
# integer NumPy array.
df_supervision = pd.read_excel(y_src_path)
plagiarised_array = (
    df_supervision['Plagiarised']
    .astype(int)
    .to_numpy()
)

In [54]:
# Feature matrix: the raw cell values of the loaded table.
X = data.to_numpy()
# Target vector: the integer plagiarism labels.
y = plagiarised_array

In [55]:
# Replace every NaN with the mean of its column.
# NOTE(review): the means are computed over every row of X, including rows
# that are later held out for testing.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer.fit(X)
X_imputed = imputer.transform(X)

In [56]:
# Hold out 20% of the samples for testing (fixed seed for reproducibility).
# The split is made on the raw feature matrix and the mean-imputer is fitted
# on the training rows only, so no statistic computed from held-out rows leaks
# into training. The test rows are filled with the training-set column means.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
split_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = split_imputer.fit_transform(X_train_raw)
X_test = split_imputer.transform(X_test_raw)

In [None]:
# Baseline: train a single-layer perceptron on the training split.

# Collapse every sample to a 1-D feature vector. For input that is already
# 2-D (n_samples, n_features) this reshape leaves the array unchanged; the
# *_flat names are kept because later cells use them.
x_train_flat = X_train.reshape(X_train.shape[0], -1)
x_test_flat = X_test.reshape(X_test.shape[0], -1)

# create and train the perceptron
perceptron = Perceptron()
perceptron.fit(x_train_flat, y_train)

# print mean accuracy on the training split and on the held-out test split
print(f"Training data score: {perceptron.score(x_train_flat, y_train)}")
print(f"Test data score: {perceptron.score(x_test_flat, y_test)}")


In [57]:
# Build a single-layer softmax classifier with Keras: one Dense
# (fully-connected) layer that maps the input features straight to class
# probabilities.

# One output unit per class. Sparse categorical cross-entropy expects integer
# labels in [0, NUM_CLASSES), so the width is derived from the largest label
# instead of being hard-coded.
NUM_CLASSES = int(np.max(y)) + 1

model = keras.models.Sequential([
    keras.layers.Dense(NUM_CLASSES, activation='softmax') # output layer
])

# we'll use sparse_categorical_crossentropy as the loss function
# (it takes integer class labels directly, no one-hot encoding needed)
LOSS_FN = keras.losses.sparse_categorical_crossentropy

# compile the model with the Adam optimizer and track accuracy
model.compile(optimizer='adam',loss=LOSS_FN,metrics=['accuracy'])

# train on the training split for 5 epochs; the test split is passed as
# validation data so its loss/accuracy are reported after every epoch
model.fit(x_train_flat, y_train, epochs=5, validation_data=(x_test_flat, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [58]:
# Build a family of models of increasing depth to compare on the same data.
# Name format: '<n>d'     = n Dense layers in total (output layer included);
#              '<m>c<n>d' = m convolutional layers followed by n Dense layers.
modelNames = [
    '1d',
    '2d',
    '3d',
    '1c1d',
    '1c2d',
    '2c2d',
]

# in all models we'll have the same number of nodes per hidden layer
NODES_PER_HIDDEN_LAYER = 64

# The models are trained on the 2-D feature matrix X_train
# (n_samples, n_features), so the input and output sizes are taken from the
# data rather than hard-coded to an image shape and a fixed class count.
N_FEATURES = X_train.shape[1]
NUM_CLASSES = int(np.max(y)) + 1


def build_model(n_conv_layers, n_hidden_dense):
    """Return an uncompiled Sequential classifier for flat feature vectors.

    Parameters
    ----------
    n_conv_layers : int
        Number of Conv1D layers (NODES_PER_HIDDEN_LAYER filters, kernel size 3)
        applied before the Dense layers. 0 builds a purely Dense network.
    n_hidden_dense : int
        Number of hidden Dense layers (NODES_PER_HIDDEN_LAYER units, ReLU)
        placed before the softmax output layer.

    Returns
    -------
    keras.models.Sequential
        Model taking input of shape (N_FEATURES,) and producing NUM_CLASSES
        softmax probabilities.
    """
    layers = [keras.Input(shape=(N_FEATURES,))]

    if n_conv_layers:
        # Conv1D needs a channel axis: (n_features,) -> (n_features, 1).
        # NOTE(review): this slides the kernel along the feature axis, which
        # only makes sense if neighbouring features are related - confirm, or
        # drop the convolutional variants for this dataset.
        layers.append(keras.layers.Reshape((N_FEATURES, 1)))
        for _ in range(n_conv_layers):
            # padding='same' keeps the length unchanged, so stacking layers
            # works regardless of how many features there are.
            layers.append(
                keras.layers.Conv1D(
                    NODES_PER_HIDDEN_LAYER,
                    kernel_size=3,
                    activation='relu',
                    padding='same',
                )
            )
        layers.append(keras.layers.Flatten())

    for _ in range(n_hidden_dense):
        layers.append(keras.layers.Dense(NODES_PER_HIDDEN_LAYER, activation='relu'))

    layers.append(keras.layers.Dense(NUM_CLASSES, activation='softmax'))  # output layer
    return keras.models.Sequential(layers)


# (convolutional layers, hidden dense layers) for each entry of modelNames,
# in the same order
MODEL_SPECS = [
    (0, 0),  # 1d   - just the output layer
    (0, 1),  # 2d   - 1 hidden layer
    (0, 2),  # 3d   - 2 hidden layers
    (1, 0),  # 1c1d - 1 convolutional layer
    (1, 1),  # 1c2d - 1 convolutional layer and 1 hidden dense layer
    (2, 1),  # 2c2d - 2 convolutional layers and 1 hidden dense layer
]

models = [build_model(n_conv, n_dense) for n_conv, n_dense in MODEL_SPECS]



Accuracy: 0.6363636363636364
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.83      0.71         6
           1       0.67      0.40      0.50         5

    accuracy                           0.64        11
   macro avg       0.65      0.62      0.61        11
weighted avg       0.64      0.64      0.62        11



In [None]:
# Compile every candidate with the same optimizer, loss and metric so their
# scores are directly comparable. The loop variable is deliberately not named
# `model`: at notebook level that would rebind the `model` built in an
# earlier cell to the last entry of `models`.
for candidate in models:
    candidate.compile(optimizer='adam', loss=LOSS_FN, metrics=['accuracy'])

In [None]:
TRAINING_EPOCHS = 5

# Train every candidate on the training split.
# - The loop variable is not named `model`, so the notebook-level `model`
#   from an earlier cell is not rebound.
# - Only the last len(models) names are paired with the models: a later cell
#   prepends 'perceptron' to modelNames in place, and without the slice a
#   re-run of this cell would label each model with its neighbour's name.
architecture_names = modelNames[-len(models):]
for candidate, name in zip(models, architecture_names):
    print(f'training model {name}')
    candidate.fit(X_train, y_train, epochs=TRAINING_EPOCHS)


In [None]:
# Evaluate every network on the held-out split. The models are compiled with a
# single 'accuracy' metric, so evaluate() yields [loss, accuracy] and index 1
# is the accuracy.
scores = [
    net.evaluate(X_test, y_test)[1]
    for net in models
]


In [None]:
# Add the perceptron baseline to the front of the comparison lists.
# Each list is guarded separately so the cell is safe to re-run: the name is
# added only once, and the score is added only when `scores` is one entry
# short (e.g. after the evaluation cell has rebuilt it).
PERCEPTRON_LABEL = 'perceptron'

if PERCEPTRON_LABEL not in modelNames:
    modelNames.insert(0, PERCEPTRON_LABEL)
if len(scores) < len(modelNames):
    scores.insert(0, perceptron.score(x_test_flat, y_test))

# print all models and scores
for name, score in zip(modelNames, scores):
    print(name, score)

In [None]:
# Bar chart of test accuracy per model. Accuracy lies in [0, 1], so the full
# range is shown; a fixed lower limit of 0.8 would hide every bar below 80%.
fig, ax = plt.subplots()
ax.bar(modelNames, scores)
ax.set_ylim(0.0, 1.0)
ax.set_xlabel('model')
ax.set_ylabel('test accuracy')
ax.set_title('Test accuracy by model')
plt.show()