In [1]:
!pip install keras
!pip install tensorflow



In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')
import argparse
import numpy as np
from keras import backend as K
import matplotlib.pyplot as plt
from keras.optimizers import RMSprop
from keras.models import Model, Sequential
from keras.layers import Input, Conv1D, MaxPooling1D, Lambda, LSTM, Dropout, BatchNormalization, Activation
import tensorflow as tf  # Import TensorFlow to use math functions


Mounted at /content/drive


## •	Loading the Dataset

In [3]:
data_path = '/content/drive/My Drive/Gungor_2018_VictorianAuthorAttribution_data-train.csv'
data = pd.read_csv(data_path, encoding='ISO-8859-1')
print(data.head())


                                                text  author
0  ou have time to listen i will give you the ent...       1
1  wish for solitude he was twenty years of age a...       1
2  and the skirt blew in perfect freedom about th...       1
3  of san and the rows of shops opposite impresse...       1
4  an hour s walk was as tiresome as three in a s...       1


This step involves preparing text data for the Siamese Neural Network by tokenizing it at the character level and creating pairs of text samples labeled as "similar" or "dissimilar." Positive pairs (texts by the same author) are labeled as 1, while negative pairs (texts by different authors) are labeled as 0. Texts are converted into numerical sequences, padded to a uniform length, and stored as input pairs with labels, ensuring a balanced dataset for effective training










In [4]:
# Tokenize the text at the character level
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['text'])
def create_pairs(data, tokenizer, max_length=800, num_negative=1, max_pairs_per_author=10, max_authors=10):
    """
    Create positive and negative pairs with optimizations.
    """
    pairs = []
    labels = []
    authors = data.groupby('author')['text'].apply(list).to_dict()
    author_list = list(authors.keys())[:max_authors]  # Limit to a subset of authors

    for author_index, author in enumerate(author_list):
        print(f"Processing author {author_index + 1}/{len(author_list)}: {author}")

        # Limit the number of texts per author
        texts = authors[author][:max_pairs_per_author]

        # Positive pairs
        for i in range(len(texts)):
            for j in range(i + 1, len(texts)):
                seq1 = tokenizer.texts_to_sequences([texts[i]])[0]
                seq2 = tokenizer.texts_to_sequences([texts[j]])[0]
                seq1 = pad_sequences([seq1], maxlen=max_length)[0]
                seq2 = pad_sequences([seq2], maxlen=max_length)[0]
                pairs.append(np.hstack((seq1, seq2)))
                labels.append(1)

        # Negative pairs
        for text in texts:
            for _ in range(num_negative):
                other_author = random.choice([a for a in author_list if a != author])
                other_text = random.choice(authors[other_author])
                seq1 = tokenizer.texts_to_sequences([text])[0]
                seq2 = tokenizer.texts_to_sequences([other_text])[0]
                seq1 = pad_sequences([seq1], maxlen=max_length)[0]
                seq2 = pad_sequences([seq2], maxlen=max_length)[0]
                pairs.append(np.hstack((seq1, seq2)))
                labels.append(0)

    return np.array(pairs), np.array(labels)

The dataset was split into 80% for training and 20% for testing using the `train_test_split` function with a fixed random state for consistent results. From the training and testing data, pairs of texts were created using the `create_pairs` function. A maximum of 10 authors were selected for training, with up to 20 text pairs generated per author. The output included paired sequences (`Xtrain` and `Xtest`) and their binary labels (`Ytrain` and `Ytest`), indicating whether the paired texts were written by the same author or different authors.

In [5]:

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
Xtrain, Ytrain = create_pairs(train_data, tokenizer, max_authors=10, max_pairs_per_author=20)
Xtest, Ytest = create_pairs(test_data, tokenizer, max_authors=10, max_pairs_per_author=20)

Processing author 1/10: 1
Processing author 2/10: 2
Processing author 3/10: 3
Processing author 4/10: 4
Processing author 5/10: 6
Processing author 6/10: 8
Processing author 7/10: 9
Processing author 8/10: 10
Processing author 9/10: 11
Processing author 10/10: 12
Processing author 1/10: 1
Processing author 2/10: 2
Processing author 3/10: 3
Processing author 4/10: 4
Processing author 5/10: 6
Processing author 6/10: 8
Processing author 7/10: 9
Processing author 8/10: 10
Processing author 9/10: 11
Processing author 10/10: 12


In [6]:
#Save the data as .npy files
np.save('Xtrain.npy', Xtrain)
np.save('Ytrain.npy', Ytrain)
np.save('Xtest.npy', Xtest)
np.save('Ytest.npy', Ytest)

print("Data preprocessing complete. Files saved as Xtrain.npy, Ytrain.npy, Xtest.npy, Ytest.npy.")

Data preprocessing complete. Files saved as Xtrain.npy, Ytrain.npy, Xtest.npy, Ytest.npy.


The `LoadData` function loads the training and testing data, splits each input pair into two parts (left and right), and prepares it for the Siamese Neural Network. It also reshapes the data if needed and calculates the size of the inputs to ensure they fit the model. The function returns the processed data and labels, ready for training and testing.


In [7]:
def LoadData(path_Xtrain, path_Ytrain, path_xtest, path_ytest):
    # Load data
    Xtrain = np.load(path_Xtrain)
    Ytrain = np.load(path_Ytrain)
    Xtest = np.load(path_xtest)
    Ytest = np.load(path_ytest)

    # Reshape data to add a channel dimension if it's missing
    Xtrain = Xtrain.reshape((Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest = Xtest.reshape((Xtest.shape[0], Xtest.shape[1], 1))

    # Split input vectors into two parts
    XtrainLeft = Xtrain[:, 0:800, :]
    XtrainRigth = Xtrain[:, 800:1600, :]
    XtestLeft = Xtest[:, 0:800, :]
    XtestRigth = Xtest[:, 800:1600, :]

    longitud = XtrainLeft.shape[1]
    dimension = XtrainLeft.shape[2]

    return XtrainLeft, XtrainRigth, Ytrain, XtestLeft, XtestRigth, Ytest, longitud, dimension

this architecture is designed to extract features from input sequences and represent them in a compact form for similarity comparison. It balances feature extraction (via convolutional layers) with sequential learning (via the LSTM layer), making it effective for tasks like authorship verification.

In [8]:
def SiameseArquitecture(longitud, dimension):

	model = Sequential()
	model.add(Conv1D(75, 12, input_shape=(longitud, dimension)))
	model.add(BatchNormalization())
	model.add(Activation('relu'))
	model.add(Dropout(0.1))
	model.add(Conv1D(50, 12))
	model.add(Activation('relu'))
	model.add(BatchNormalization())
	model.add(Dropout(0.1))
	model.add(MaxPooling1D(4))
	model.add(LSTM(64, recurrent_dropout=0.1, return_sequences=False))
	model.add(Activation('relu'))

	model.summary()
	return model

This function is used to measure the similarity between two feature vectors in the Siamese Neural Network. A smaller distance indicates higher similarity, while a larger distance implies dissimilarity.

In [9]:
def euclidean_distance(vects):
    x, y = vects
    return tf.math.sqrt(tf.math.maximum(tf.math.reduce_sum(tf.math.square(x - y), axis=1, keepdims=True), K.epsilon()))

This function defines the shape of the output for the Euclidean distance layer in a Siamese Neural Network

In [10]:
def eucl_dist_output_shape(shapes):
	shape1, shape2 = shapes
	return (shape1[0], 1)

This function measures the network's ability to differentiate similar and dissimilar pairs effectively(based on a distance threshold of 0.5).It's a simple and effective way to evaluate the model's performance during training and testing.

In [11]:
def compute_accuracy(predictions, labels):
	return labels[predictions.ravel() < 0.5].mean()

This section sets up a Siamese Neural Network. It loads training and testing data from specified file paths, splits the inputs into two parts, and processes them through a shared network. The Euclidean distance is calculated to measure how similar the two inputs are. The model is then compiled with a loss function and optimizer, preparing it for training.

In [12]:
parser = argparse.ArgumentParser()
parser.add_argument("-X", "--path_Xtrain", help="Path X train")
parser.add_argument("-Y", "--path_Ytrain", help="Path Y train")
parser.add_argument("-x", "--path_xtest", help="Path x test")
parser.add_argument("-y", "--path_ytest", help="Path y test")


path_Xtrain = 'Xtrain.npy'
path_Ytrain = 'Ytrain.npy'
path_xtest = 'Xtest.npy'
path_ytest = 'Ytest.npy'

np.random.seed(9)
XtrainLeft, XtrainRigth, Ytrain, XtestLeft, XtestRigth, Ytest, longitud, dimension = LoadData(path_Xtrain, path_Ytrain, path_xtest, path_ytest)


Siamese = SiameseArquitecture(longitud, dimension)
input1 = Input(shape=(longitud,dimension))
input2 = Input(shape=(longitud,dimension))

brenchLeft = Siamese(input1)
brenchRight = Siamese(input2)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([brenchLeft, brenchRight])

rms = RMSprop()
model = Model([input1,input2], distance)
model.compile(loss='mean_squared_error', optimizer=rms)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


This section trains the Siamese Neural Network for 25 epochs, evaluating its performance after each epoch. It tracks training and testing accuracy using the `compute_accuracy` function and monitors progress through printed results. After completing the training, the model is saved to a file named `lstm_model.h5` for future use.

In [13]:
tracc, tsacc = [], []
trloss, tsloss = [], []
# Loop to evaluate 25 epochs
for i in range(25):
	print("->Epoch: ", i+1)
	history = model.fit([XtrainLeft, XtrainRigth], Ytrain,
	validation_data=([XtestLeft, XtestRigth],Ytest), epochs=1, batch_size=512)
	pred = model.predict([XtrainLeft, XtrainRigth])
	tr_acc = compute_accuracy(pred, Ytrain)
	pred = model.predict([XtestLeft, XtestRigth])
	te_acc = compute_accuracy(pred, Ytest)
	print("Train acc: ", tr_acc)
	print("Test acc: ", te_acc)

model.save('lstm_model.h5')

->Epoch:  1
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 12s/step - loss: 0.1574 - val_loss: 0.3315
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 170ms/step
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 174ms/step
Train acc:  0.9034021871202916
Test acc:  0.9040047114252061
->Epoch:  2
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 12s/step - loss: 0.0997 - val_loss: 0.3061
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 175ms/step
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 149ms/step
Train acc:  0.897208985704561
Test acc:  0.9050387596899225
->Epoch:  3
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 12s/step - loss: 0.0941 - val_loss: 0.2953
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 179ms/step
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 171ms/step
Train acc:  0.884643644379133
Test acc:  0.9079552925706772
->Epoch: 



Train acc:  0.013513513513513514
Test acc:  0.8928571428571429


In [60]:
# Texts for testing
text1 = "to fall gently from a small hotel he saw a couple and heard a hasty good night exchanged as they separated at the doorway in opposite directions the man who was dressed whistled for a passing cab and was driven away leaving his companion alone the woman walked slowly looking about as if expecting some one and presently a second man came out of the shadow of a building and walked up to her how much he asked holding out his hand four pounds was the answer in a disappointed voice i told you to insist on five said the man angrily he would not give it i did the best i could what an idiot you are was the response as the speaker took the money well i ve no time to lose i ll see you in the morning he was turning away apparently in great haste when the woman spoke again am i to walk home i haven t a shilling with an expression that sounded like a curse the man took a piece of silver from his pocket and a blunt refusal it into her hand the movement was so abrupt that the woman staggered with the force of it then he left her gray could not doubt the full significance of the proceeding he had witnessed the man was one of those creatures who live on the of women and he had taken from his mistress the for which she had just sacrificed her and her honor she had given it to him not unwillingly only making a slight protest when he left her to walk through the streets at one o clock at night for want of a cab fare it was a proceeding gray had heard of before but of which ha had always entertained some doubt stunned by the scene he was walking on in a sort of when the woman who had no suspicion that her recent actions had been noticed by him hastened on his track it is a cold night sir she said he walked faster but she caught up with him and for nearly a minute into his ears invitations that filled him with horror at last he saw a policeman in the distance and stopping short threatened to give her into if she did not leave him at once the startled look of the woman when he uttered these words made him sorry that he had spoken so sharply he drew out a half sovereign though he had little enough to spare and told her to take it she reached for the money with an that was astonishing and only to be accounted for by hunger or some equally strong reason then with a searching look at his face she vanished up one of the streets love gone astray and gray went back to his temporary lodging wondering if he could trust his eyes and his ears for the man he had seen lying in wait for this poor creature and taking her shameful to the last penny was none other than the one he had rescued in and encountered again in rome the elegant rider he so recently met in the park his old acquaintance chapter ix shall it be you or he f nature had its way with him before lie was aware of it and he fell into a deep sleep when he awoke it was past eight o clock and the room was brightly illuminated with the wintry sun he rubbed his eyes as he realized what must have occurred and was about to go to the street to summon a cab when he heard a familiar voice in the outer entry how early do you expect mr asked the voice about half past nine was the reply of the domestic addressed i must see him sooner said the first voice give me his house address he never business at his house said the domestic and i have orders to give the address to no one shall it be you or he there was an angry and impatient exclamation at this i shall find it in some way said the stranger if i do not i shall be here again within an hour would the gentleman leave his card no the gentleman would not the gentleman in an ill temper and in great haste for he went out with a of the door that shook the building mr gray had risen from the chair in which he slept and stood staring in the direction from which these sounds proceeded the voice he had heard was a familiar one good heaven how could that man appear at every turn in his path it was tolerably clear was one of those eligible young gentlemen whom mr had selected as possible partners for his fair such a fate for the poor girl was too horrible gray seized a sheet of note paper and wrote rapidly dear mr see me without fail before you make any arrangement in the matter of which we were speaking i shall wait in my room till you come do not fail to heed this yours g g take that as fast as you can ride to mr he said when the domestic answered his bell put it into his hands yourself it was only half an hour before the domestic returned bringing the message that mr would be at his office as soon as possible love gone and fretted however as the time dragged on he had come to feel that the of a crime lay in his hands the character of had presented itself to him so that he could not endure the thought that a young girl of the sort the had described of her one fault should be condemned to such a life it would be even better to endure the pangs of to face the cruel world with her guilt exposed than to marry such a man  "  # Author 1
text2 = "she thought feathers the only proper thing what a wretch he was to make her come into that cold room when she was not used to it just to prove that he could have his way would "  # Author 1
text3 = "but it was more and his progress through it was more laborious the wind too which came roaring down from old in the over the broad open surface of the pond made it very hard for him to struggle on he succeeded however at length in fairly gaining the opposite shore without actually losing his track and then after a short walk in a sheltered valley he turned out of the road into the doctor s yard and up to his door and now since he is safely there we will return to george and mary george went back to his shop to finish george and mary in his cradle mary s great anxiety his work promising to return again then and take care of while mary prepared supper he accordingly came in again after half an hour looking up anxiously as he crossed the yard at the signs of increasing violence in the storm as he entered mary was rocking and he came and took her place he had made himself a expressly for the purpose of rocking it was like any ordinary chair except in height the seat being only about eight inches from the floor to accommodate it to his stature he drew this chair up to the side of the cradle said he looking into the cradle and holding out his hands to the little sufferer want to come and rock with father made an effort to reach out his hands but from weakness they dropped back again at his side george took him up gently and laying he child s face upon his shoulder murmured words of sympathy and in his ear mary went to the window oh my george said she what a storm poor he never will get across the pond how could we let him go we did the best we could mary and now you must not make yourself and me anxious and unhappy about it why how can i help feeling anxious said she my poor boy out on a lone road in such a storm as this and night coming on we can help feeling anxious in a measure replied george we can try to think of something else and if an anxious thought comes into your mind don t speak it out speaking it out makes it stronger the child is in god s hands and we have now nothing to do for him mary could not reply to this and she went about her b submission mary makes a torn over work preparing supper but her mind was ill at ease she could not deny george s position that their boy was entirely out of their hands and that god by making it plainly their duty to send him at least as it appeared to them had taken the responsibility of his safety into his own hands but yet after all her heart was not she could not let him go and feel that she had no to do but to await the decision of another george too felt an instinctive parental solicitude which made him follow in imagination every step of s way but his heart was subdued and to the will of god in regard to the result so that he was calm and peaceful in spirit though the swelling emotions of his heart repeatedly filled his eyes he in s ear in words too imperfectly to be heard the good old hymn lift mine eyes from god is all my aid and they who know by experience what it is really to resign every thing into god s hands in an hour of serious danger or trouble will not think it strange that he spent half an hour in a state of very pure and enjoyment in the mean time mary was busy in her preparations for supper and particularly in making a little apple turn over for against he came back an apple turn over was s highest idea of luxury and mary by her interest in making it got over another half hour very well the time however soon arrived when she began to listen for the doctor s bells she began to listen for them a full quarter of an hour before they could have been reasonably expected but this quarter of an hour glided away very soon and the daylight began sensibly to decline she left her work repeatedly to go to the window and look out mary becomes impatient submission impossible george anxiously at last she asked george if it was not time for them to come why no said george hesitating i should hardly expect them yet it is two hours and more already and it is growing dark mary brought her face close to the glass her eyes from the light in the room by putting her hands upon each side of them and straining her sight to look down the road but the snow which filled the air and drove against the window and down on the outside prevented her seeing much i do not believe it is possible for the poor little fellow to get across the pond in such a night as this well mary we have nothing to do but to wait quietly for the end now there is nothing we can do and it is wrong to be restless and anxious about it oh dear said mary sitting down and gazing into the fire with a look of great distress how sorry i am we let him go might have gone and now he will perish in the snow and i shall never have another moment s peace as long as i live but consider mary said george we have done the best we could and he is in god s hands you are not willing to leave him there oh george said she it is too dreadful she rose and walked back and forth across the room with a hurried and "  # Author 2

In [66]:
def predict_author_similarity(model, tokenizer, text1, text2, max_length=800):
    """
    Predict if two given texts are written by the same author.

    Args:
        model: Trained model.
        tokenizer: Tokenizer used during training.
        text1 (str): First text input.
        text2 (str): Second text input.
        max_length (int): Maximum sequence length for padding.

    Returns:
        str: "Same author" or "Different authors".
    """
    # Tokenize and pad the texts
    seq1 = tokenizer.texts_to_sequences([text1])[0]
    seq2 = tokenizer.texts_to_sequences([text2])[0]
    seq1 = pad_sequences([seq1], maxlen=max_length)
    seq2 = pad_sequences([seq2], maxlen=max_length)

    # Predict using the two inputs separately
    prediction = model.predict([seq1, seq2], verbose=0)[0][0]  # Get the prediction (distance)


    print(f"Prediction : {prediction}")
    if prediction > 0.5:
        return "Same author"
    else:
        return "Different authors"


In [67]:
print("Text1 vs Text2:", predict_author_similarity(model, tokenizer, text1, text2))
print("Text1 vs Text3:", predict_author_similarity(model, tokenizer, text1, text3))

Prediction : 0.5954779982566833
Text1 vs Text2: Same author
Prediction : 0.47985658049583435
Text1 vs Text3: Different authors
