In [None]:
!pip install textstat



In [93]:
import pandas as pd
import tensorflow as tf

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.util import ngrams

import zlib
import math
import textstat
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report, cohen_kappa_score
from scipy.special import expit

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def punct(document):
    """
    Creates set of counts of punctuation marks
    :param document:
    :return:
    """
    punct_counts = [document.count("!") / len(document), document.count("?") / len(document),
                    document.count(";") / len(document), document.count(":") / len(document),
                    document.count(",") / len(document), document.count(".") / len(document),
                    document.count("-") / len(document), document.count("'") / len(document)]
    return punct_counts

def punct_similarity(text1, text2):
    punct1 = punct(text1)
    punct2 = punct(text2)
    return cosine_similarity([punct1], [punct2])[0][0]

def compression_based_dissimilarity(doc1, doc2, encoding="utf-8"):
    """
    Get the CDM score (Compression-based Dissimilarity Method) for two documents:
      CDM(x, y) =  (C(x) + C(y)) / C(xy)
      (see Zhensi-Li p. 19)

    Compression algorithm is zlib's gzip.

    :param doc1: the first document (as string)
    :param doc2: the second document (as string)
    :return: the CDM score
    """
    bytes_doc1 = bytes(doc1, encoding)
    bytes_doc2 = bytes(doc2, encoding)
    return (len(zlib.compress(bytes_doc1)) / len(bytes_doc1)) + (len(zlib.compress(bytes_doc2)) / len(bytes_doc2)) / (len(zlib.compress(bytes_doc1 + bytes_doc2)) / len(bytes_doc1 + bytes_doc2))

def char_ngram_similarity(doc1, doc2, n, top=100):
    """
    Gives a positive dissimilarity score of two documents with respect to their top m character n-grams distribution.
    If the value is 0 the documents are identical (or at least share an identical top m character n-grams distribution.
    :param doc1:
    :param doc2:
    :param n: the n-gram length
    :param top: Only use the N most frequent n-grams from each document.
    :return: A positive dissimilarity score. If the value is 0 the documents are identical (or at least their top m
             character n-grams distribution.)
    """

    ngrams1 = Counter(ngrams(doc1, n))
    ngrams2 = Counter(ngrams(doc2, n))

    profile1 = [n[0] for n in ngrams1.most_common(top)]
    profile2 = [n[0] for n in ngrams2.most_common(top)]

    # normalise the two ngram distributions
    total1 = np.sum(list(ngrams1.values()))
    for key in ngrams1:
        ngrams1[key] /= total1

    total2 = np.sum(list(ngrams2.values()))
    for key in ngrams2:
        ngrams2[key] /= total2

    # calculate global dissimilarity score
    score = 0
    for n in set(profile1 + profile2):
        f1 = ngrams1[n]
        f2 = ngrams2[n]
        score += ((2 * (f1 - f2)) / (f1 + f2)) ** 2
    return expit(score)

def upper_per_lower(text1, text2):
  """
  Gives a ratio between the number of uppercase characters and lower case characters in the two texts.
  :param text1:
  :param text1:
  :return:
  """

  ratio1 = (len([i for i in text1 if i.isupper()]) + 1e-5) / (len([i for i in text1 if i.islower()]) + 1e-5)
  ratio2 = (len([i for i in text2 if i.isupper()]) + 1e-5) / (len([i for i in text2 if i.islower()]) + 1e-5)
  return comparison(ratio1, ratio2)
def entropy(tokens):
    """
    Get the Shannon entropy of a document using it's token distribution
    :param tokens: A document represented as a list of tokens.
    :return:
    """
    doc_len = len(tokens)
    frq = FreqDist(tokens)
    for key in frq.keys():
        frq[key] /= doc_len
    ent = 0.0
    for key in frq.keys():
        ent += frq[key] * math.log(frq[key], 2)
    ent = -ent
    return ent

def comparison(num1, num2):
  return (num1 - num2 + 1e-5) / (abs(num1) + abs(num2) + 1e-5)


In [75]:
def extract_text_features(text1, text2):
  text1 = str(text1)
  text2 = str(text2)

  num_words1 = len(text1.split())
  num_words2 = len(text2.split())
  features = {
      'length': comparison(len(text1), len(text2)),
      'num_words': comparison(num_words1, num_words2),
      # 'num_sentences': comparison(len(text1.split(".")), len(text2.split("."))),
      'avg_word_length': comparison((sum(len(word) for word in text1.split()) / len(text1.split())) , (sum(len(word) for word in text2.split()) / len(text2.split()))),
      "flesch_kincaid_grade": comparison(textstat.flesch_kincaid_grade(text1), textstat.flesch_kincaid_grade(text2)),
      # "coleman_liau_index": textstat.coleman_liau_index(text1) - textstat.coleman_liau_index(text2),
      # 'automated_readability_index': comparison(textstat.automated_readability_index(text1), textstat.automated_readability_index(text2)),  # Automated Readability Index
      # 'ngram_dissimiarlarity' : char_ngram_similarity(text1, text2, 3),
      # 'compression_dissimiarlarity' : compression_based_dissimilarity(text1, text2),
      'upper_per_lower' : upper_per_lower(text1, text2),
      # 'entropy' : comparison(entropy(text1), entropy(text2)),
      'punctuation' : punct_similarity(text1, text2),
      # Add more textstat
      }

    # Lexical features using NLTK
  tokens1 = word_tokenize(text1)
  fdist1 = FreqDist(tokens1)
  tokens2 = word_tokenize(text2)
  fdist2 = FreqDist(tokens2)
  features['unique_words'] = comparison(len(fdist1), len(fdist2))  # Hapax Legomena (unique words)
  features['vocab_richness'] =  comparison((len(fdist1) / num_words1), (len(fdist2)/num_words2))  # Type-Token Ratio

  # Stylometric features
  sentence_lengths1 = [len(s) for s in text1.split(".")]
  sentence_lengths2 = [len(s) for s in text2.split(".")]
  features['sentence_length_std'] = comparison((np.std(sentence_lengths1) if sentence_lengths1 else 0), (np.std(sentence_lengths2) if sentence_lengths2 else 0))

  # Additional feature - Average character length per word
  features['avg_char_per_word'] = comparison((sum(len(word) for word in tokens1) / num_words1), (sum(len(word) for word in tokens2) / num_words2))

  return list(features.values())


In [78]:
# Data preprocessing functions
max_features = 15000
max_length = 512

def clean_text(text):
  """
  Cleans text data by converting to lowercase, removing punctuation,
  and removing stop words (optional).
  """
  if type(text) == str:
    text = text.lower()
    text = ''.join([c for c in text if c.isalnum() or c.isspace()])  # Remove punctuation
    return text
  return ""

def preprocess_data(texts, tokenizer):
  """
  Preprocesses text data by cleaning, tokenizing, and padding sequences.
  """
  cleaned_texts = [clean_text(text) for text in texts]  # Clean text
  if not tokenizer:
    tokenizer = Tokenizer(num_words=max_features)  # Create tokenizer
  tokenizer.fit_on_texts(cleaned_texts)  # Fit tokenizer on text data
  sequences = tokenizer.texts_to_sequences(cleaned_texts)  # Convert text to sequences
  padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')  # Pad sequences

  return padded_sequences, tokenizer  # Return padded sequences and tokenizer for encoding unseen text



In [81]:
test_df = pd.read_csv("test.csv", encoding='utf-8', dtype=str)
val_df = pd.read_csv("dev.csv", encoding='utf-8')

val_texts1, val_texts2, val_labels = val_df["text_1"].tolist(), val_df["text_2"].tolist(), val_df["label"].tolist()  # Load validation data
test_texts1, test_texts2 = test_df["text_1"].tolist(), test_df["text_2"].tolist()  # Load test data

# Preprocess training, validation, and test data
val_data1, tokenizer = preprocess_data(val_texts1, None)
val_data2, tokenizer = preprocess_data(val_texts2, tokenizer)  # Reuse tokenizer

test_data1, tokenizer = preprocess_data(test_texts1, tokenizer)
test_data2, tokenizer = preprocess_data(test_texts2, tokenizer)  # Reuse tokenizer

test_df['text_features'] = test_df.apply(lambda row: extract_text_features(row['text_1'], row['text_2']), axis=1)
test_text_features = pd.DataFrame(test_df['text_features'].to_list())

val_df['text_features'] = val_df.apply(lambda row: extract_text_features(row['text_1'], row['text_2']), axis=1)
val_text_features = pd.DataFrame(val_df['text_features'].to_list())


In [103]:
model_A = tf.keras.models.load_model('best_model_A.keras')
model_B = tf.keras.models.load_model('best_model_B.h5')

# Evaluation for Model A

In [104]:
predictions = model_A.predict(val_text_features)
binary_predictions = (predictions > 0.5).astype(int)

y_true = val_df['label']
y_pred = binary_predictions

# Evaluate model performance (accuracy)
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

# # Evaluate model performance (confusion matrix and F1-score)
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", conf_matrix)

report = classification_report(y_true, y_pred, output_dict=True)

# Access precision and recall for each class
precision_class_0 = report['0']['precision']
recall_class_0 = report['0']['recall']
precision_class_1 = report['1']['precision']
recall_class_1 = report['1']['recall']

print("\nPrecision (Class 0):", precision_class_0)
print("Recall (Class 0):", recall_class_0)
print("Precision (Class 1):", precision_class_1)
print("Recall (Class 1):", recall_class_1)

f1 = f1_score(y_true, y_pred)
print("\nF1-score:", f1)

# MCC
tn, fp, fn, tp = conf_matrix.ravel()
numerator = tp * tn - fp * fn
denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = numerator / (denominator + np.spacing(1))
print("\nMCC:", mcc)

# Cohen's Kappa
kappa = cohen_kappa_score(y_true, y_pred)
print("\nCohen's Kappa:", kappa)



Accuracy: 0.5818333333333333
Confusion Matrix:
 [[1931 1058]
 [1451 1560]]

Precision (Class 0): 0.5709639266706091
Recall (Class 0): 0.6460354633656742
Precision (Class 1): 0.5958747135217723
Recall (Class 1): 0.5181002989040187

F1-score: 0.5542725173210161

MCC: 0.16548168292598286

Cohen's Kappa: 0.16405695918416763


# Evaluation for Model B

In [105]:
predictions = model_B.predict([np.array(val_data1), np.array(val_data2)])
binary_predictions = (predictions > 0.5).astype(int)

y_true = val_df['label']
y_pred = binary_predictions

# Evaluate model performance (accuracy)
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

# Evaluate model performance (confusion matrix and F1-score)
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", conf_matrix)

report = classification_report(y_true, y_pred, output_dict=True)

# Access precision and recall for each class
precision_class_0 = report['0']['precision']
recall_class_0 = report['0']['recall']
precision_class_1 = report['1']['precision']
recall_class_1 = report['1']['recall']

print("\nPrecision (Class 0):", precision_class_0)
print("Recall (Class 0):", recall_class_0)
print("Precision (Class 1):", precision_class_1)
print("Recall (Class 1):", recall_class_1)

f1 = f1_score(y_true, y_pred)
print("\nF1-score:", f1)

# MCC
tn, fp, fn, tp = conf_matrix.ravel()
numerator = tp * tn - fp * fn
denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc = numerator / (denominator + np.spacing(1))
print("\nMCC:", mcc)

# Cohen's Kappa
kappa = cohen_kappa_score(y_true, y_pred)
print("\nCohen's Kappa:", kappa)



Accuracy: 0.5195
Confusion Matrix:
 [[1358 1631]
 [1252 1759]]

Precision (Class 0): 0.5203065134099617
Recall (Class 0): 0.45433255269320844
Precision (Class 1): 0.5188790560471976
Recall (Class 1): 0.584191298571903

F1-score: 0.5496016247461335

MCC: 0.038853301655152814

Cohen's Kappa: 0.038541704879325844


# Predictions for Model A

In [106]:
predictions = model_A.predict(test_text_features)
binary_predictions = (predictions > 0.5).astype(int)

output_df = pd.DataFrame(binary_predictions, columns=["prediction"])
print(output_df)
output_df.to_csv("Group_51_A.csv", index=False)

      prediction
0              1
1              0
2              0
3              1
4              0
...          ...
5995           0
5996           1
5997           0
5998           0
5999           1

[6000 rows x 1 columns]


# Predictions for Model B

In [109]:
predictions = model_B.predict([np.array(test_data1), np.array(test_data2)])
binary_predictions = (predictions > 0.5).astype(int)

output_df = pd.DataFrame(binary_predictions, columns=["prediction"])
print(output_df)
output_df.to_csv("Group_51_B.csv", index=False)

      prediction
0              0
1              0
2              1
3              1
4              0
...          ...
5995           1
5996           1
5997           1
5998           1
5999           1

[6000 rows x 1 columns]
