In [1]:
import json
import os
import string
from functools import lru_cache

import joblib
import nltk
import numpy as np
import torch
import torch.nn as nn
from nltk.corpus import cmudict, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
# Use the first CUDA device when PyTorch reports one is available; otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=0)

In [3]:
# Penn Treebank tags of closed-class ("function") words. The tag "FW" is
# *foreign word* in that tagset, so it must not be used for this purpose.
_FUNCTION_WORD_TAGS = frozenset({
    "CC", "DT", "EX", "IN", "MD", "PDT", "PRP", "PRP$", "RP", "TO",
    "WDT", "WP", "WP$", "WRB",
})

_PUNCTUATION_CHARS = frozenset(string.punctuation)


@lru_cache(maxsize=None)
def _cmu_pronunciations():
    """Load the CMU pronouncing dictionary once and reuse it on later calls."""
    return cmudict.dict()


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the English stopword list as a set for O(1) membership tests."""
    return frozenset(stopwords.words("english"))


def _count_syllables(word, pronunciations):
    """Count syllables of `word` using its first CMU pronunciation."""
    variants = pronunciations.get(word.lower())
    if not variants:
        # Tokens missing from the dictionary count as one syllable, which is
        # the same fallback value the previous implementation produced.
        return 1
    # In CMUdict only vowel phonemes carry a trailing stress digit (0/1/2),
    # so the number of such phonemes is the number of syllables.
    return sum(1 for phone in variants[0] if phone[-1].isdigit())


def _is_punctuation(token):
    """True when every character of a non-empty token is ASCII punctuation."""
    return bool(token) and all(ch in _PUNCTUATION_CHARS for ch in token)


def _empty_features(char_count):
    """Feature vector for text without tokens: zeros plus the raw char count."""
    return [0.0, 0.0, char_count, 0, 0, 0.0, 0.0, 0.0, 0, 0, 0.0]


def Get_Sty_Features(text):
    """Compute an 11-element stylometric feature vector for a piece of text.

    Parameters
    ----------
    text : str
        The paragraph to analyse. ``None``, empty and whitespace-only input
        are accepted and yield an all-zero vector (apart from the character
        count) instead of raising ``ZeroDivisionError``.

    Returns
    -------
    list
        ``[average_word_length, average_sentence_length,
        paragraph_length_chars, paragraph_length_words,
        paragraph_length_sents, type_token_ratio,
        average_syllables_per_word, flesch_reading_ease, stopwords_count,
        function_words_count, punctuation_ratio]``

    Notes
    -----
    Syllables, function words and punctuation are counted differently from
    earlier versions of this function (see the inline comments), so feature
    files produced by an earlier version are not comparable with new output.
    """
    # Guard before touching NLTK: there is nothing to tokenize.
    if not text or not text.strip():
        return _empty_features(len(text) if text else 0)

    tokens = nltk.word_tokenize(text)
    sentences = nltk.sent_tokenize(text)
    if not tokens or not sentences:
        return _empty_features(len(text))

    token_count = len(tokens)
    sentence_count = len(sentences)

    # avg word len (punctuation tokens are included, as tokens are not filtered)
    average_word_length = sum(len(word) for word in tokens) / token_count

    # avg sentence len, in tokens per sentence
    total_words = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
    average_sentence_length = total_words / sentence_count

    # para len by chars / tokens / sentences
    paragraph_length_chars = len(text)
    paragraph_length_words = token_count
    paragraph_length_sents = sentence_count

    # Type-Token Ratio (case-sensitive)
    type_token_ratio = len(set(tokens)) / token_count

    # avg syllables: count stress-marked vowel phonemes, not all phonemes
    pronunciations = _cmu_pronunciations()
    total_syllables = sum(_count_syllables(word, pronunciations) for word in tokens)
    average_syllables_per_word = total_syllables / token_count

    # Flesch Reading Ease score (not the Flesch-Kincaid grade level)
    flesch_reading_ease = (
        206.835
        - (1.015 * average_sentence_length)
        - (84.6 * average_syllables_per_word)
    )

    # Stopwords Count
    stopword_set = _english_stopwords()
    stopwords_count = sum(1 for token in tokens if token.lower() in stopword_set)

    # Function words Count: tokens tagged with a closed-class POS tag
    tagged_tokens = nltk.pos_tag(tokens)
    function_words_count = sum(
        1 for _, pos in tagged_tokens if pos in _FUNCTION_WORD_TAGS
    )

    # Punctuation Marks Ratio: whole-token check, so multi-character marks
    # such as "..." or the tokenizer's quote tokens are counted as well.
    punctuation_count = sum(1 for token in tokens if _is_punctuation(token))
    punctuation_ratio = punctuation_count / token_count

    return [average_word_length, average_sentence_length, paragraph_length_chars,
            paragraph_length_words, paragraph_length_sents, type_token_ratio,
            average_syllables_per_word, flesch_reading_ease, stopwords_count,
            function_words_count, punctuation_ratio]

    


In [4]:
# Locations of the saved training and validation feature files (.pth).
training_path2, val_path2 = "./trainingFea.pth", "./ValFea.pth"

In [75]:
# Logistic-regression classifier with an iteration cap of 100,000.
LR = LogisticRegression(max_iter=100_000)

In [70]:
# Read the saved training features and convert the tensors to plain Python
# lists/scalars for scikit-learn.
# NOTE(review): X and Y are overwritten by the validation cell further down,
# so re-run this cell before calling LR.fit again.
a = torch.load(training_path2)
X = [feature.tolist() for feature in a['features']]
Y = [label.item() for label in a['labels']]

In [76]:
# Fit the classifier on whatever X / Y currently hold (the training split when
# the cells are run top to bottom).
LR.fit(X=X, y=Y)

In [77]:
# Read the saved validation features and convert the tensors to plain Python
# lists/scalars. This replaces the training data previously held in X and Y.
a = torch.load(val_path2)
X = [feature.tolist() for feature in a['features']]
Y = [label.item() for label in a['labels']]

In [78]:
# Predict a label for every row currently in X and display the result.
y = LR.predict(X=X)
y

array([0, 1, 1, ..., 1, 0, 1])

In [79]:
# Score the predictions `y` against the reference labels `Y`.
f1 = f1_score(y_true=Y, y_pred=y, average='binary')
acc = accuracy_score(y_true=Y, y_pred=y)
print(f"F1 score: {f1:.4f}, acc_para: {acc:.4f}")

F1 score: 0.4390, acc_para: 0.5828
