In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import sqlalchemy
from transformers import pipeline, RobertaTokenizer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import language_tool_python
import torchtext
from torchtext.data import get_tokenizer
sys.path.append('../')
from credentials import credentials
tqdm.pandas()

In [4]:
connector_string = f'mysql+mysqlconnector://{credentials["user"]}:{credentials["password"]}@{credentials["host"]}/AuthenticAI'
db_engine = sqlalchemy.create_engine(connector_string,echo=True)
db_conn = db_engine.connect()

TypeError: 'Credentials' object is not subscriptable

In [None]:
student_written_count = [i[0] for i in db_conn.execute(sqlalchemy.text('select count(*) from essays where essays.LLM_written = 0;'))][0]
llm_written_count = [i[0] for i in db_conn.execute(sqlalchemy.text('select count(*) from essays where essays.LLM_written = 1;'))][0]

In [3]:
classes = ['human Written','LLM Written']
data = [student_written_count, llm_written_count]
plt.pie(x=data,labels=classes,autopct='%.0f%%')
plt.title('Percentage of Data per Class')
plt.show()

NameError: name 'student_written_count' is not defined

In [None]:
word_counts = pd.DataFrame([i for i in db_conn.execute(sqlalchemy.text('select word_count, LLM_written from essays;'))])

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(data=word_counts,x='word_count',hue='LLM_written')
plt.title('Box Plot of Word Counts for Each Class')
plt.show()

In [None]:
plt.title('Distribution of Word Counts for Each Class')
sns.histplot(data=word_counts,x='word_count',hue='LLM_written')
plt.show()

In [None]:
print('Student Written Essay Descriptive Statistics')
print(word_counts[word_counts['LLM_written'] == 0]['word_count'].describe())
print()
print('LLM Written Essay Descriptive Statistics')
print(word_counts[word_counts['LLM_written'] == 1]['word_count'].describe())

In [None]:
detector = pipeline("text-classification","roberta-base-openai-detector")
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
text_and_labels = pd.DataFrame([i for i in db_conn.execute(sqlalchemy.text('select essay, LLM_written from essays;'))])

In [None]:

def num_of_tokens(text:str) -> int:
    tokenized_text = tokenizer(text)['input_ids']
    return len(tokenized_text)

In [None]:

text_and_labels['token_count'] = text_and_labels['essay'].progress_apply(num_of_tokens)
text_and_labels.head()

In [None]:
valid_examples = text_and_labels[text_and_labels['token_count'] <= 512]
valid_examples.head()

In [None]:
valid_examples.shape

In [None]:

_, sample = train_test_split(valid_examples,test_size=1000,random_state=42,shuffle=True,stratify=valid_examples['LLM_written'])
sample['LLM_written'].value_counts()

In [None]:

predictions = detector(sample['essay'].tolist())

In [None]:

pred_list = [0 if pred['label'] == 'Real' else 1 for pred in predictions]
sample['predictions'] = pred_list
sample.head()

In [None]:

accuracy = accuracy_score(sample['LLM_written'],sample['predictions'])
print(f'Accuracy: {accuracy * 100}%')

In [None]:

matrix = confusion_matrix(sample['LLM_written'],sample['predictions'])
display = ConfusionMatrixDisplay(matrix)
display.plot()
plt.show()

In [None]:

print(f'P(Y = 0 | X = student written essay) = {matrix[0][0] / (matrix[0][0] + matrix[0][1])}')
print(f'P(Y = 1 | X = LLM written essay) = {matrix[1][1] / (matrix[1][1] + matrix[1][0])}')

In [None]:
tool = language_tool_python.LanguageTool('en-US',config={'cacheSize': 1000})

In [None]:
def grammer_error_count(text:str) -> int:
    errors = tool.check(text)
    return len(errors)

In [None]:
text_and_labels = pd.DataFrame([i for i in db_conn.execute(sqlalchemy.text('select essay, LLM_written from essays;'))])
text_and_labels.head()

In [None]:
_, sample = train_test_split(text_and_labels,test_size=5000,random_state=42,shuffle=True,stratify=text_and_labels['LLM_written'])
sample['LLM_written'].value_counts()

In [None]:
sample['grammar_error_count'] = sample['essay'].progress_apply(grammer_error_count)

In [None]:
student = sample[sample['LLM_written'] == 0]
llm_written = sample[sample['LLM_written'] == 1]

In [None]:
print('Student Grammatical Errors')
print(student['grammar_error_count'].describe())
print()
print('LLM Grammatical Errors')
print(llm_written['grammar_error_count'].describe())

In [None]:
plt.title('Average Number of Grammar Mistakes per Class')
plot = sns.barplot(data=sample,x='LLM_written',y='grammar_error_count',errorbar=None)
for i in plot.containers:
    plot.bar_label(i,)
plt.show()

In [None]:
db_conn.close()
db_engine.dispose()