In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import sqlalchemy
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from torchtext.data import get_tokenizer
from torchtext.data.utils import ngrams_iterator
import spacy
from transformers import pipeline, AutoTokenizer
from sklearn.model_selection import train_test_split

sys.path.append('../')
from credentials import credentials

tqdm.pandas()

In [2]:

connector_string = f'mysql+mysqlconnector://{credentials["user"]}:{credentials["password"]}@{credentials["host"]}/AuthenticAI'
db_engine = sqlalchemy.create_engine(connector_string,echo=True)

db_conn = db_engine.connect()

TypeError: 'Credentials' object is not subscriptable

In [None]:
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

In [None]:
def get_unique_words(text:str) -> int:
    tokenized = set(tokenizer.tokenize(text))
    return len(tokenized)

In [None]:
data = pd.DataFrame([row for row in db_conn.execute(sqlalchemy.text('select * from essays;'))])
data

In [None]:
data['unique_word_count'] = data['essay'].progress_apply(get_unique_words)

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(data,x='unique_word_count',hue='LLM_written')
plt.title('Box Plot of Unique Word Counts for Each Class')
plt.show()

In [None]:
print('Student Unique Words')
print(data[data['LLM_written'] == 0]['unique_word_count'].describe())
print()
print('LLM Unique Words')
print(data[data['LLM_written'] == 1]['unique_word_count'].describe())

In [None]:
data['unique_to_total'] = data['unique_word_count'] / data['word_count']

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(data,x='unique_to_total',hue='LLM_written')
plt.title('Box Plot of Unique Word Counts to Total Words Ratio for Each Class')
plt.show()

In [None]:
print('Student Unique Words to Total')
print(data[data['LLM_written'] == 0]['unique_to_total'].describe())
print()
print('LLM Unique Word to Total')
print(data[data['LLM_written'] == 1]['unique_to_total'].describe())

In [None]:
smaller_word_count = data[data['word_count'] <= 400]
smaller_word_count['LLM_written'].value_counts()

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(smaller_word_count,x='unique_word_count',hue='LLM_written')
plt.title('Box Plot of Unique Word Counts for Each Class for Essays <= 400 words')
plt.show()

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(smaller_word_count,x='unique_to_total',hue='LLM_written')
plt.title('Box Plot of Unique Word Counts to Total Word Counts for Each Class for Essays <= 400 words')
plt.show()

In [None]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
def stop_word_count(text:str) -> int:
    tokenized = tokenizer.tokenize(text)
    count = 0

    for word in tokenized:
        if word in stop_words:
            count += 1
    
    return count

In [None]:
data['stop_word_count'] = data['essay'].progress_apply(stop_word_count)

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(data,x='stop_word_count',hue='LLM_written')
plt.title('Box Plot of Stop Words')
plt.show()

In [None]:
print('Student Stop Words')
print(data[data['LLM_written'] == 0]['stop_word_count'].describe())
print()
print('Student Stop Word')
print(data[data['LLM_written'] == 1]['stop_word_count'].describe())

In [None]:
data['stop_word_ratio'] = data['stop_word_count'] / data['word_count']

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(data,x='stop_word_ratio',hue='LLM_written')
plt.title('Box Plot of Stop Word Ratio')
plt.show()

In [None]:
smaller_word_count = data[data['word_count'] <= 400]
smaller_word_count['LLM_written'].value_counts()

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(smaller_word_count,x='stop_word_count',hue='LLM_written')
plt.title('Box Plot of Stop Words for Essays less than 400 words')
plt.show()

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(smaller_word_count,x='stop_word_ratio',hue='LLM_written')
plt.title('Box Plot of Stop Words/Total Words for Essays less than 400 words')
plt.show()

In [None]:
pytorch_tokenizer = get_tokenizer('spacy',language='en_core_web_sm')

In [None]:

def count_punc(text: str) -> int:
    tokenized_text = pytorch_tokenizer(text)
    count_q = 0
    count_ex = 0
    count_semi = 0
    count_col = 0
    for token in tokenized_text:
        if token == "?":
            count_q += 1
        elif token == "!":
            count_ex += 1
        elif token == ";":
            count_semi += 1
        elif token == ":":
            count_col += 1
    
    return count_q, count_ex,count_semi, count_col

In [None]:
counts = data['essay'].progress_apply(count_punc)
data['count_question'] = [row[0] for row in counts]
data['count_exclamation'] = [row[1] for row in counts]
data['count_semi'] = [row[2] for row in counts]
data['count_colon'] = [row[3] for row in counts]
data.head()

In [None]:
print('Student')
print(data[data['LLM_written'] == 0][['count_question','count_exclamation','count_semi','count_colon']].describe())
print()
print('LLM')
print(data[data['LLM_written'] == 1][['count_question','count_exclamation','count_semi','count_colon']].describe())

In [None]:

unigrams = {}
tokenized_essays = data['essay'].progress_apply(lambda row: pytorch_tokenizer(row))

In [None]:
unigrams = {'student':{},'llm':{}}
labels = data['LLM_written'].tolist()
for index in tqdm(range(len(labels))):
    if labels[index] == 0:
        label = 'student'
    else:
        label = 'llm'
    for token in tokenized_essays[index]:
        if token in unigrams[label].keys():
            count = unigrams[label][token] + 1
            unigrams[label][token] = count
        else:
            unigrams[label][token] = 1

In [None]:
unigrams_df = pd.DataFrame.from_dict(unigrams).fillna(value=0)
unigrams_df['student_dom'] = unigrams_df['student'] - unigrams_df['llm']
unigrams_df['llm_dom'] = unigrams_df['llm'] - unigrams_df['student']
unigrams_df.sort_values(by='student_dom',ascending=False).head(20)

In [None]:
unigrams_df.sort_values(by='llm_dom',ascending=False).head(20)

In [None]:
tokenized_essays_bigrams = []
for essay in tqdm(tokenized_essays):
    tokenized_essays_bigrams.append(list(ngrams_iterator(essay,2))[len(essay):])

In [None]:
bigrams = {'student':{},'llm':{}}
labels = data['LLM_written'].tolist()
for index in tqdm(range(len(labels))):
    if labels[index] == 0:
        label = 'student'
    else:
        label = 'llm'
    for token in tokenized_essays_bigrams[index]:
        if token in bigrams[label].keys():
            count = bigrams[label][token] + 1
            bigrams[label][token] = count
        else:
            bigrams[label][token] = 1

In [None]:
bigrams_df = pd.DataFrame.from_dict(bigrams).fillna(value=0)
bigrams_df['student_dom'] = bigrams_df['student'] - bigrams_df['llm']
bigrams_df['llm_dom'] = bigrams_df['llm'] - bigrams_df['student']
bigrams_df.sort_values(by='student_dom',ascending=False).head(20)

In [None]:
bigrams_df.sort_values(by='llm_dom',ascending=False).head(20)

In [None]:
tokenized_essays_trigrams = []
for essay in tqdm(tokenized_essays):
    tokenized_essays_trigrams.append(list(ngrams_iterator(essay,3))[len(essay)*2-1:])

In [None]:
trigrams = {'student':{},'llm':{}}
labels = data['LLM_written'].tolist()
for index in tqdm(range(len(labels))):
    if labels[index] == 0:
        label = 'student'
    else:
        label = 'llm'
    for token in tokenized_essays_trigrams[index]:
        if token in trigrams[label].keys():
            count = trigrams[label][token] + 1
            trigrams[label][token] = count
        else:
            trigrams[label][token] = 1

In [None]:
trigrams_df = pd.DataFrame.from_dict(trigrams).fillna(value=0)
trigrams_df['student_dom'] = trigrams_df['student'] - trigrams_df['llm']
trigrams_df['llm_dom'] = trigrams_df['llm'] - trigrams_df['student']
trigrams_df.sort_values(by='student_dom',ascending=False).head(20)

In [None]:
trigrams_df.sort_values(by='llm_dom',ascending=False).head(20)

In [None]:
model_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
def num_of_tokens(text:str) -> int:
    tokenized_text = model_tokenizer(text)['input_ids']
    return len(tokenized_text)

In [None]:
data['token_count'] = data['essay'].progress_apply(num_of_tokens)

In [None]:
valid_examples = data[data['token_count'] <= 512]

In [None]:
_, sample = train_test_split(valid_examples,test_size=1000,random_state=42,shuffle=True,stratify=valid_examples['LLM_written'])
sample['LLM_written'].value_counts()

In [None]:
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

In [None]:
emotion_predictions = []
for essay in tqdm(sample['essay']):
    emotion_predictions.append(classifier(essay))

In [None]:
sample['emotion_pred'] = [exam['label'] for exam in [example[0] for example in emotion_predictions]]
sample.head()

In [None]:
def llm_written_cat(label:int) -> str:
    if label == 1:
        return 'LLM'
    else:
        return 'student'
sample['LLM_written_cat'] = sample['LLM_written'].progress_apply(llm_written_cat)        

In [None]:
plt.title('Emotion Prediction Per Class')
plot = sns.countplot(sample,x='LLM_written_cat',hue='emotion_pred')
for i in plot.containers:
    plot.bar_label(i,)
plt.show()

In [None]:
probs_given_student = sample[sample['LLM_written'] == 0]['emotion_pred'].value_counts() / sample[sample['LLM_written'] == 0].shape[0]
probs_given_student

In [None]:
probs_given_llm = sample[sample['LLM_written'] == 1]['emotion_pred'].value_counts() / sample[sample['LLM_written'] == 1].shape[0]
probs_given_llm

In [None]:
sample_probs = sample['emotion_pred'].value_counts() / sample.shape[0]
sample_probs

In [None]:
total_probs = sample['LLM_written'].value_counts() / sample.shape[0]
student_given_emotion = (probs_given_student * total_probs[0]) / sample_probs 
llm_given_emotion = probs_given_llm * total_probs[1] / sample_probs 
student_given_emotion

In [None]:
llm_given_emotion

In [None]:
db_conn.close()
db_engine.dispose()