In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords, cmudict
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import xgboost as xgb
nltk.download('cmudict')
print('****Import Complete****')

****Import Complete****


[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/hunterlanier/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [36]:
# Merging Data Sets
human_data = pd.read_csv('../Data/train_data.csv')
gpt3_data = pd.read_csv('../Data/essays_gpt3.5.csv') 
gpt4_data = pd.read_csv('../Data/essays_gpt4.csv')
df = pd.concat([human_data,gpt3_data,gpt4_data])
df = df.sample(frac=1).reset_index(drop=True)

df_test = pd.read_csv('../Challenge/test_essays.csv')
df_test = df_test.sample(frac=1).reset_index(drop=True)


# Readability Functions

In [37]:

# Syllable count in a word
def syllable_count(word, d):
    if word.lower() in d:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]])
    else:
        # Fallback method for words not in cmudict
        return len(re.findall(r'[aeiouy]+', word.lower()))
# Initialize CMU Pronouncing Dictionary
d = cmudict.dict()

# Helper function to preprocess text and count syllables, words, and sentences
def preprocess_and_count(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    syllables = sum(syllable_count(word, d) for word in words)
    num_sentences = len(sentences)
    num_words = len([word for word in words if word.isalpha()])
    num_syllables = syllables
    num_chars = sum(len(word) for word in words if word.isalpha())
    return num_sentences, num_words, num_syllables, num_chars


# Flesch Reading Ease
def flesch_reading_ease(text):
    num_sentences, num_words, num_syllables, _ = preprocess_and_count(text)
    return 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)

# Flesch-Kincaid Grade Level
def flesch_kincaid_grade(text):
    num_sentences, num_words, num_syllables, _ = preprocess_and_count(text)
    return 0.39 * (num_words / num_sentences) + 11.8 * (num_syllables / num_words) - 15.59

# Gunning Fog Index
def gunning_fog(text):
    num_sentences, num_words, num_syllables, _ = preprocess_and_count(text)
    complex_words = sum(syllable_count(word, d) >= 3 for word in word_tokenize(text))
    return 0.4 * ((num_words / num_sentences) + 100 * (complex_words / num_words))

# SMOG Index
def smog_index(text):
    num_sentences, _, num_syllables, _ = preprocess_and_count(text)
    return 1.043 * (30 * (num_syllables / num_sentences))**0.5 + 3.1291

# Automated Readability Index
def automated_readability_index(text):
    num_sentences, num_words, _, num_chars = preprocess_and_count(text)
    return 4.71 * (num_chars / num_words) + 0.5 * (num_words / num_sentences) - 21.43

# Coleman-Liau Index
def coleman_liau_index(text):
    num_sentences, num_words, _, num_chars = preprocess_and_count(text)
    L = (num_chars / num_words) * 100
    S = (num_sentences / num_words) * 100
    return 0.0588 * L - 0.296 * S - 15.8



# Sentence Length Variability Feature

In [38]:
# Parse each essay into a list of words
df['parse'] = df['text'].str.split()
# Score each essay on Setence Length Variability
df['SLV'] = df['parse'].apply(lambda x: np.std([len(word_tokenize(sentence)) for sentence in x]))
# Score each essay on Setence Length Variability
df['SLV'] = df['parse'].apply(lambda x: np.std([len(word_tokenize(sentence)) for sentence in x]))
# Remove Parse Data
df.drop(['parse'], axis=1, inplace=True)
df.SLV

# Parse each essay into a list of words
df_test['parse'] = df_test['text'].str.split()
# Score each essay on Setence Length Variability
df_test['SLV'] = df_test['parse'].apply(lambda x: np.std([len(word_tokenize(sentence)) for sentence in x]))
# Score each essay on Setence Length Variability
df_test['SLV'] = df_test['parse'].apply(lambda x: np.std([len(word_tokenize(sentence)) for sentence in x]))
# Remove Parse Data
df_test.drop(['parse'], axis=1, inplace=True)
df_test.SLV

KeyboardInterrupt: 

# Conjunction Count Feature

In [None]:
conjunctions = [
    "and", "but", "or", "so", "yet", "nor", "for", "after", "although",
    "as", "because", "before", "if", "once", "since", "that", "though",
    "till", "unless", "while", "where", "whether", "because of", "in order that",
    "even though", "as long as", "as soon as", "just as", "so that", "in case",
    "now that", "as if", "provided that", "whereas", "inasmuch as", "whenever",
    "until", "while", "after all", "as though", "lest", "regardless", "apart from",
    "given that", "if only", "in case that", "in spite of", "on the condition that",
    "only if", "supposing", "as far as", "in the event that", "not to mention",
    "rather than", "such that", "to the extent that", "although", "despite",
    "much as", "whether or not", "assuming that", "besides", "conversely",
    "except that", "in order to", "like", "provided", "save that", "that is to say",
    "to the end that", "wherever", "whiles", "by the time", "even if",
    "on condition that", "so long as", "apart from that", "even when", "if then",
    "in as much as", "in spite of the fact that", "in the same way that",
    "not only but also", "notwithstanding", "presuming that", "rather", "seeing that",
    "unless and until", "whereas as", "whether or no", "as against", "as well as",
    "in accordance with", "in addition to", "in relation to", "in the light of",
    "not to speak of", "regardless of the fact that", "so as to", "with regard to"]
# Create a new column for the number of conjunctions in each essay
df["num_conjunctions"] = np.zeros(len(df))

# Use a regular expression to find conjunctions in the text
conjunctions_regex = r'\b(?:' + '|'.join(conjunctions) + r')\b'

# Count occurrences of each conjunction in the 'text' column
df['num_conjunctions'] = df['text'].str.count(conjunctions_regex)

df.num_conjunctions

# Create a new column for the number of conjunctions in each essay
df_test["num_conjunctions"] = np.zeros(len(df_test))

# Use a regular expression to find conjunctions in the text
conjunctions_regex = r'\b(?:' + '|'.join(conjunctions) + r')\b'

# Count occurrences of each conjunction in the 'text' column
df_test['num_conjunctions'] = df_test['text'].str.count(conjunctions_regex)

df_test.num_conjunctions


0    0
1    0
2    0
Name: num_conjunctions, dtype: int64

# Readability Features

In [None]:
df['flesch_reading_ease'] = df['text'].apply(flesch_reading_ease)
df['flesch_kincaid_grade'] = df['text'].apply(flesch_kincaid_grade)
df['gunning_fog'] = df['text'].apply(gunning_fog)
df['smog_index'] = df['text'].apply(smog_index)
df['automated_readability_index'] = df['text'].apply(automated_readability_index)
df['coleman_liau_index'] = df['text'].apply(coleman_liau_index)

# Applying readability tests to each essay
df_test['flesch_reading_ease'] = df_test['text'].apply(flesch_reading_ease)
df_test['flesch_kincaid_grade'] = df_test['text'].apply(flesch_kincaid_grade)
df_test['gunning_fog'] = df_test['text'].apply(gunning_fog)
df_test['smog_index'] = df_test['text'].apply(smog_index)
df_test['automated_readability_index'] = df_test['text'].apply(automated_readability_index)
df_test['coleman_liau_index'] = df_test['text'].apply(coleman_liau_index)

# Scaling Numerical Data

In [None]:
#scale numerical data
scaler = MinMaxScaler()
columns_scale = df.drop(['text', 'generated','id'], axis=1).columns
scaled_data = scaler.fit_transform(df[columns_scale])
scaled_df = pd.DataFrame(scaled_data, columns=columns_scale)
df_scaled = pd.concat([df[['id','prompt_id','text']],scaled_df], axis=1)


#scale numerical data
scaler = MinMaxScaler()
columns_test_scale = df_test.drop(['text','id'], axis=1).columns
scaled_test_data = scaler.fit_transform(df_test[columns_test_scale])
scaled_test_df = pd.DataFrame(scaled_test_data, columns=columns_test_scale)
df_test_scaled = pd.concat([df_test[['id','prompt_id','text']],scaled_test_df], axis=1)

df_scaled

Unnamed: 0,id,prompt_id,text,prompt_id.1,SLV,num_conjunctions,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index
0,4154d754,0,Advantages of Limiting Car Usage\n\nLimiting c...,0.0,0.500800,0.169492,0.408741,0.279364,0.307874,0.289240,0.246485,0.677825
1,743e9f23,1,I argue in favor of keeping the Electoral Coll...,1.0,0.375219,0.237288,0.634928,0.206408,0.225866,0.263264,0.186149,0.372702
2,b36d87f5,1,"[Your Name]\n[Your Address]\n[City, State, Zip...",1.0,0.460174,0.220339,0.417743,0.271031,0.278243,0.275788,0.249351,0.713691
3,f7aa848d,1,"Dear senator, I am not in favor of the elector...",1.0,0.457850,0.330508,0.684419,0.168136,0.169114,0.211579,0.149670,0.334467
4,38806fc0,1,Since our very first President George Washingt...,1.0,0.378589,0.220339,0.593875,0.196474,0.190287,0.219302,0.189902,0.521093
...,...,...,...,...,...,...,...,...,...,...,...,...
2773,85b29439,1,"Dear The Florida State Senator, In our nation,...",1.0,0.448267,0.177966,0.665618,0.164765,0.177445,0.193086,0.142594,0.361828
2774,74fe2a5c,1,"[Your Name]\n[Your Address]\n[City, State, Zip...",1.0,0.449516,0.322034,0.394378,0.299548,0.314854,0.325502,0.265323,0.672441
2775,463ad422,1,Many people feel that the president plays a ve...,1.0,0.317626,0.228814,0.718808,0.139050,0.138237,0.168666,0.141441,0.383007
2776,f837fc25,1,"Dear Senator, The electoral college has existe...",1.0,0.227890,0.237288,0.562964,0.245217,0.266683,0.303931,0.225390,0.462857


# Creating Text Matrix 

In [None]:


tfidf_vectorizer = TfidfVectorizer(max_features=5000)
x_tfidf = tfidf_vectorizer.fit_transform(df['text'])
# Combine with Aditional Features
combined_x = scipy.sparse.hstack([x_tfidf, df.drop(['id','prompt_id','text'], axis=1).values])

x_test_tfidf = tfidf_vectorizer.transform(df_test['text'])
# Combine with Aditional Features
combined_test_x = scipy.sparse.hstack([x_test_tfidf, df_test.drop(['id','text'], axis=1).values])
print(x_test_tfidf)




# Polish final secitons for model


In [41]:
X_train = combined_x
X_test = combined_test_x
y_train = df['generated']


# Fine Tuning Hyper Parameters

In [45]:
# Parameter Grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}
xgb_classifier = xgb.XGBClassifier(eval_metric='logloss')

grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, n_jobs=-1,verbose=5)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV 3/3] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=0.528 total time=   4.9s
[CV 2/3] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=0.511 total time=   5.3s
[CV 2/3] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1;, score=0.499 total time=   5.3s
[CV 3/3] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1;, score=0.521 total time=   5.6s
[CV 1/3] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=0.508 total time=   5.6s
[CV 1/3] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1;, s

KeyboardInterrupt: 

# Build Model and Test

In [31]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

In [34]:

probabilities = xgb_model.predict_proba(X_test)[:, 1]  # For probabilities
predictions = xgb_model.predict(X_test)  # For class labels

# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': df_test['id'],  # Ensure this is the correct ID column from your test set
    'generated': probabilities
})

# Save to CSV file
submission_filename = 'submission.csv'
submission_df.to_csv(submission_filename, index=False)

submission_df

Unnamed: 0,id,generated
0,0000aaaa,0.999347
1,1111bbbb,0.999347
2,2222cccc,0.999347
