In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords, cmudict
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
nltk.download('cmudict')
print('****Import Complete****')

****Import Complete****


[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/hunterlanier/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [2]:
# Merging Data Sets
human_data = pd.read_csv('../Data/train_data.csv')
gpt3_data = pd.read_csv('../Data/essays_gpt3.5.csv') 
gpt4_data = pd.read_csv('../Data/essays_gpt4.csv')
df = pd.concat([human_data,gpt3_data,gpt4_data])
df = df.sample(frac=1).reset_index(drop=True)

df_test = pd.read_csv('../Challenge/test_essays.csv')
df_test = df_test.sample(frac=1).reset_index(drop=True)


# Readability Functions

In [3]:

# Syllable count in a word
def syllable_count(word, d):
    if word.lower() in d:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]])
    else:
        # Fallback method for words not in cmudict
        return len(re.findall(r'[aeiouy]+', word.lower()))
# Initialize CMU Pronouncing Dictionary
d = cmudict.dict()

# Helper function to preprocess text and count syllables, words, and sentences
def preprocess_and_count(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    syllables = sum(syllable_count(word, d) for word in words)
    num_sentences = len(sentences)
    num_words = len([word for word in words if word.isalpha()])
    num_syllables = syllables
    num_chars = sum(len(word) for word in words if word.isalpha())
    return num_sentences, num_words, num_syllables, num_chars


# Flesch Reading Ease
def flesch_reading_ease(text):
    num_sentences, num_words, num_syllables, _ = preprocess_and_count(text)
    return 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)

# Flesch-Kincaid Grade Level
def flesch_kincaid_grade(text):
    num_sentences, num_words, num_syllables, _ = preprocess_and_count(text)
    return 0.39 * (num_words / num_sentences) + 11.8 * (num_syllables / num_words) - 15.59

# Gunning Fog Index
def gunning_fog(text):
    num_sentences, num_words, num_syllables, _ = preprocess_and_count(text)
    complex_words = sum(syllable_count(word, d) >= 3 for word in word_tokenize(text))
    return 0.4 * ((num_words / num_sentences) + 100 * (complex_words / num_words))

# SMOG Index
def smog_index(text):
    num_sentences, _, num_syllables, _ = preprocess_and_count(text)
    return 1.043 * (30 * (num_syllables / num_sentences))**0.5 + 3.1291

# Automated Readability Index
def automated_readability_index(text):
    num_sentences, num_words, _, num_chars = preprocess_and_count(text)
    return 4.71 * (num_chars / num_words) + 0.5 * (num_words / num_sentences) - 21.43

# Coleman-Liau Index
def coleman_liau_index(text):
    num_sentences, num_words, _, num_chars = preprocess_and_count(text)
    L = (num_chars / num_words) * 100
    S = (num_sentences / num_words) * 100
    return 0.0588 * L - 0.296 * S - 15.8



# Sentence Length Variability Feature

In [4]:
# Parse each essay into a list of words
df['parse'] = df['text'].str.split()
# Score each essay on Setence Length Variability
df['SLV'] = df['parse'].apply(lambda x: np.std([len(word_tokenize(sentence)) for sentence in x]))
# Score each essay on Setence Length Variability
df['SLV'] = df['parse'].apply(lambda x: np.std([len(word_tokenize(sentence)) for sentence in x]))
# Remove Parse Data
df.drop(['parse'], axis=1, inplace=True)
df.SLV

# Parse each essay into a list of words
df_test['parse'] = df_test['text'].str.split()
# Score each essay on Setence Length Variability
df_test['SLV'] = df_test['parse'].apply(lambda x: np.std([len(word_tokenize(sentence)) for sentence in x]))
# Score each essay on Setence Length Variability
df_test['SLV'] = df_test['parse'].apply(lambda x: np.std([len(word_tokenize(sentence)) for sentence in x]))
# Remove Parse Data
df_test.drop(['parse'], axis=1, inplace=True)
df_test.SLV

0    0.471405
1    0.471405
2    0.471405
Name: SLV, dtype: float64

# Conjunction Count Feature

In [5]:
conjunctions = [
    "and", "but", "or", "so", "yet", "nor", "for", "after", "although",
    "as", "because", "before", "if", "once", "since", "that", "though",
    "till", "unless", "while", "where", "whether", "because of", "in order that",
    "even though", "as long as", "as soon as", "just as", "so that", "in case",
    "now that", "as if", "provided that", "whereas", "inasmuch as", "whenever",
    "until", "while", "after all", "as though", "lest", "regardless", "apart from",
    "given that", "if only", "in case that", "in spite of", "on the condition that",
    "only if", "supposing", "as far as", "in the event that", "not to mention",
    "rather than", "such that", "to the extent that", "although", "despite",
    "much as", "whether or not", "assuming that", "besides", "conversely",
    "except that", "in order to", "like", "provided", "save that", "that is to say",
    "to the end that", "wherever", "whiles", "by the time", "even if",
    "on condition that", "so long as", "apart from that", "even when", "if then",
    "in as much as", "in spite of the fact that", "in the same way that",
    "not only but also", "notwithstanding", "presuming that", "rather", "seeing that",
    "unless and until", "whereas as", "whether or no", "as against", "as well as",
    "in accordance with", "in addition to", "in relation to", "in the light of",
    "not to speak of", "regardless of the fact that", "so as to", "with regard to"]
# Create a new column for the number of conjunctions in each essay
df["num_conjunctions"] = np.zeros(len(df))

# Use a regular expression to find conjunctions in the text
conjunctions_regex = r'\b(?:' + '|'.join(conjunctions) + r')\b'

# Count occurrences of each conjunction in the 'text' column
df['num_conjunctions'] = df['text'].str.count(conjunctions_regex)

df.num_conjunctions

# Create a new column for the number of conjunctions in each essay
df_test["num_conjunctions"] = np.zeros(len(df_test))

# Use a regular expression to find conjunctions in the text
conjunctions_regex = r'\b(?:' + '|'.join(conjunctions) + r')\b'

# Count occurrences of each conjunction in the 'text' column
df_test['num_conjunctions'] = df_test['text'].str.count(conjunctions_regex)

df_test.num_conjunctions


0    0
1    0
2    0
Name: num_conjunctions, dtype: int64

# Readability Features

In [6]:
df['flesch_reading_ease'] = df['text'].apply(flesch_reading_ease)
df['flesch_kincaid_grade'] = df['text'].apply(flesch_kincaid_grade)
df['gunning_fog'] = df['text'].apply(gunning_fog)
df['smog_index'] = df['text'].apply(smog_index)
df['automated_readability_index'] = df['text'].apply(automated_readability_index)
df['coleman_liau_index'] = df['text'].apply(coleman_liau_index)

# Applying readability tests to each essay
df_test['flesch_reading_ease'] = df_test['text'].apply(flesch_reading_ease)
df_test['flesch_kincaid_grade'] = df_test['text'].apply(flesch_kincaid_grade)
df_test['gunning_fog'] = df_test['text'].apply(gunning_fog)
df_test['smog_index'] = df_test['text'].apply(smog_index)
df_test['automated_readability_index'] = df_test['text'].apply(automated_readability_index)
df_test['coleman_liau_index'] = df_test['text'].apply(coleman_liau_index)

# Scaling Numerical Data

In [7]:
#scale numerical data
scaler = MinMaxScaler()
columns_scale = df.drop(['text', 'generated','id'], axis=1).columns
scaled_data = scaler.fit_transform(df[columns_scale])
scaled_df = pd.DataFrame(scaled_data, columns=columns_scale)
df_scaled = pd.concat([df[['id','prompt_id','text']],scaled_df], axis=1)


#scale numerical data
scaler = MinMaxScaler()
columns_test_scale = df_test.drop(['text','id'], axis=1).columns
scaled_test_data = scaler.fit_transform(df_test[columns_test_scale])
scaled_test_df = pd.DataFrame(scaled_test_data, columns=columns_test_scale)
df_test_scaled = pd.concat([df_test[['id','prompt_id','text']],scaled_test_df], axis=1)

df_scaled

Unnamed: 0,id,prompt_id,text,prompt_id.1,SLV,num_conjunctions,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index
0,74280ec3,0,Has it ever seem hotter than it usually has to...,0.0,0.252920,0.203390,0.807488,0.097257,0.069484,0.130386,0.097983,0.255010
1,5768f8fe,0,In the wake of our growing concern for the env...,0.0,0.702733,0.144068,0.273011,0.324275,0.369942,0.301887,0.283874,0.858715
2,7c7bd27d,0,Title: Advantages of Limiting Car Usage: Lesso...,0.0,0.798842,0.135593,0.199762,0.383395,0.393324,0.392102,0.318405,0.828144
3,1a4eba9e,0,"""I'm much happier this way,"" What caused someo...",0.0,0.323966,0.322034,0.648026,0.217210,0.227747,0.289347,0.212360,0.386995
4,51f51983,1,"[Your Name]\n[Your Address]\n[City, State, Zip...",1.0,0.499796,0.237288,0.434311,0.285572,0.303239,0.318657,0.263126,0.656238
...,...,...,...,...,...,...,...,...,...,...,...,...
2773,a154688f,0,Limiting car usage has numerous advantages tha...,0.0,0.363045,0.296610,0.349951,0.290502,0.319677,0.274762,0.246839,0.738766
2774,f61cd2f3,0,Title: The Benefits of Limiting Car Usage: An ...,0.0,0.669209,0.228814,0.311870,0.353428,0.364838,0.392635,0.312299,0.732552
2775,8da67e56,1,"Senator [Your Senator's Name],\n\nI am writing...",1.0,0.615519,0.186441,0.332301,0.339443,0.352619,0.374863,0.302099,0.726805
2776,5b98d126,0,The advantages of limiting car usage are becom...,0.0,0.339958,0.372881,0.429220,0.268984,0.308841,0.278706,0.236375,0.652223


# Creating Text Matrix 

In [8]:


tfidf_vectorizer = TfidfVectorizer(max_features=5000)
x_tfidf = tfidf_vectorizer.fit_transform(df['text'])
# Combine with Aditional Features
combined_x = scipy.sparse.hstack([x_tfidf, df.drop(['id','prompt_id','text'], axis=1).values])

x_test_tfidf = tfidf_vectorizer.transform(df_test['text'])
# Combine with Aditional Features
combined_test_x = scipy.sparse.hstack([x_test_tfidf, df_test.drop(['id','text'], axis=1).values])
print(x_test_tfidf)




# Polish final secitons for model


In [9]:
X_train = combined_x
X_test = combined_test_x
y_train = df['generated']


# Build Model and Test

In [10]:
# Initialize the Support Vector Classifier
model = SVC(probability=True)
model.fit(X_train, y_train)

In [11]:

# Convert the COO matrix to CSR format
X_test_csr = X_test.tocsr()

# Slice the first 5008 features
X_test_sliced = X_test_csr[:, :5008]

# Now you can use predict_proba on the sliced data
probabilities = model.predict_proba(X_test_sliced)[:, 1]

# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': df_test['id'],  # Ensure this is the correct ID column from your test set
    'generated': probabilities
})

# Save to CSV file
submission_filename = 'submission.csv'
submission_df.to_csv(submission_filename, index=False)

ValueError: X has 5008 features, but SVC is expecting 5009 features as input.

In [None]:
submission_df

NameError: name 'submission_df' is not defined