In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    if files:
        # One path per line, in the order os.walk reports them.
        print('\n'.join(os.path.join(root, name) for name in files))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Read the 7-prompt essay training CSV from the Kaggle input mount
file_path = '/kaggle/input/llm-7-prompt-training-dataset/train_essays_7_prompts.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output
data.head(n=5)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# NLTK resources
# word_tokenize needs the 'punkt' models and stopwords.words needs the
# 'stopwords' corpus, so fetch both BEFORE the first tokenizer call below;
# otherwise a fresh environment can raise LookupError.
nltk.download('punkt')
nltk.download('stopwords')

# Basic Data Analysis
# Distribution of labels
label_counts = data['label'].value_counts()

# Length of essays (number of NLTK tokens per essay, punctuation included)
data['essay_length'] = data['text'].apply(lambda x: len(word_tokenize(x)))

# Text Data Analysis
# Most common words
stop_words = set(stopwords.words('english'))

# Tokenize the whole lower-cased corpus at once, then keep only alphanumeric
# tokens that are not English stopwords.
all_words = ' '.join(data['text']).lower()
all_words_tokens = word_tokenize(all_words)
filtered_words = [word for word in all_words_tokens if word.isalnum() and word not in stop_words]
word_freq = Counter(filtered_words)

# Visualization
# Histogram (with KDE overlay) of the per-essay token counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['essay_length'], kde=True, ax=ax)
ax.set_title('Distribution of Essay Lengths')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
plt.show()

# Horizontal bar chart of the 20 most frequent words
common_words_df = pd.DataFrame(word_freq.most_common(20), columns=['Word', 'Frequency'])

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Frequency', y='Word', data=common_words_df, ax=ax)
ax.set_title('Top 20 Most Common Words')
ax.set_xlabel('Frequency')
ax.set_ylabel('Word')
plt.show()

# Word cloud built from the same filtered token list
cloud_text = ' '.join(filtered_words)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(cloud_text)

fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Essay Texts')
plt.show()


In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer

# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Lower-case ``text`` and split it into word tokens.

    Every run of non-word characters (anything not matched by ``\\w``) is
    collapsed into a single space before splitting on whitespace, so
    punctuation never appears in the result.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens; empty when ``text`` has no word characters.
    """
    lowered = text.lower()
    normalized = re.sub(r'\W+', ' ', lowered)
    return normalized.split()

# Basic Data Analysis
# How many essays carry each label
label_counts = data['label'].value_counts()

# Tokenize every essay once and reuse the result for both statistics below
essay_tokens = data['text'].map(clean_and_tokenize)

# Length of essays, measured in regex word tokens
data['essay_length'] = essay_tokens.map(len)

# Text Data Analysis
# Flatten all tokens into one list and count them.
# Note: unlike the NLTK-based cell, no stopword filtering happens here.
filtered_words = [token for tokens in essay_tokens for token in tokens]
word_freq = Counter(filtered_words)

# Visualization
# Human-readable names for the label codes. The dataset encodes
# student-written essays as 0 and LLM-generated essays as 1; the previous
# tick labels had the two swapped.
label_names = {0: 'Human-generated', 1: 'AI-generated'}

# Label distribution (wedges are named instead of showing raw 0/1 codes)
plt.figure(figsize=(8, 8))
plt.pie(
    label_counts,
    labels=[label_names.get(code, code) for code in label_counts.index],
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Distribution of AI-generated vs Human-generated Essays')
plt.axis('equal')
plt.show()

# Average essay length by label
avg_essay_length = data.groupby('label')['essay_length'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.barplot(x='label', y='essay_length', data=avg_essay_length)
plt.title('Average Essay Length by Label')
plt.xlabel('Label')
plt.ylabel('Average Number of Words')
# Derive the tick text from the label codes actually present (groupby returns
# them in ascending order) rather than hard-coding a positional list.
plt.xticks(
    range(len(avg_essay_length)),
    [label_names.get(code, code) for code in avg_essay_length['label']],
)
plt.show()

# Common words by label
# The dataset encodes student-written essays as 0 and LLM-generated essays as
# 1; the previous selection had the two classes swapped.
ai_text = ' '.join(data[data['label'] == 1]['text'])
human_text = ' '.join(data[data['label'] == 0]['text'])

ai_words = clean_and_tokenize(ai_text)
human_words = clean_and_tokenize(human_text)

ai_word_freq = Counter(ai_words)
human_word_freq = Counter(human_words)

common_ai_words_df = pd.DataFrame(ai_word_freq.most_common(20), columns=['Word', 'Frequency'])
common_human_words_df = pd.DataFrame(human_word_freq.most_common(20), columns=['Word', 'Frequency'])

# One horizontal bar chart per class: AI-generated first, then human-generated
for words_df, chart_title in (
    (common_ai_words_df, 'Top 20 Most Common Words in AI-generated Essays'),
    (common_human_words_df, 'Top 20 Most Common Words in Human-generated Essays'),
):
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Frequency', y='Word', data=words_df)
    plt.title(chart_title)
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.show()

# Bigrams
# Count every two-word sequence across all essays, ignoring English stopwords
vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
bigrams_matrix = vectorizer.fit_transform(data['text'])
bigrams = vectorizer.get_feature_names_out()

# Column sums give the corpus-wide count of each bigram; .A1 flattens to 1-D
bigram_freq = bigrams_matrix.sum(axis=0).A1

# Keep the 20 most frequent bigrams
bigram_freq_df = (
    pd.DataFrame({'Bigram': bigrams, 'Frequency': bigram_freq})
    .sort_values(by='Frequency', ascending=False)
    .head(20)
)

# Bar chart for the frequency of bigrams
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Frequency', y='Bigram', data=bigram_freq_df, ax=ax)
ax.set_title('Top 20 Most Common Bigrams in Essays')
ax.set_xlabel('Frequency')
ax.set_ylabel('Bigram')
plt.show()


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load the dataset
file_path = '/kaggle/input/llm-7-prompt-training-dataset/train_essays_7_prompts.csv'
data = pd.read_csv(file_path)
y = data['label']

# Split the Data
# Split the RAW text before any fitting so the held-out essays cannot
# influence the TF-IDF vocabulary or IDF weights (avoids test-set leakage).
text_train, text_test, y_train, y_test = train_test_split(
    data['text'], y, test_size=0.2, random_state=42
)

# Preprocess the Data
# Fit the vectorizer on the training essays only, then apply it to the test essays.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Full-corpus feature matrix, kept so the name `X` remains available;
# it is produced with the train-fitted vectorizer.
X = tfidf_vectorizer.transform(data['text'])

# Train the Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the Model
# Hard class predictions for the held-out split
y_pred = model.predict(X_test)

# Scalar metrics, the confusion matrix and the per-class text report
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display the results
summary_lines = [
    "Model Performance:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
]
print("\n".join(summary_lines))
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Vocabulary terms, in the same column order as the TF-IDF matrix
feature_names = tfidf_vectorizer.get_feature_names_out()

# One learned weight per term (first row of the coefficient matrix)
coefficients = model.coef_[0]

# Pair each term with its weight and the magnitude of that weight
feature_importances = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
feature_importances['AbsCoefficient'] = feature_importances['Coefficient'].abs()

# Keep the 20 terms with the largest absolute weight
feature_importances = feature_importances.sort_values(by='AbsCoefficient', ascending=False).iloc[:20]

# Plot the signed coefficients of those terms
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='Coefficient', y='Feature', data=feature_importances, ax=ax)
ax.set_title('Top 20 Most Important Features')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve

# Confusion Matrix
# Rows are actual classes and columns are predicted classes, both in
# ascending label order (0, then 1).
conf_matrix = confusion_matrix(y_test, y_pred)

# The dataset encodes student-written essays as 0 and LLM-generated essays as
# 1, so label 0 is the first tick; the previous tick labels were swapped.
class_names = ['Human-generated', 'AI-generated']

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
# ROC Curve
# Score each held-out essay with the predicted probability of class 1
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal as a visual reference line
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()


In [None]:
# Precision-Recall Curve
# Note: this rebinds `precision` and `recall` (scalars from the evaluation
# cell) to the per-threshold arrays returned by precision_recall_curve.
precision, recall, _ = precision_recall_curve(y_test, y_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, color='blue', lw=2)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve

# This cell repeats the confusion-matrix, ROC and precision-recall plots of
# the three preceding cells in a single place.

# Confusion Matrix
# Rows are actual classes and columns are predicted classes, both in
# ascending label order (0, then 1).
conf_matrix = confusion_matrix(y_test, y_pred)

# The dataset encodes student-written essays as 0 and LLM-generated essays as
# 1, so label 0 is the first tick; the previous tick labels were swapped.
class_names = ['Human-generated', 'AI-generated']

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve
# Score each held-out essay with the predicted probability of class 1
y_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal as a visual reference line
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()

# Precision-Recall Curve
# Note: this rebinds `precision` and `recall` (scalars from the evaluation
# cell) to the per-threshold arrays returned by precision_recall_curve.
precision, recall, _ = precision_recall_curve(y_test, y_proba)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()


In [None]:
import pandas as pd

# Read the "v2" variant of the training CSV.
# Note: this rebinds `data`; the model above was fitted on the non-v2 file.
file_path = '/kaggle/input/llm-7-prompt-training-dataset/train_essays_7_prompts_v2.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Function to get sample essays
def get_sample_essays(data, label, num_samples=3, random_state=42):
    """Return the text of randomly drawn essays that carry ``label``.

    Args:
        data: DataFrame with a 'label' column and a 'text' column.
        label: Value of the 'label' column to filter on.
        num_samples: How many essays to draw (sampling is without replacement).
        random_state: Seed forwarded to ``DataFrame.sample``. The default of
            42 reproduces the previously hard-coded behavior.

    Returns:
        A list with the 'text' value of each sampled row.

    Raises:
        ValueError: Propagated from pandas when ``num_samples`` exceeds the
            number of rows that match ``label``.
    """
    matching_rows = data[data['label'] == label]
    samples = matching_rows.sample(n=num_samples, random_state=random_state)
    return samples['text'].tolist()

# The dataset encodes student-written essays as 0 and LLM-generated essays as
# 1; the previous calls sampled the opposite class under each heading.

# Get sample AI-generated essays
ai_generated_samples = get_sample_essays(data, label=1, num_samples=3)
print("Sample AI-generated Essays:")
for i, essay in enumerate(ai_generated_samples, 1):
    print(f"\nEssay {i}:\n{essay}\n")

# Get sample human-generated essays
human_generated_samples = get_sample_essays(data, label=0, num_samples=3)
print("Sample Human-generated Essays:")
for i, essay in enumerate(human_generated_samples, 1):
    print(f"\nEssay {i}:\n{essay}\n")


In [None]:
# Function to preprocess and predict
def predict_essay(text):
    """Classify a single essay with the fitted TF-IDF vectorizer and model.

    Relies on the module-level ``tfidf_vectorizer`` and ``model`` objects
    created by the training cell.

    Args:
        text: The essay as one string.

    Returns:
        'Human-generated' when the model predicts label 0 and 'AI-generated'
        when it predicts label 1.
    """
    # Vectorize the text with the already-fitted vocabulary (no re-fitting)
    text_transformed = tfidf_vectorizer.transform([text])

    # Predict the label for the single-row matrix
    prediction = model.predict(text_transformed)

    # The dataset encodes student-written essays as 0 and LLM-generated essays
    # as 1; the previous mapping had the two names swapped.
    label_map = {0: 'Human-generated', 1: 'AI-generated'}
    return label_map[prediction[0]]

# Example usage: classify one hard-coded essay and print the predicted class.
# NOTE(review): despite the variable name, this text contains many misspellings
# ("reconise", "thikn", "procces") that look like student writing — confirm
# which class this sample actually belongs to in the dataset.
example_essay_ai = """INTRODUCTION

The Mona Lisa is one paint that is famous in the world, according to some new computer software can reconise emotions in people, this system is a innovation of Prof. Thomas Haung, of the Beckman and is working with Prof. Nicu Sebe, they will have help from Dr. Paul Eckman institute of Advanced Science at the University of Illinois man, creator crator of FACS.

BODY

This system has some questions that people thikn, howcan a computer recognize subtle facial movements we humans use to express how we feel? Dr. Haung said that the procces begins when the computer structs a 3-D computer model of the face; all 44 major muscles in the model must move like human muscles. Dr. Eckman has classified six basic emotions- happiness, surprise, anger, disgust, fear, and sadness - and then assosiated each with characteristic movements of the facial muscles. "The facial expressions for each emotion are universal" observes Dr. Haung.

A classroom computer could recognize when a student is becoming confused or bored, Dr. Haung predicts. Then it could modify the lesson, like an effective human instructor. Most human communication is nonverval, including emotional communication, notes Dr. Haung. According to the Facila Feedback Theory of Emotion, muving your facial muscles not only expresses emotions, but also help to produce them. whoever thought that making faces could reveal so much about the scien of emotions.

CONCLUSION

In conclusion i tink that this program fantastic because can know how we feal, it identify human emotions just looking our face, it is a nice system because i can know if one person that a love is sad, happy, angry, etc. i thik that this system is goin to be important in the future."""
# Run the essay through the fitted vectorizer + model and report the label name
prediction = predict_essay(example_essay_ai)
print(f"The essay is predicted to be: {prediction}")

In [None]:
# Second example: classify another hard-coded essay and print the predicted class.
# NOTE(review): despite the variable name, this text contains template
# placeholders ("[Last Name]", "[Your Name]") that look machine-generated —
# confirm which class this sample actually belongs to in the dataset.
example_essay_human = """Dear Senator [Last Name],

I am writing to discuss the current method the United States uses to elect the president: the Electoral College. Some citizens, including me, have expressed concerns about this system, and there seems to be an increasing sentiment in favor of shifting to a popular vote method. Through this letter, I wish to voice my reasons for supporting this shift, while also giving due weight to counterarguments.

The Electoral College, as explained by the Office of the Federal Register, is an indirect voting system where citizens vote for electors, who then elect the President (Text 1, Paragraph 2). However, as Bradford Plumer argues in his article, "The Indefensible Electoral College", this method can lead to results that do not align with the popular vote, causing a president to be elected with minority support (Text 9). Plumer moreover points out the "disaster factor," wherein potential issues with electors, or an electoral tie, could significantly disrupt the election process (Text 11, Text 12).

On the opposing side, Judge Richard A. Posner provides several practical reasons in defense of the Electoral College. He mentions its function in ensuring a candidate's appeal beyond specific regions, focusing attention on the most thoughtful and deciding voters, balancing the political weight of large states, and avoiding possibilities of run-off elections (Text 15-22). While these are valid points, I contend they do not outweigh the fundamental principle of democracy that each citizen's vote should hold equal influence.

It is unfair, as Plumer suggests, that due to the winner-take-all system, residents in certain states are seldom prioritized by the candidates, who instead focus on ' swing' states (Text 13). This reality disengages many potential voters who feel their votes will not count. The Electoral College does not incentivize the candidates to broaden their appeal to voters in “safe” states.

Furthermore, the current system's opacity often confuses voters, leading to accidental voting for the wrong candidate (Text 10). In contrast, a direct popular vote method is comparatively more transparent and empowering for the citizens, promoting a more inclusive democratic process.

I acknowledge that removing the Electoral College, fixed in the Constitution, entails a significant and complex alteration in our nation's structure, and therefore should not be hastily enacted without thorough examination and debate. However, I implore you to consider promoting discussions on this potential transition towards direct popular vote to ensure that every citizen's voice is equally valued in our democracy.

I look forward to your thoughtful consideration on this matter.

Best Regards, 

[Your Name]"""
# Run the essay through the fitted vectorizer + model and report the label name
prediction = predict_essay(example_essay_human)
print(f"The essay is predicted to be: {prediction}")