In [None]:
# Import necessary libraries
import pandas as pd

# Load the balanced dataset
# The path is relative, so the CSV is resolved against the notebook's working directory.
data_balanced = pd.read_csv('balanced_dataset.csv')
# Last expression of the cell: the first rows are rendered as the cell output.
data_balanced.head()


In [None]:
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop-word list from NLTK, stored as a set for fast membership tests.
# Read as a global by get_common_words in this cell.
stop_words = set(stopwords.words('english'))

# Tokenize and count common words for each class
def get_common_words(texts, n=20, stopword_set=None):
    """Return the ``n`` most frequent non-stop-word tokens across ``texts``.

    Tokens are produced by whitespace splitting and lower-cased before they
    are compared against the stop-word set and counted; punctuation attached
    to a token is kept as part of that token.

    Args:
        texts: Iterable of documents. Entries that are not ``str`` (for
            example NaN from missing CSV cells) are skipped instead of
            raising ``AttributeError``.
        n: Number of (token, count) pairs to return.
        stopword_set: Optional collection of lower-case tokens to ignore.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        A list of ``(token, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    if stopword_set is None:
        stopword_set = stop_words

    counts = Counter()
    for text in texts:
        if not isinstance(text, str):
            continue  # missing / non-text rows carry no tokens
        # Lower-case each token once and reuse it for both the filter and the count.
        lowered = (token.lower() for token in text.split())
        counts.update(token for token in lowered if token not in stopword_set)
    return counts.most_common(n)

# Rows with label 0 are treated as human-written and rows with label 1 as
# AI-generated; pull the 'text' column for each group with a boolean mask.
human_texts = data_balanced.loc[data_balanced['label'] == 0, 'text']
ai_texts = data_balanced.loc[data_balanced['label'] == 1, 'text']

# Most frequent non-stop-word tokens per class (default top 20).
common_human_words = get_common_words(human_texts)
print("Most common words in human-written text:", common_human_words)

common_ai_words = get_common_words(ai_texts)
print("Most common words in AI-generated text:", common_ai_words)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample a subset to avoid memory issues
# Cap the sample at the number of available rows: Series.sample raises a
# ValueError when asked for more rows than exist (without replacement).
sample_size = min(5000, len(data_balanced))  # Adjust if needed
sample_data = data_balanced['text'].sample(sample_size, random_state=42)

# Define the vectorizer with reduced n-gram range and max features
vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=500, stop_words='english')  # Bigrams only, limited to 500 features

# Fit and transform the sample data
X = vectorizer.fit_transform(sample_data)

# Get top n-grams with counts
ngram_counts = X.sum(axis=0).A1  # column sums as a flat array: total occurrences per bigram
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and only fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    ngram_features = vectorizer.get_feature_names_out()
else:
    ngram_features = vectorizer.get_feature_names()
# Convert to plain str/int so the printed list is free of NumPy scalar reprs.
ngram_totals = [(str(feature), int(count)) for feature, count in zip(ngram_features, ngram_counts)]
top_ngrams = sorted(ngram_totals, key=lambda x: x[1], reverse=True)[:20]

# Display the top 20 n-grams
print("Top 20 n-grams:", top_ngrams)


In [None]:
# 'word_count' and 'sentence_count' are expected to already exist in the loaded
# CSV (computed in the previous notebook). Print describe() per label for each.
for heading, column in (("Word Count by Class:", 'word_count'),
                        ("Sentence Count by Class:", 'sentence_count')):
    print(heading)
    print(data_balanced.groupby('label')[column].describe())


In [None]:
# Define a function to count punctuation
import string

def count_punctuation(text):
    """Count how often each ASCII punctuation character occurs in ``text``.

    Returns a dict with one key per character of ``string.punctuation``
    (in that order); characters that never occur map to 0.
    """
    tallies = {}
    for mark in string.punctuation:
        tallies[mark] = text.count(mark)
    return tallies

# Store one punctuation-count dict per row.
data_balanced['punctuation_counts'] = data_balanced['text'].apply(count_punctuation)


def _mean_punctuation(count_dicts):
    """Expand a group's count dicts into a frame and average each character column."""
    return pd.DataFrame(count_dicts.tolist()).mean()


# Average usage of every punctuation character within each label.
punctuation_summary = data_balanced.groupby('label')['punctuation_counts'].apply(_mean_punctuation)
print("Average punctuation usage by class:")
print(punctuation_summary)


In [None]:

from textblob import TextBlob
import pandas as pd

# Sample data for demonstration, using the balanced dataset
# NOTE(review): this re-reads the CSV and rebinds `data_balanced`, so columns added to the
# in-memory frame by earlier cells (e.g. 'punctuation_counts') are not carried forward.
data_balanced = pd.read_csv('balanced_dataset.csv')  # Load the balanced dataset

# Count coarse part-of-speech categories in a text
def get_pos_counts(text):
    """Tally nouns, verbs, adjectives and adverbs found by TextBlob's tagger.

    Each tag is matched by prefix: ``NN*`` -> "NOUN", ``VB*`` -> "VERB",
    ``JJ*`` -> "ADJ", ``RB*`` -> "ADV". Tags outside these families are ignored.

    Returns:
        dict with the keys "NOUN", "VERB", "ADJ" and "ADV" (always present).
    """
    prefix_to_bucket = (("NN", "NOUN"), ("VB", "VERB"), ("JJ", "ADJ"), ("RB", "ADV"))
    tallies = {bucket: 0 for _, bucket in prefix_to_bucket}
    for _, tag in TextBlob(text).tags:
        for prefix, bucket in prefix_to_bucket:
            if tag.startswith(prefix):
                tallies[bucket] += 1
                break  # a tag belongs to at most one bucket
    return tallies

# Tag every text once; the raw per-row dictionaries are kept in 'pos_counts'.
pos_columns = ["NOUN", "VERB", "ADJ", "ADV"]
data_balanced["pos_counts"] = data_balanced["text"].apply(get_pos_counts)

# Expand the dictionaries into one numeric column per category and append them.
# The column-wise concat aligns on the index; both frames carry the default
# integer index because the dataset was freshly read in this cell.
pos_df = pd.json_normalize(data_balanced["pos_counts"])
data_balanced = pd.concat([data_balanced, pos_df], axis=1)

# Mean count of each category within each label.
print("Average POS counts by class:")
pos_means_by_label = data_balanced.groupby('label')[pos_columns].mean()
print(pos_means_by_label)


In [None]:
import textstat

# Add a 'readability_score' column: textstat's Flesch Reading Ease applied to each text.
data_balanced['readability_score'] = data_balanced['text'].apply(textstat.flesch_reading_ease)

# Describe readability scores by class
print("Readability Scores by Class:")
print(data_balanced.groupby('label')['readability_score'].describe())


In [None]:
from textblob import TextBlob

# Type-Token Ratio (TTR)
def ttr(text):
    """Return distinct tokens divided by total tokens for ``text``.

    Tokens come from whitespace splitting and are compared case-sensitively.
    Text without any token yields 0.
    """
    tokens = text.split()
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

# Add the per-row type-token ratio as the 'ttr' column.
data_balanced['ttr'] = data_balanced['text'].apply(ttr)

# Lexical Density (share of content-word tokens among all words)
def lexical_density(text):
    """Return the fraction of words tagged as noun, verb, adjective or adverb.

    The numerator counts every token whose tag starts with NN, VB, JJ or RB
    (repeated words count each time); the denominator is the number of words
    TextBlob extracts from the text. Text without words yields 0.
    """
    blob = TextBlob(text)
    tagged_tokens = blob.tags
    total_words = len(blob.words)
    if total_words == 0:
        return 0
    content_prefixes = ('NN', 'VB', 'JJ', 'RB')
    content_total = sum(1 for _, tag in tagged_tokens if tag.startswith(content_prefixes))
    return content_total / total_words

# Add the per-row lexical density (runs TextBlob's tagger on every text).
data_balanced['lexical_density'] = data_balanced['text'].apply(lexical_density)

# Summarize by class
# describe() is printed for both vocabulary-richness columns, grouped by label.
print("Vocabulary Richness by Class:")
print(data_balanced.groupby('label')[['ttr', 'lexical_density']].describe())



In [None]:
# Average Word Length
import numpy as np

def average_word_length(text):
    """Return the mean character length of the whitespace-separated tokens.

    Attached punctuation counts towards a token's length. Text without any
    token yields 0.
    """
    token_lengths = [len(token) for token in text.split()]
    if not token_lengths:
        return 0
    return np.mean(token_lengths)

# Add the per-row mean token length as the 'avg_word_length' column.
data_balanced['avg_word_length'] = data_balanced['text'].apply(average_word_length)


In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# English stop-word set from NLTK; read as a global by stop_word_ratio below.
# NOTE(review): the same set is also built in an earlier cell of this notebook.
stop_words = set(stopwords.words('english'))

# Stop Word Ratio
def stop_word_ratio(text):
    """Return the share of whitespace-separated tokens that are stop words.

    Each token is lower-cased and looked up in the module-level ``stop_words``
    set exactly as split, so a token with attached punctuation only matches if
    that exact string is in the set. Text without any token yields 0.
    """
    tokens = text.split()
    if len(tokens) == 0:
        return 0
    stop_tokens = [token for token in tokens if token.lower() in stop_words]
    return len(stop_tokens) / len(tokens)

# Add the per-row stop-word share as the 'stop_word_ratio' column.
data_balanced['stop_word_ratio'] = data_balanced['text'].apply(stop_word_ratio)


In [None]:
from textblob import TextBlob

# POS Diversity
def pos_diversity(text):
    """Return distinct POS tags divided by the number of tagged tokens.

    Tags come from TextBlob's tagger. Text that produces no tagged token
    yields 0.
    """
    tags = [tag for _, tag in TextBlob(text).tags]
    if not tags:
        return 0
    return len(set(tags)) / len(tags)

# Add the per-row POS-tag diversity as the 'pos_diversity' column (tags every text with TextBlob).
data_balanced['pos_diversity'] = data_balanced['text'].apply(pos_diversity)


In [None]:
# 4. Unique Word Ratio
def unique_word_ratio(text):
    """Return the number of distinct tokens divided by the number of tokens.

    Tokens are whitespace-separated and compared case-sensitively; text
    without any token yields 0.

    NOTE(review): this is the same computation as ``ttr`` defined earlier in
    this notebook, so both produce identical values for the same text.
    """
    tokens = text.split()
    token_total = len(tokens)
    return len(set(tokens)) / token_total if token_total else 0

# Add the per-row unique-word ratio.
# NOTE(review): unique_word_ratio computes the same value as ttr, so this column equals 'ttr'.
data_balanced['unique_word_ratio'] = data_balanced['text'].apply(unique_word_ratio)


In [None]:
# 6. Word Entropy
def word_entropy(text):
    """Return the Shannon entropy (in bits) of the token distribution of ``text``.

    Tokens are whitespace-separated and case-sensitive. With p_i the relative
    frequency of token i, the result is ``-sum(p_i * log2(p_i))``.

    Returns:
        A non-negative float. Text without tokens, or text whose tokens are
        all identical, yields exactly ``0.0`` (never ``-0.0``).
    """
    word_freq = Counter(text.split())
    total_words = sum(word_freq.values())
    if total_words == 0:
        return 0.0
    entropy = -sum((freq / total_words) * np.log2(freq / total_words) for freq in word_freq.values())
    # A single distinct token gives -(1 * log2(1)) == -0.0; adding 0.0 turns the
    # negative zero into +0.0 and leaves every other value unchanged.
    return entropy + 0.0

# Add the per-row token entropy as the 'word_entropy' column.
data_balanced['word_entropy'] = data_balanced['text'].apply(word_entropy)

In [None]:
# 3. Readability Indices
# Add advanced readability metrics
import textstat
# Each column holds the result of the corresponding textstat function applied to the row's text.
data_balanced['gunning_fog'] = data_balanced['text'].apply(textstat.gunning_fog)
data_balanced['smog_index'] = data_balanced['text'].apply(textstat.smog_index)


In [None]:

# 2. Bigram and Trigram Counts
# Two vectorizers that differ only in n-gram order; each keeps its 500 most
# frequent n-grams after English stop-word removal.
shared_options = {'stop_words': 'english', 'max_features': 500}
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2), **shared_options)
vectorizer_trigram = CountVectorizer(ngram_range=(3, 3), **shared_options)

# Fit on the full text column (not a sample) and build the document-term matrices.
corpus = data_balanced['text']
bigram_matrix = vectorizer_bigram.fit_transform(corpus)
trigram_matrix = vectorizer_trigram.fit_transform(corpus)

# Row sums: how many occurrences of the retained n-grams each document contains.
data_balanced['bigram_count'] = bigram_matrix.sum(axis=1).A1
data_balanced['trigram_count'] = trigram_matrix.sum(axis=1).A1

In [None]:
def semantic_density(text):
    """Return distinct whitespace tokens divided by the mean sentence length.

    Sentences are the non-blank fragments obtained by splitting on '.'; their
    length is measured in whitespace-separated tokens. The numerator counts
    distinct tokens of the whole text (case-sensitive, punctuation attached).
    Text without any non-blank sentence yields 0.
    """
    sentence_lengths = []
    for fragment in text.split('.'):
        fragment = fragment.strip()
        if fragment:
            sentence_lengths.append(len(fragment.split()))

    if not sentence_lengths:
        return 0

    mean_length = np.mean(sentence_lengths)
    if mean_length > 0:
        return len(set(text.split())) / mean_length
    return 0

# Add the per-row semantic density as the 'semantic_density' column.
data_balanced['semantic_density'] = data_balanced['text'].apply(semantic_density)


In [None]:
def avg_sentence_complexity(text):
    """Return the mean number of comma-separated clauses per sentence.

    Sentences are the fragments obtained by splitting on '.'; a sentence with
    k commas counts as k + 1 clauses. Blank fragments (for example the empty
    string that follows a final period) are ignored so they are not counted
    as extra one-clause sentences.

    Returns:
        The mean clause count, or 0 when the text has no non-blank sentence.
    """
    sentences = [fragment for fragment in text.split('.') if fragment.strip()]
    if not sentences:
        return 0
    return np.mean([sentence.count(',') + 1 for sentence in sentences])

# Add the per-row mean clause count as the 'avg_sentence_complexity' column.
data_balanced['avg_sentence_complexity'] = data_balanced['text'].apply(avg_sentence_complexity)


In [None]:
def burstiness(text):
    """Return the coefficient of variation of sentence lengths in ``text``.

    Sentences are the fragments obtained by splitting on '.', measured in
    whitespace-separated tokens. Blank fragments (for example the empty string
    that follows a final period) are ignored so they do not add zero-length
    sentences that would inflate the deviation and lower the mean.

    Returns:
        Population standard deviation divided by the mean sentence length, or
        0 when the text has no non-blank sentence.
    """
    sentence_lengths = [
        len(fragment.split()) for fragment in text.split('.') if fragment.strip()
    ]
    if not sentence_lengths:
        return 0
    mean_length = np.mean(sentence_lengths)
    std_dev = np.std(sentence_lengths)
    return std_dev / mean_length if mean_length > 0 else 0

# Add the per-row sentence-length variation as the 'burstiness' column.
data_balanced['burstiness'] = data_balanced['text'].apply(burstiness)


In [None]:
# Persist the frame with all engineered columns; index=False leaves the row index out of the file.
# NOTE(review): the 'pos_counts' column holds dicts, which will be written as their text form — confirm downstream readers expect that.
data_balanced.to_csv('processed_data_with_features.csv', index=False)
