# Feature extraction and Data Visualization

The following is ChatGPT's response when asked which traits could be useful for identifying AI-generated articles:

1)  Average sentence length: AI-generated articles may have longer or shorter sentences than human-written articles on average.

2)  Punctuation usage: AI-generated articles may use certain types of punctuation, such as exclamation points or ellipses, more or less frequently than human-written articles.

3)  Word frequency: AI-generated articles may use certain words more or less frequently than human-written articles. For example, AI-generated articles may use more technical or jargon-y terms, or may use certain phrases or idioms less frequently.

4)  Part of speech tagging: By analyzing the parts of speech used in the article (such as nouns, verbs, and adjectives), we may be able to identify patterns that are more common in AI-generated articles.

5)  Named entity recognition: By analyzing the named entities (such as people, places, and organizations) mentioned in the article, we may be able to identify patterns that are more common in AI-generated articles.

6)  Syntactic complexity: By analyzing the complexity of sentence structures (such as the number of dependent clauses or subordinating conjunctions used), we may be able to identify patterns that are more common in AI-generated articles.

7)  Readability score: By calculating a readability score (such as the Flesch-Kincaid readability score), we may be able to identify patterns that are more common in AI-generated articles.


In [None]:
import pandas as pd
import re
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import string
import scipy.stats as stats
import nltk
from collections import Counter
from readability import Readability

In [None]:
# Load the article dataset; the rest of this cell uses its 'Title', 'Content'
# and 'Source' columns.
df = pd.read_csv("final_dataset.csv")

# Trim leading/trailing whitespace with the NaN-safe .str accessor: calling
# x.strip() directly raises AttributeError on missing values (float NaN).
df['Content'] = df['Content'].str.strip()
df['Title'] = df['Title'].str.strip()

# Treat empty articles as missing. This runs AFTER stripping so whitespace-only
# content is caught as well, and the result is assigned back instead of using
# inplace=True through a column selection (which is not guaranteed to update df).
df['Content'] = df['Content'].replace('', np.nan)

#drop NA values
df = df.dropna()

# Select only specific columns from the dataset
selected_columns = ['Title', 'Content', 'Source']
df = df[selected_columns]


## Average Sentence Length

In [None]:
def average_sentence_length(text):
    """Return the mean number of words per sentence in ``text``.

    A sentence is a stretch of text ending in '.', '?' or '!' (optionally
    followed by closing quotes/brackets) and then whitespace or end of text.
    Splitting on that boundary, rather than matching a restricted character
    class, keeps sentences that contain quotes, colons, parentheses or decimal
    numbers intact instead of truncating or dropping them.

    Trailing text without a terminator is not counted as a sentence.

    Returns:
        float: average words per sentence, or None when no sentence is found.
    """
    closers = '"\'\u201d\u2019)]'
    terminators = ('.', '?', '!')

    # A boundary is: terminator (lookbehind), optional closers, whitespace.
    # "3.5" is not split because no whitespace follows the dot.
    boundary = rf'(?<=[.?!])[{re.escape(closers)}]*\s+'
    candidates = re.split(boundary, text.strip())

    lengths = []
    for candidate in candidates:
        # Only the final piece can lack a terminator; skip it in that case.
        if not candidate.rstrip(closers).endswith(terminators):
            continue
        # A word is a whitespace-delimited token with at least one letter or
        # digit, so stray quotes or dashes are not counted as words.
        word_count = sum(
            1 for token in candidate.split()
            if any(ch.isalnum() for ch in token)
        )
        if word_count:
            lengths.append(word_count)

    if not lengths:
        return None
    return sum(lengths) / len(lengths)

In [None]:
# Per-article feature column: mean words per sentence of the article body.
df['AvgSentenceLength'] = df['Content'].apply(average_sentence_length)

In [None]:
# Partition by label: Source == 1 -> `ai`, Source == 0 -> `real`.
# Rows of `real` with any missing value are discarded.
ai = df.loc[df['Source'] == 1]
real = df.loc[df['Source'] == 0].dropna()


# Summary statistics of the feature for each group.
ai_len = ai['AvgSentenceLength']
real_len = real['AvgSentenceLength']
print(f'Mean sentence length for ai articles: {ai_len.mean()}, std: {ai_len.std()}')
print(f'Mean sentence length for real articles: {real_len.mean()}, std: {real_len.std()}')

In [None]:
# Two histograms side by side (`ai` left, `real` right) on the same x-range.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
for panel in axes:
    panel.set_xlim(5, 40)
sns.histplot(data=ai, x='AvgSentenceLength', color="orchid", ax=axes[0]).set(title='Average Sentence Length of AI articles')
sns.histplot(data=real, x='AvgSentenceLength', color="skyblue", ax=axes[1]).set(title='Average Sentence Length of Real articles')

In [None]:
real = real.dropna()
# Independent two-sample t-test of the feature between the two groups.
# average_sentence_length returns None when it finds no sentence, so missing
# values are removed from the AI sample as well; with scipy's default NaN
# handling a single NaN in either sample turns the result into NaN.
stats.ttest_ind(ai.AvgSentenceLength.dropna(), real.AvgSentenceLength)

In [None]:
# Discard rows with any missing feature, then compute the point-biserial
# correlation between the feature and the Source label.
df = df.dropna(how='any')
stats.pointbiserialr(df['AvgSentenceLength'], df['Source'])

## Punctuation Usage

In [None]:
def calculate_punctuation_percentage(text):
    """Return the percentage of non-whitespace characters that are punctuation.

    Punctuation means the ASCII characters in ``string.punctuation``.

    Returns:
        float: value in [0, 100]; 0.0 for empty or whitespace-only text
        (previously this case raised ZeroDivisionError).
    """
    # Whitespace is excluded from the denominator.
    compact = "".join(text.split())
    if not compact:
        return 0.0

    punctuation_count = sum(ch in string.punctuation for ch in compact)
    return punctuation_count / len(compact) * 100

In [None]:
# Per-article feature column: share of non-whitespace characters that are punctuation.
df['PunctuationPercentage'] = df['Content'].apply(calculate_punctuation_percentage)

In [None]:
# Partition by label: Source == 1 -> `ai`, Source == 0 -> `real`.
ai = df.loc[df['Source'] == 1]
real = df.loc[df['Source'] == 0]


# Summary statistics of the feature for each group.
print(f'Mean punctuation percentage for ai articles: {ai["PunctuationPercentage"].mean()}, std: {ai["PunctuationPercentage"].std()}')
print(f'Mean punctuation percentage for real articles: {real["PunctuationPercentage"].mean()}, std: {real["PunctuationPercentage"].std()}')

In [None]:
# Two histograms side by side (`ai` left, `real` right) on the same x-range.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
for panel in axes:
    panel.set_xlim(0, 9)
sns.histplot(data=ai, x='PunctuationPercentage', color="orchid", ax=axes[0]).set(title='Punctuation Percentage of AI articles')
sns.histplot(data=real, x='PunctuationPercentage', color="skyblue", ax=axes[1]).set(title='Punctuation Percentage of Real articles')

In [None]:
# Independent two-sample t-test of the punctuation share between the groups;
# rows of `real` with any missing value are removed first.
real = real.dropna(how='any')
stats.ttest_ind(ai['PunctuationPercentage'], real['PunctuationPercentage'])

In [None]:
# Discard rows with any missing feature, then compute the point-biserial
# correlation between the feature and the Source label.
df = df.dropna(how='any')
stats.pointbiserialr(df['PunctuationPercentage'], df['Source'])

### More specific punctuation

In [None]:
def _punctuation_share(text, mark):
    """Return the percentage of punctuation characters in ``text`` equal to ``mark``."""
    # Whitespace is never punctuation, so the text can be scanned as-is.
    punctuation_count = sum(ch in string.punctuation for ch in text)
    if punctuation_count == 0:
        # The share is undefined without any punctuation. NaN (instead of the
        # previous ZeroDivisionError) flows through pandas' mean/std and dropna.
        return float("nan")
    return text.count(mark) / punctuation_count * 100


def calculate_period_percentage(text):
    """Return the percentage of punctuation characters that are periods.

    Punctuation means the ASCII characters in ``string.punctuation``.

    Returns:
        float: value in [0, 100], or NaN when the text has no punctuation.
    """
    return _punctuation_share(text, '.')


def calculate_comma_percentage(text):
    """Return the percentage of punctuation characters that are commas.

    Punctuation means the ASCII characters in ``string.punctuation``.

    Returns:
        float: value in [0, 100], or NaN when the text has no punctuation.
    """
    return _punctuation_share(text, ',')

In [None]:
# Share of punctuation characters that are periods / commas, per article.
df['PeriodPercentage'] = df.Content.apply(calculate_period_percentage)
df['CommaPercentage'] = df.Content.apply(calculate_comma_percentage)
# Everything that is neither a comma nor a period. Derived from the two columns
# just computed instead of re-running both functions on every article.
df['OtherPercentage'] = 100 - df['CommaPercentage'] - df['PeriodPercentage']

In [None]:
# Partition by label: Source == 1 -> `ai`, Source == 0 -> `real`.
ai = df.loc[df['Source'] == 1]
real = df.loc[df['Source'] == 0]


# Mean and standard deviation of each punctuation share, per group.
print(f'Mean period percentage for ai articles: {ai["PeriodPercentage"].mean()}, std: {ai["PeriodPercentage"].std()}')
print(f'Mean period percentage for real articles: {real["PeriodPercentage"].mean()}, std: {real["PeriodPercentage"].std()}\n')
print(f'Mean comma percentage for ai articles: {ai["CommaPercentage"].mean()}, std: {ai["CommaPercentage"].std()}')
print(f'Mean comma percentage for real articles: {real["CommaPercentage"].mean()}, std: {real["CommaPercentage"].std()}\n')
print(f'Mean other percentage for ai articles: {ai["OtherPercentage"].mean()}, std: {ai["OtherPercentage"].std()}')
print(f'Mean other percentage for real articles: {real["OtherPercentage"].mean()}, std: {real["OtherPercentage"].std()}')

In [None]:
# Two histograms side by side (`ai` left, `real` right) on the same x-range.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
for panel in axes:
    panel.set_xlim(10, 80)
sns.histplot(data=ai, x='PeriodPercentage', color="orchid", ax=axes[0]).set(title='Period Percentage of AI articles')
sns.histplot(data=real, x='PeriodPercentage', color="skyblue", ax=axes[1]).set(title='Period Percentage of Real articles')

In [None]:
# Two histograms side by side (`ai` left, `real` right) on the same x-range.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
for panel in axes:
    panel.set_xlim(0, 70)
sns.histplot(data=ai, x='CommaPercentage', color="orchid", ax=axes[0]).set(title='Comma Percentage of AI articles')
sns.histplot(data=real, x='CommaPercentage', color="skyblue", ax=axes[1]).set(title='Comma Percentage of Real articles')

In [None]:
# Two histograms side by side (`ai` left, `real` right) on the same x-range.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
for panel in axes:
    panel.set_xlim(0, 70)
sns.histplot(data=ai, x='OtherPercentage', color="orchid", ax=axes[0]).set(title='Other Percentage of AI articles')
sns.histplot(data=real, x='OtherPercentage', color="skyblue", ax=axes[1]).set(title='Other Percentage of Real articles')

In [None]:
# One dropna is enough: repeating it on an already-clean frame changes nothing.
real = real.dropna()
# Independent two-sample t-test per punctuation share. Missing values are also
# removed from the AI sample so a single NaN cannot turn the result into NaN.
for column in ('PeriodPercentage', 'CommaPercentage', 'OtherPercentage'):
    print(stats.ttest_ind(ai[column].dropna(), real[column]))

## Vocabulary Frequency

Uses the type-token ratio (TTR): the number of unique words divided by the total number of words.

In [None]:
def vocabulary_richness(text):
    """Return the type-token ratio of ``text`` as a percentage.

    The ratio is (number of distinct words) / (total number of words) * 100.
    Tokens come from ``nltk.word_tokenize``. Only tokens containing at least
    one letter or digit count as words, and they are lower-cased first, so
    punctuation tokens and capitalisation variants ("The" / "the") no longer
    distort the ratio.

    Returns:
        The ratio in (0, 100], or 0 when the text contains no words.
    """
    words = [
        token.lower()
        for token in nltk.word_tokenize(text)
        if any(ch.isalnum() for ch in token)
    ]

    if not words:
        return 0

    # types = distinct words, tokens = all words
    return len(set(words)) / len(words) * 100

In [None]:
# Per-article feature column: type-token ratio of the article body.
df['VocabRichness'] = df['Content'].apply(vocabulary_richness)

In [None]:
# Partition by label: Source == 1 -> `ai`, Source == 0 -> `real`.
ai = df.loc[df['Source'] == 1]
real = df.loc[df['Source'] == 0]


# Summary statistics of the feature for each group.
print(f'Mean Vocab richness percentage for ai articles: {ai["VocabRichness"].mean()}, std: {ai["VocabRichness"].std()}')
print(f'Mean Vocab richness percentage for real articles: {real["VocabRichness"].mean()}, std: {real["VocabRichness"].std()}')

In [None]:
# Two histograms side by side (`ai` left, `real` right) on the same x-range.
f, axes = plt.subplots(1, 2, figsize=(9,4))
# VocabRichness is a percentage (the ratio is multiplied by 100), so the axis
# must span 0-100; the previous (0, 9) limit was copied from the punctuation
# plot and cut off every value above 9.
axes[0].set_xlim(0, 100)
axes[1].set_xlim(0, 100)
sns.histplot(ai, x='VocabRichness', color="orchid", ax=axes[0]).set(title='Vocabulary Richness of AI articles')
sns.histplot(real, x='VocabRichness', color="skyblue", ax=axes[1]).set(title='Vocabulary Richness of Real articles')

In [None]:
# Independent two-sample t-test of vocabulary richness between the groups;
# rows of `real` with any missing value are removed first.
real = real.dropna(how='any')
stats.ttest_ind(ai['VocabRichness'], real['VocabRichness'])

In [None]:
# Discard rows with any missing feature, then compute the point-biserial
# correlation between the feature and the Source label.
df = df.dropna(how='any')
stats.pointbiserialr(df['VocabRichness'], df['Source'])

## Parts of Speech

In [None]:
def count_parts_of_speech(text):
    """Return how often each major part of speech occurs, per word of ``text``.

    The text is tokenized and tagged with NLTK. Tags are grouped by prefix:
    NN* -> noun, VB* -> verb, RB* -> adverb, PRP* -> pronoun, JJ* -> adjective.
    Other tags are ignored.

    Each count is divided by the number of whitespace-delimited words.
    ``text.split()`` is used for that count: the previous ``text.split(' ')``
    did not split on newlines or tabs and counted empty strings between
    repeated spaces, which skewed the denominator.

    Returns:
        dict: keys 'noun', 'verb', 'adverb', 'pronoun', 'adjective' mapped to
        fractions (not percentages); all 0.0 when the text has no words.
    """
    prefix_to_category = (
        ('NN', 'noun'),
        ('VB', 'verb'),
        ('RB', 'adverb'),
        ('PRP', 'pronoun'),
        ('JJ', 'adjective'),
    )
    totals = {category: 0 for _, category in prefix_to_category}

    # Tag every token and add it to the first category whose prefix matches.
    for _, tag in nltk.pos_tag(nltk.word_tokenize(text)):
        for prefix, category in prefix_to_category:
            if tag.startswith(prefix):
                totals[category] += 1
                break

    wc = len(text.split())
    if wc == 0:
        return {category: 0.0 for category in totals}

    return {category: count / wc for category, count in totals.items()}

In [None]:
# Temporary column holding the part-of-speech frequency dict of each article.
df['pos'] = df['Content'].apply(count_parts_of_speech)

In [None]:
# Expand the per-article dict into one numeric column per part of speech,
# then discard the temporary dict column.
for column_name, dict_key in (
    ('Noun', 'noun'),
    ('Verb', 'verb'),
    ('Adverb', 'adverb'),
    ('Pronoun', 'pronoun'),
    ('Adjective', 'adjective'),
):
    df[column_name] = df['pos'].apply(lambda freqs, dict_key=dict_key: freqs[dict_key])
df = df.drop(columns=['pos'])

In [None]:
# Partition by label: Source == 1 -> `ai`, Source == 0 -> `real`.
ai = df[df.Source == 1]
real = df[df.Source == 0]

# The part-of-speech columns hold counts divided by the word count (they are
# not multiplied by 100), so they are reported as fractions, not percentages.
for pos_name in ('Noun', 'Verb', 'Adverb', 'Pronoun', 'Adjective'):
    print(f'Mean {pos_name} fraction for ai articles: {ai[pos_name].mean()}, std: {ai[pos_name].std()}')
    print(f'Mean {pos_name} fraction for real articles: {real[pos_name].mean()}, std: {real[pos_name].std()}\n')

## Named entity recognition

In [None]:
def count_named_entities(text):
    """Return the number of named-entity mentions per 100 words of ``text``.

    Sentences are tokenized and POS-tagged with NLTK, then chunked with
    ``nltk.ne_chunk_sents(..., binary=True)``. Every subtree labelled 'NE'
    counts as one mention; repeated mentions of the same entity are all counted.

    The denominator is the number of whitespace-delimited words. It uses
    ``text.split()``: the previous ``text.split(" ")`` did not split on
    newlines or tabs and counted empty strings between repeated spaces.

    Returns:
        float: mentions / words * 100, or 0.0 when the text has no words.
    """
    # Tokenize into sentences, then words, and tag each word with its part of speech.
    tagged_sentences = [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(text)
    ]

    # binary=True labels every entity simply as 'NE' (no PERSON/ORG/... types).
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    # Leaves of a chunked sentence are (word, tag) tuples without .label();
    # only subtrees carry a label.
    mention_count = sum(
        1
        for chunked_sent in chunked_sentences
        for node in chunked_sent
        if hasattr(node, 'label') and node.label() == 'NE'
    )

    word_count = len(text.split())
    if word_count == 0:
        return 0.0

    return mention_count / word_count * 100


In [None]:
# Per-article feature column: named-entity mentions relative to word count.
df['NamedEntities'] = df['Content'].apply(count_named_entities)

In [None]:
# Partition by label: Source == 1 -> `ai`, Source == 0 -> `real`.
ai = df.loc[df['Source'] == 1]
real = df.loc[df['Source'] == 0]

# Summary statistics of the feature for each group (real first, then ai).
print(f'Mean Named Entities Frequency for real articles: {real["NamedEntities"].mean()}, std: {real["NamedEntities"].std()}')
print(f'Mean Named Entities Frequency for ai articles: {ai["NamedEntities"].mean()}, std: {ai["NamedEntities"].std()}')

In [None]:
# Two histograms side by side (`ai` left, `real` right) on the same x-range.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
for panel in axes:
    panel.set_xlim(0, 20)
sns.histplot(data=ai, x='NamedEntities', color="orchid", ax=axes[0]).set(title='Named Entities Frequency of AI articles')
sns.histplot(data=real, x='NamedEntities', color="skyblue", ax=axes[1]).set(title='Named Entities Frequency of Real articles')

In [None]:
# Independent two-sample t-test of named-entity frequency between the groups.
stats.ttest_ind(ai['NamedEntities'], real['NamedEntities'])

In [None]:
# Point-biserial correlation between named-entity frequency and the Source label.
stats.pointbiserialr(df['NamedEntities'], df['Source'])

## Syntactic complexity

## Readability score

In [None]:
def compute_readability(text):
    """Return the Flesch-Kincaid score of ``text``, or None if it is too short.

    Texts with 100 or fewer whitespace-delimited words are skipped. The length
    check uses ``text.split()``: the previous ``text.split(' ')`` did not split
    on newlines or tabs and counted empty strings between repeated spaces.

    Returns:
        The ``.score`` of ``Readability(text).flesch_kincaid()``, or None when
        the text is too short to be scored.
    """
    # Function-scope import so this cell does not depend on the import cell changing.
    from readability.exceptions import ReadabilityException

    if len(text.split()) <= 100:
        return None

    try:
        return Readability(text).flesch_kincaid().score
    except ReadabilityException:
        # The library counts words with its own tokenizer and rejects texts it
        # considers shorter than 100 words, so a text that passed the check
        # above can still be refused; treat it like any other short text.
        return None

In [None]:
# Per-article feature column: Flesch-Kincaid score (missing for short texts).
df['Readability'] = df['Content'].apply(compute_readability)

In [None]:
# Partition by label: Source == 1 -> `ai`, Source == 0 -> `real`.
ai = df.loc[df['Source'] == 1]
real = df.loc[df['Source'] == 0]

# Summary statistics of the feature for each group.
print(f'Mean Readability Score for ai articles: {ai["Readability"].mean()}, std: {ai["Readability"].std()}')
print(f'Mean Readability Score for real articles: {real["Readability"].mean()}, std: {real["Readability"].std()}')

In [None]:
# Two histograms side by side (`ai` left, `real` right) on the same x-range.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
for panel in axes:
    panel.set_xlim(0, 25)
sns.histplot(data=ai, x='Readability', color="orchid", ax=axes[0]).set(title='Readability Score of AI articles')
sns.histplot(data=real, x='Readability', color="skyblue", ax=axes[1]).set(title='Readability Score of Real articles')

In [None]:
# compute_readability returns None for short texts, so rows with missing values
# are removed from both groups before the independent two-sample t-test.
real = real.dropna(how='any')
ai = ai.dropna(how='any')
stats.ttest_ind(ai['Readability'], real['Readability'])

In [None]:
# Discard rows with any missing feature, then compute the point-biserial
# correlation between the readability score and the Source label.
df = df.dropna(how='any')
stats.pointbiserialr(df['Readability'], df['Source'])