# **Importing Python libraries & "AI vs. human text" dataset**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)'

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
import nltk
from nltk.corpus import PlaintextCorpusReader

# Read the "AI vs. human text" CSV and preview its first rows.
dataset_path = "/kaggle/input/ai-vs-human-text/AI_Human.csv"
df = pd.read_csv(dataset_path)
print(df.head())

In [None]:
# Character length of every text.
df['length'] = df['text'].apply(len)

# Human-readable class label. The `generated` flag marks AI-generated text, so
# 1 must map to 'AI' and everything else to 'Human' (the mapping used to be
# inverted). TODO confirm the 0/1 encoding against the dataset card.
df['label'] = df['generated'].apply(lambda flag: 'AI' if flag == 1 else 'Human')
df

In [None]:
# Work on a reproducible random subset of 10,000 rows.
# (Earlier experiments used df[:1000] and a 25-row sample with the same seed.)
test_df = df.sample(n=10000, random_state=28)

print(test_df)
# NOTE: despite the name, each entry is a whole lower-cased document, not a word.
all_words = [document.lower() for document in test_df["text"]]

# **Data Exploration**

In [None]:
# Column names, dtypes and non-null counts of the full dataset.
df.info()
# Number of missing values per column (displayed as the cell's output).
df.isna().sum()

In [None]:
# Report how many rows the full dataset contains.
row_total = len(df)
print("Total no. of rows: ")
print(row_total)

In [None]:
import matplotlib.pyplot as plt
# Count each label once and reuse the same Series for the bars and for their
# annotations. value_counts() orders by frequency, so annotating from a
# differently ordered Series could put a number on the wrong bar.
label_counts = df['label'].value_counts()
ax = label_counts.plot(kind='bar', color='green')
for position, count in enumerate(label_counts):
    ax.text(position, count, str(count), ha='center', va='bottom')

# Adding title and labels
plt.title('AI vs Human Text Data')
plt.xlabel('Labels')
plt.ylabel('Text Count')

# Showing the plot
plt.show()

In [None]:
#print(test_df)

In [None]:
# DOES NOT WORK FOR SAMPLING: test_df['text'][i] looks rows up by index label,
# and a sampled frame keeps the labels of the rows it was drawn from, so the
# labels 0..len-1 are generally not all present.
# The snippet is wrapped in a string literal, so none of it executes; it is
# kept only as a reference.

'''nltk.download('punkt')
tokenized_reviews = []
for i in range(len(test_df['text'])):

    if type(test_df['text'][i])==str:
        tokenized_reviews += [nltk.word_tokenize(test_df['text'][i])]
print(tokenized_reviews[0][:20])'''

In [None]:
# Works for sampling: the texts are iterated directly, never looked up by label.
# Tokenizer data for nltk.word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' one, so both are fetched
# to keep the cell working across NLTK versions.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# One token list per text; entries that are not strings (e.g. NaN) are skipped.
tokenized_reviews = [
    nltk.word_tokenize(text)
    for text in test_df['text']
    if isinstance(text, str)
]

print(tokenized_reviews[0][:20])


In [None]:
# Lower-case every token of every tokenized document.
docs_lower = [list(map(str.lower, doc)) for doc in tokenized_reviews]
print(docs_lower[0][:20])

In [None]:
import re
# Keep only the tokens that consist entirely of the letters a-z.
alpha_only = re.compile('^[a-z]+$')
docs_alpha = [[token for token in doc if alpha_only.search(token)] for doc in docs_lower]
print(docs_alpha[0][:20])

In [None]:
# Total number of alphabetic tokens across all documents.
total_words = sum(len(doc) for doc in docs_alpha)
print(total_words)

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_list = stopwords.words('english')
print(stop_list)

# Testing membership against a list rescans it for every token; a set gives
# the same answers with constant-time lookups.
stop_set = set(stop_list)
docs_stop = [[w for w in doc if w not in stop_set] for doc in docs_alpha]
print(docs_stop[0][0:20])

In [None]:
# Number of tokens that remain after stop-word removal.
words_left = sum(map(len, docs_stop))
print(words_left)

In [None]:
# Share of the alphabetic tokens that were removed as stop words.
removed_words = total_words - words_left
percent_stop = removed_words / total_words * 100
print('Percentage of stop words: {} %'.format(percent_stop))

In [None]:
# Import only the stemmer: a star import would also pull every other public
# name of nltk.stem.porter into the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Reduce every remaining token to its Porter stem, document by document.
docs_stem = [[stemmer.stem(w) for w in doc] for doc in docs_stop]

# Number of tokens that were stemmed across all documents.
scount = sum(len(doc) for doc in docs_stem)

print(docs_stem[0][0:20])
print(scount)

In [None]:
# Build ONE frequency distribution over the stems of all documents. Creating a
# fresh FreqDist per document and printing after the loop would report only
# the last document's counts (and fail outright when there are no documents).
fdist = nltk.FreqDist(stem for doc in docs_stem for stem in doc)
print(fdist.most_common(10))

# **Polarity Scores**

**Adding polarity and subjectivity scores for each text. Sentiment scores are also added; they quantify the emotion of each text in terms of positivity, negativity, and neutrality.**

In [None]:
!pip install textblob

In [None]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# VADER scores text with a lexicon that ships as a separate NLTK resource;
# fetch it so the analyzer can be constructed on a fresh environment.
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
print(analyzer)

In [None]:
sentences = test_df["text"]

# Placeholder column; a later cell fills it with the real scores.
# A scalar None is broadcast to every row on any pandas version. An empty dict
# is not a safe placeholder: depending on the pandas version it is either
# rejected for having the wrong length or aligned into an all-NaN column.
test_df['polarity_scores'] = None
test_df

In [None]:
# One VADER score dictionary per text, in row order.
polarity_scores = [analyzer.polarity_scores(text) for text in sentences]
test_df['polarity_scores'] = polarity_scores
test_df

In [None]:
def unpack_dictionary(test_df):
    """Return the ``'polarity_scores'`` entry of ``test_df`` as a ``pd.Series``.

    When used with ``DataFrame.apply(..., axis=1)`` the argument is a single
    row, so the row's score dictionary becomes a Series keyed by score name.
    """
    scores = test_df['polarity_scores']
    return pd.Series(scores)

# Turn every row's score dictionary into separate columns, append them, and
# remove the dictionary column they were built from.
new_columns = test_df.apply(unpack_dictionary, axis=1)
test_df = pd.concat([test_df, new_columns], axis=1).drop(columns=['polarity_scores'])
test_df

In [None]:
# One TextBlob subjectivity score per text, in row order.
subjectivity_scores = [TextBlob(text).subjectivity for text in sentences]
test_df['subjectivity_score'] = subjectivity_scores
test_df

# **Added document statistics**

Mean number of words per text

In [None]:
# Average number of alphabetic tokens per document.
total = sum(map(len, docs_alpha))
print(total / len(docs_alpha))

Unique word count

In [None]:
# Distinct stems across all documents, in order of first appearance.
# dict.fromkeys de-duplicates in a single pass; checking `stem not in words`
# against a growing list rescans the list for every token.
words = list(dict.fromkeys(stem for doc in docs_stem for stem in doc))
count = len(words)
print(count)

# **Cosine Similarity**

In [None]:
!python --version

In [None]:
# NOTE: everything in this cell is wrapped in string literals, so nothing here
# is imported or installed when the cell runs; it is a disabled draft.
"""
Transformer libraries useful to using the pretrained model and data preprocessing
"""
"""import torch
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, AutoModelForSequenceClassification

"""
# Similarity search section: cosine similarity search and the Facebook AI Research library (faiss)
"""
from sklearn.metrics.pairwise import cosine_similarity
!pip install faiss-gpu # please uncomment this line when you're running the notebook for the first time
import faiss"""

In [None]:
# Disabled draft: the whole cell is one string literal, so no tokenizer or
# model is actually loaded here.
"""# Get the SciBERT pretrained model path from Allen AI repo
pretrained_model = 'allenai/scibert_scivocab_uncased'

# Get the tokenizer from the previous path
sciBERT_tokenizer = BertTokenizer.from_pretrained(pretrained_model, 
                                          do_lower_case=True)

# Get the model
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model,
                                                          output_attentions=False,
                                                          output_hidden_states=True)"""

In [None]:
# Disabled draft: the cell is a string literal and does not execute.
# NOTE(review): docs_stem is built as lists of stem strings, so it would
# presumably have to be vectorized before a similarity matrix can be computed
# from it - confirm before re-enabling.
"""from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(docs_stem)"""

# **Readability Scores**

In [None]:
!pip install textstat
!pip install py-readability-metrics
!pip install readability-lxml

In [None]:
import textstat

def calculate_flesch_reading_ease(text):
    """Return the Flesch Reading Ease score that ``textstat`` computes for ``text``."""
    score = textstat.flesch_reading_ease(text)
    return score

# Flesch Reading Ease score of every text in the sample.
reading_ease_scores = test_df['text'].apply(calculate_flesch_reading_ease)
test_df['flesch_reading_ease_score'] = reading_ease_scores

def calculate_flesch_kincaid(text):
    """Return the Flesch-Kincaid grade that ``textstat`` computes for ``text``."""
    grade = textstat.flesch_kincaid_grade(text)
    return grade

# Flesch-Kincaid grade of every text in the sample, followed by a preview.
grade_levels = test_df['text'].apply(calculate_flesch_kincaid)
test_df['flesch_kincaid_grade'] = grade_levels

print(test_df.head())

# **Perplexity Scores**



In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# The tokenizer and the language model are both loaded from the same
# pre-trained GPT-2 checkpoint.
checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)


def calculate_perplexity(text):
    """Return the GPT-2 perplexity of ``text``.

    The text is encoded with the notebook-level ``tokenizer`` and scored by the
    notebook-level ``model`` using the tokens as their own labels; perplexity
    is the exponential of the loss the model reports.

    Parameters
    ----------
    text : str
        Text to score. Texts that are too long are truncated by the tokenizer.

    Returns
    -------
    float
        The perplexity, or NaN when the text yields fewer than two tokens.
    """
    # Tokenizes the input text, but some are too long, thus truncation is activated
    input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True)

    # A next-token loss needs at least one (context, target) pair. Without this
    # guard an empty text hands the model an empty tensor, which is expected to
    # fail, and a single-token text has no pair to score.
    if input_ids.numel() < 2:
        return float("nan")

    # Forward pass through the language model; no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, labels=input_ids)

    # Compute perplexity from the loss
    return torch.exp(outputs.loss).item()

# GPT-2 perplexity of every text in the sample.
perplexity_scores = test_df['text'].apply(calculate_perplexity)
test_df['Perplexity Score'] = perplexity_scores
test_df

# **Discourse markers**
Discourse markers: words/phrases used to connect, manage, and organize ideas without changing their original meaning (e.g. however, therefore, anyway, secondly, etc) \
Source: https://www.eapfoundation.com/vocab/academic/other/dcl/ (list of 632 discourse connectors)

In [None]:
import string

# 'although' and 'though' are separate markers (they used to be fused into the
# single, never-matching entry 'althoughthough').
dcl_list = ['and','or','also','much', 'then','again', 'too', 'increasingly', 'similarly', 'further', 'namely', 'thus', 'indeed',
 'e.g.', 'specifically', 'i.e.', 'especially', 'usually', 'certainly', 'generally', 'extremely', 'mostly', 'actually', 'basically',
 'surely', 'inevitably', 'clearly', 'approximately', 'resembling', 'exactly', 'unlike', 'because', 'therefore', 'if', 'somehow',
 'hence', 'consequently', 'thereby', 'otherwise', 'since', 'however', 'while', 'although', 'though', 'instead', 'yet', 'worse',
 'except', 'whereas', 'albeit', 'admittedly', 'sometimes', 'until', 'eventually', 'constantly', 'meanwhile', 'beforehand',
 'meantime', 'overall', 'finally', 'firstly', 'secondly', 'lastly', 'conclude', 'summarize', 'altogether', 'briefly']

# Markers and text tokens are normalized the same way (lower-cased, surrounding
# punctuation stripped) so that "However," matches 'however' and "e.g.," still
# matches 'e.g.'.
dcl_set = {marker.lower().strip(string.punctuation) for marker in dcl_list}

test_df['split_text'] = test_df['text'].str.split(' ')

# Count the marker tokens of every text. str.split() without an argument
# splits on any whitespace, so words separated by newlines are seen too.
dcl_count_list = [
    sum(token.lower().strip(string.punctuation) in dcl_set for token in text.split())
    for text in test_df['text']
]

test_df['no_discourse_markers'] = dcl_count_list
test_df

# **Absolute number of personal pronouns**

In [None]:
import string

pronouns_list = ['i','you','he','him','her','she','they','them','it','we','me','us']
pronoun_set = set(pronouns_list)

test_df['split_text'] = test_df['text'].str.split(' ')

# Count the pronoun tokens of every text. Tokens are lower-cased and stripped
# of surrounding punctuation before the lookup; otherwise "I", sentence-initial
# "They" or "them," can never match the lower-case list. str.split() without an
# argument splits on any whitespace, including newlines.
pronouns_count_list = [
    sum(token.lower().strip(string.punctuation) in pronoun_set for token in text.split())
    for text in test_df['text']
]

# Display the count
test_df['no_pronouns'] = pronouns_count_list
test_df

# **Mean/stdev of words/unique words per sentence & absolute number of personal pronouns**

In [None]:
# Words are whitespace-separated tokens (newlines included).
word_counts = test_df['text'].str.split().str.len()

# A sentence is a run of text delimited by '.', '!' or '?' that contains at
# least one non-space character. This keeps the empty piece after a final
# period from being counted as a sentence. The count is floored at 1 so the
# division below is always defined.
sentence_counts = test_df['text'].str.count(r'[^.!?\s][^.!?]*').clip(lower=1)

test_df['mean_words_per_sentence'] = word_counts / sentence_counts
test_df

# **Extract grammatical errors**

In [None]:
import re

# Regular-expression heuristics for common grammatical errors. Each entry is a
# (pattern, description) pair; the description is a human-readable hint.
# NOTE(review): several patterns match every occurrence of a commonly confused
# word (any "to"/"too"/"two", any "their"/"there"/"they're", ...), whether or
# not it is used correctly, so match counts are closer to a "confusable word
# frequency" than to a number of real errors - confirm this is intended.
error_patterns = [
    (r'\b\w+(?:\'[st])\b', "Use 's or 'd instead of '"),
    (r'\bis\s+(?:\w+ing\b|\w+\s)', "Use the correct form of the verb after 'is'"),
    (r'\b(?:\w+ed|\w+en)\s+\w+', "Check for correct verb forms (past participles)"),
    (r'\b(?:a|an)\s+[aeiou]', "Use 'a' or 'an' correctly before a word"),
    (r'\b(?:too|to|two)\b', "Use the correct form of 'to'"),
    (r'\b(?:their|there|they\'re)\b', "Use the correct form of 'their', 'there', or 'they're'"),
    (r'\b(?:your|you\'re)\b', "Use the correct form of 'your' or 'you're'"),
    (r'\b(?:its|it\'s)\b', "Use the correct form of 'its' or 'it's'")
]

def detect_grammatical_errors(text, patterns=None):
    """Count how often the error patterns match in ``text``.

    Parameters
    ----------
    text : str
        Text to scan.
    patterns : iterable of (pattern, description) pairs, optional
        Regular expressions to apply. Defaults to the notebook-level
        ``error_patterns`` list, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    int
        Total number of non-overlapping matches, summed over all patterns.
    """
    if patterns is None:
        # Looked up at call time so the function follows the notebook's list.
        patterns = error_patterns
    # Only the number of matches is needed, so no list of matches is kept.
    return sum(len(re.findall(pattern, text)) for pattern, _description in patterns)

# Pattern-match count of every text in the sample, followed by a preview.
error_counts = test_df['text'].apply(detect_grammatical_errors)
test_df['grammatical_errors'] = error_counts

print(test_df.head())

# **Named Entity Recognition Count**

In [None]:
!pip install spacy
import spacy
# python -m spacy download en_core_web_sm

In [None]:
# Load SpaCy model
# NOTE(review): presumably the "en_core_web_sm" model must already be installed
# (see the commented `python -m spacy download en_core_web_sm` hint in the
# previous cell) - confirm on a fresh environment.
nlp = spacy.load("en_core_web_sm")

def count_named_entities(text, model=None):
    """Return the number of named entities recognized in ``text``.

    Every entity mention is counted. Taking the size of a per-label tally
    instead would give the number of distinct entity labels, which is not the
    number of entities.

    Parameters
    ----------
    text : str
        Text to analyze.
    model : callable, optional
        Pipeline that maps a text to an object exposing ``.ents``. Defaults to
        the notebook-level ``nlp`` pipeline, so existing one-argument calls
        keep working.

    Returns
    -------
    int
        Number of entities in the processed document's ``.ents``.
    """
    pipeline = nlp if model is None else model
    doc = pipeline(text)
    return sum(1 for _entity in doc.ents)

# Named-entity feature of every text in the sample, followed by a preview.
entity_counts = test_df['text'].apply(count_named_entities)
test_df['named_entity_counts'] = entity_counts

print(test_df.head())

In [None]:
# Write the engineered feature table to disk, then print a completion marker.
output_path = "feature_output_10k.csv"
test_df.to_csv(output_path)
print('Y')