In [1]:

print(df["gpt2_output_probability"].head(10))

print(df["gpt2_output_probability"].describe())

print(f"Non-null values: {df['gpt2_output_probability'].notna().sum()} / {len(df)}")

print(df[["text", "perplexity", "gpt2_output_probability"]].head(20))


nltk.download('stopwords', download_dir='/Users/lailaalmohaymid/nltk_data')

#uploading stopword
url = "https://raw.githubusercontent.com/mohataher/arabic-stop-words/master/list.txt"
response = requests.get(url)
arabic_stopwords = set(response.text.strip().split('\n'))

arabic_diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
stemmer = ISRIStemmer()

def remove_diacritics(text):
    return re.sub(arabic_diacritics, '', text)

def normalize_arabic(text):
    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ة', 'ه', text)
    text = re.sub(r'[^؀-ۿ ]+', ' ', text)
    return text

def preprocess_text(text):
    text = str(text)
    text = remove_diacritics(text)
    text = normalize_arabic(text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in arabic_stopwords]
    tokens = [stemmer.stem(w) for w in tokens]
    return " ".join(tokens)

df["abstract_text_clean"] = df["text"].apply(preprocess_text)
print(f"Number of stopwords: {len(arabic_stopwords)}")

print(f"\n stopwords:")
print(list(arabic_stopwords))

sample_text = df["text"].iloc[0]
clean_text = df["abstract_text_clean"].iloc[0]

print(f"\n")
print("Original text:")
print(sample_text[:200])
print(f"\n")
print("Cleaned text:")
print(clean_text[:200])

print(f"\n")
print("Statistics:")
print(f"Average original text length: {df['text'].str.len().mean():.0f} characters")
print(f"Average cleaned text length: {df['abstract_text_clean'].str.len().mean():.0f} characters")
print(f"Reduction percentage: {(1 - df['abstract_text_clean'].str.len().mean() / df['text'].str.len().mean()) * 100:.1f}%")

print(f"\nNull values in cleaned text: {df['abstract_text_clean'].isna().sum()}")
print(df['label'])

def calculate_avg_word_length(texts):
    total_length = 0
    total_words = 0

    for text in texts:
        if pd.notna(text) and str(text).strip():
            words = str(text).split()
            for word in words:
                total_length += len(word)
                total_words += 1

    return total_length / total_words if total_words > 0 else 0

def calculate_avg_sentence_length(texts):
    total_words = 0
    total_sentences = 0

    for text in texts:
        if pd.notna(text) and str(text).strip():
            sentences = str(text).split('.')
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence:
                    words = sentence.split()
                    total_words += len(words)
                    total_sentences += 1

    return total_words / total_sentences if total_sentences > 0 else 0

def calculate_type_token_ratio(texts):
    all_words = []

    for text in texts:
        if pd.notna(text) and str(text).strip():
            words = str(text).split()
            all_words.extend(words)

    if len(all_words) == 0:
        return 0

    unique_words = set(all_words)
    return len(unique_words) / len(all_words)

human_texts = df[df['label'] == 'human']['abstract_text_clean']
ai_texts = df[df['label'] == 'ai']['abstract_text_clean']

human_avg = calculate_avg_word_length(human_texts)
ai_avg = calculate_avg_word_length(ai_texts)

human_sent_len = calculate_avg_sentence_length(human_texts)
ai_sent_len = calculate_avg_sentence_length(ai_texts)

human_ttr = calculate_type_token_ratio(human_texts)
ai_ttr = calculate_type_token_ratio(ai_texts)

print(f"Human avg word length: {human_avg:.2f}")
print(f"AI avg word length: {ai_avg:.2f}")
print(f"\nHuman avg sentence length: {human_sent_len:.2f} words")
print(f"AI avg sentence length: {ai_sent_len:.2f} words")
print(f"\nHuman Type-Token Ratio: {human_ttr:.4f}")
print(f"AI Type-Token Ratio: {ai_ttr:.4f}")
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42, shuffle=True)

print("Total:", len(df))
print("Train:", len(train_df))
print("Validation:", len(val_df))
print("Test:", len(test_df))
print(df.columns)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(df.columns)

Index(['text', 'label', 'words', 'word_length_frequency'], dtype='object')


In [4]:
#Average number of S/P
def split_paragraphs(text):
    text = str(text).strip()
    paragraphs = [p.strip() for p in text.split(r'\s*\n\s*\n\s*|\s*\r\n\s*\r\n\s*') if p.strip()]
    return paragraphs if paragraphs else [text] if text else []

def split_sentences(text):
    text = str(text).strip()
    parts = re.split(r'(?<=[\.\?\!\u061F\u061B])\s+', text)
    sentences = [s.strip() for s in parts if s.strip()]
    return sentences

df["paragraphs"] = df["text"].apply(split_paragraphs)
df["sentences"] = df["text"].apply(split_sentences)

df["Total number of sentences"] = df["sentences"].apply(len)
df["Total number of paragraphs "] = df["paragraphs"].apply(len)

def compute_avg_S_per_P(row):
    S = row["Total number of sentences"]
    P = row["Total number of paragraphs "]
    return S / P if P > 0 else 0

df["Average number of S/P"] = df.apply(compute_avg_S_per_P, axis=1)


In [7]:
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

if torch.cuda.is_available():
    device = torch.device("cuda")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print("Using device:", device)
model.to(device)

def calculate_perplexity(text):
    text = "" if text is None else str(text).strip()
    if not text:
        return None

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    )
    input_ids = inputs["input_ids"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss

    return torch.exp(loss).item()

df["perplexity"] = df["text"].apply(calculate_perplexity)

Using device: mps


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


KeyboardInterrupt: 

In [22]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # إضافة هذا السطر
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)
model.eval()

def calculate_perplexity_batch(texts, batch_size=16, max_length=256):
    results = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]

        cleaned_batch = []
        batch_indices = []

        for idx, t in enumerate(batch_texts):
            if pd.notna(t) and str(t).strip():
                cleaned_batch.append(str(t))
                batch_indices.append(idx)

        if not cleaned_batch:
            results.extend([None] * len(batch_texts))
            continue

        try:
            inputs = tokenizer(
                cleaned_batch,
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
                padding=True
            )

            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            with torch.no_grad():
                batch_perplexities = []
                for j in range(len(cleaned_batch)):
                    single_input_ids = input_ids[j:j+1]
                    single_attention_mask = attention_mask[j:j+1]

                    single_output = model(
                        single_input_ids,
                        attention_mask=single_attention_mask,
                        labels=single_input_ids
                    )

                    loss = single_output.loss
                    perplexity = torch.exp(loss).item()
                    batch_perplexities.append(perplexity)

                batch_results = [None] * len(batch_texts)
                for idx, ppl in zip(batch_indices, batch_perplexities):
                    batch_results[idx] = ppl

                results.extend(batch_results)

        except Exception as e:
            print(f"Error in batch {i//batch_size + 1}: {e}")
            results.extend([None] * len(batch_texts))

    return results

perplexities = calculate_perplexity_batch(df["text"].tolist(), batch_size=16, max_length=256)
df["perplexity"] = perplexities

100%|██████████| 2283/2283 [1:03:39<00:00,  1.67s/it]   


In [24]:
import pandas as pd
import torch
from tqdm import tqdm

def calculate_average_log_prob_batch(texts, batch_size=16, max_length=256):
    results = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]

        cleaned_batch = []
        batch_indices = []

        for idx, t in enumerate(batch_texts):
            if pd.notna(t) and str(t).strip():
                cleaned_batch.append(str(t))
                batch_indices.append(idx)

        if not cleaned_batch:
            results.extend([None] * len(batch_texts))
            continue

        try:
            inputs = tokenizer(
                cleaned_batch,
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
                padding=True
            )

            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            with torch.no_grad():
                batch_probabilities = []
                for j in range(len(cleaned_batch)):
                    single_input_ids = input_ids[j:j+1]
                    single_attention_mask = attention_mask[j:j+1]

                    if single_input_ids.shape[1] <= 1:
                        batch_probabilities.append(None)
                        continue

                    single_output = model(
                        single_input_ids,
                        attention_mask=single_attention_mask,
                        labels=single_input_ids
                    )

                    average_nll = single_output.loss.item()
                    average_log_prob = -average_nll
                    average_probability = torch.exp(torch.tensor(average_log_prob)).item()
                    batch_probabilities.append(average_probability)

                batch_results = [None] * len(batch_texts)
                for idx, prob in zip(batch_indices, batch_probabilities):
                    batch_results[idx] = prob

                results.extend(batch_results)

        except Exception as e:
            print(f"Error in batch {i//batch_size + 1}: {e}")
            results.extend([None] * len(batch_texts))

    return results

probabilities = calculate_average_log_prob_batch(df["text"].tolist(), batch_size=16, max_length=256)
df["gpt2_output_probability"] = probabilities

100%|██████████| 2283/2283 [21:22<00:00,  1.78it/s]  


In [30]:
import nltk
nltk.download('stopwords', download_dir='/Users/lailaalmohaymid/nltk_data')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>


False

In [41]:
print(f"Number of stopwords: {len(arabic_stopwords)}")

print(f"\n stopwords:")
print(list(arabic_stopwords))

sample_text = df["text"].iloc[0]
clean_text = df["abstract_text_clean"].iloc[0]

print(f"\n")
print("Original text:")
print(sample_text[:200])
print(f"\n")
print("Cleaned text:")
print(clean_text[:200])

print(f"\n")
print("Statistics:")
print(f"Average original text length: {df['text'].str.len().mean():.0f} characters")
print(f"Average cleaned text length: {df['abstract_text_clean'].str.len().mean():.0f} characters")
print(f"Reduction percentage: {(1 - df['abstract_text_clean'].str.len().mean() / df['text'].str.len().mean()) * 100:.1f}%")

print(f"\nNull values in cleaned text: {df['abstract_text_clean'].isna().sum()}")

Number of stopwords: 799

 stopwords:
['ألا', 'خمسون', 'لهم', 'ليت', 'نحن', 'شين', 'أفريل', 'ذهب', 'واكد', 'أربعمئة', 'ضمن', 'لديه', 'أرى', 'أوشك', 'سبحان', 'خ', 'نفسها', 'قاطبة', 'كلَّا', 'نوفمبر', 'يمين', 'تسعين', 'فرادى', 'ماانفك', 'ذات', 'مذ', 'ثمان', 'وهذا', 'اى', 'تفعلين', 'أطعم', 'تم', 'دواليك', 'اربعين', 'اجل', 'يوم', 'ينبغي', 'لمّا', 'أصبح', 'هنا', 'مئتان', 'كانت', 'هى', 'واحد', 'اثر', 'إما', 'أنّى', 'اكد', 'عدة', 'سوى', 'ثمانون', 'إذ', 'بلى', 'وهي', 'ثلاثة', 'ث', 'لكنه', 'ص', 'هيا', 'الألاء', 'عدَّ', 'لام', 'ا?', 'فاء', 'زاي', 'لو', 'بات', 'أنتن', 'تفعلون', 'أعلم', 'ئ', 'كاد', 'ثمّ', 'آناء', 'فيفري', 'اذا', 'كان', 'ين', 'الاولى', 'أين', 'ه', 'ثمّة', 'الذى', 'ثلاث', 'سبعة', 'سبعمئة', 'قد', 'قوة', 'إذما', 'انبرى', 'حتى', 'إياكم', 'كلّما', 'س', 'ساء', 'لدى', 'أل', 'الذي', 'هي', 'أبو', 'التي', 'أربعمائة', 'أمامكَ', 'أيّان', 'هَاتانِ', 'تعلَّم', 'نبَّا', 'وفي', 'انه', 'بغتة', 'ك', 'مثل', 'عين', 'رابع', 'كثيرا', 'الف', 'نفسي', 'وبين', 'آمينَ', 'م', 'امسى', 'حاي', 'حَذارِ', 'بيد', '

In [44]:

def calculate_avg_word_length(texts):
    total_length = 0
    total_words = 0

    for text in texts:
        if pd.notna(text) and str(text).strip():
            words = str(text).split()
            for word in words:
                total_length += len(word)
                total_words += 1

    return total_length / total_words if total_words > 0 else 0

def calculate_avg_sentence_length(texts):
    total_words = 0
    total_sentences = 0

    for text in texts:
        if pd.notna(text) and str(text).strip():
            sentences = str(text).split('.')
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence:
                    words = sentence.split()
                    total_words += len(words)
                    total_sentences += 1

    return total_words / total_sentences if total_sentences > 0 else 0

def calculate_type_token_ratio(texts):
    all_words = []

    for text in texts:
        if pd.notna(text) and str(text).strip():
            words = str(text).split()
            all_words.extend(words)

    if len(all_words) == 0:
        return 0

    unique_words = set(all_words)
    return len(unique_words) / len(all_words)

human_texts = df[df['label'] == 'human']['abstract_text_clean']
ai_texts = df[df['label'] == 'ai']['abstract_text_clean']

human_avg = calculate_avg_word_length(human_texts)
ai_avg = calculate_avg_word_length(ai_texts)

human_sent_len = calculate_avg_sentence_length(human_texts)
ai_sent_len = calculate_avg_sentence_length(ai_texts)

human_ttr = calculate_type_token_ratio(human_texts)
ai_ttr = calculate_type_token_ratio(ai_texts)

print(f"Human avg word length: {human_avg:.2f}")
print(f"AI avg word length: {ai_avg:.2f}")
print(f"\nHuman avg sentence length: {human_sent_len:.2f} words")
print(f"AI avg sentence length: {ai_sent_len:.2f} words")
print(f"\nHuman Type-Token Ratio: {human_ttr:.4f}")
print(f"AI Type-Token Ratio: {ai_ttr:.4f}")

Human avg word length: 3.35
AI avg word length: 3.32

Human avg sentence length: 87.24 words
AI avg sentence length: 77.45 words

Human Type-Token Ratio: 0.0610
AI Type-Token Ratio: 0.0082


In [47]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42, shuffle=True)

print("Total:", len(df))
print("Train:", len(train_df))
print("Validation:", len(val_df))
print("Test:", len(test_df))

TOTAL: 36525
Train: 25567
Validation: 5479
Test: 5479
