In [None]:
import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
from datasets import load_dataset

# Step 1: fetch the "train" split of the corpus from the Hugging Face Hub and
# materialize it as a pandas DataFrame for the rest of the notebook.
dataset = datasets.load_dataset(
    "browndw/human-ai-parallel-corpus-biber",
    split="train",
)
df = pd.DataFrame(data=dataset)

In [None]:
# Preview the leading rows of `doc_id` (selected as a one-column frame).
doc_id_preview = df[['doc_id']].head()
print(doc_id_preview)

In [None]:
# Step 2: Preprocess the dataset
def _parse_doc_id(doc_id):
    """Split a `doc_id` into its (source, genre) parts.

    The source is the text after the last "@"; the genre is the text before
    the first "_" within the part that precedes the first "@".

    Returns:
        tuple[str, str]: (source, genre). Non-string ids (e.g. NaN) yield
        ("unknown", "unknown") instead of raising AttributeError.
    """
    if not isinstance(doc_id, str):
        return "unknown", "unknown"
    pieces = doc_id.split("@")
    return pieces[-1], pieces[0].split("_")[0]


# Both derived columns go through the same guarded parser, so a non-string
# `doc_id` can no longer crash the `source` extraction while `genre` tolerates it.
df['source'] = df['doc_id'].apply(lambda x: _parse_doc_id(x)[0])
df['genre'] = df['doc_id'].apply(lambda x: _parse_doc_id(x)[1])

In [None]:
# Show the frame's column labels.
column_index = df.columns
print(column_index)

In [None]:
# Distinct values found in the derived `source` column.
source_column = df['source']
unique_sources = source_column.unique()
print(unique_sources)

In [None]:
# Distinct values found in the derived `genre` column.
genre_column = df['genre']
unique_genres = genre_column.unique()
print(unique_genres)

In [None]:
# Model inputs are the columns whose name carries the 'f_' prefix.
feature_cols = list(filter(lambda name: name.startswith('f_'), df.columns))

In [None]:
# Heading on one line, the list of selected column names on the next.
print("Selected features:", feature_cols, sep="\n")

In [None]:
# Print pandas' built-in summary of the frame.
df.info()

In [None]:
# Row-wise boolean mask: True where at least one feature column is null.
feature_nulls = df[feature_cols].isna()
missing_values = feature_nulls.any(axis="columns")
print(missing_values)

In [None]:
# Number of rows flagged by the mask, i.e. rows with at least one NaN feature.
rows_with_nan = missing_values.sum()
print(rows_with_nan)

In [None]:
# Optional cleanup, currently disabled: drop rows with missing feature values.
# df = df.dropna(subset=feature_cols)

In [None]:
# Exclude every row whose source is 'chunk_1'; take a copy so the new column
# added below is written to an independent frame.
keep_mask = df['source'] != "chunk_1"
df_filtered = df.loc[keep_mask].copy()

# Build the categorical view once, after filtering, so both the integer codes
# and the code -> name lookup exclude `chunk_1`.
source_categories = df_filtered['source'].astype('category')
df_filtered['source_encoded'] = source_categories.cat.codes
source_mapping = {
    code: name for code, name in enumerate(source_categories.cat.categories)
}

# Model inputs (feature columns) and the integer-encoded target.
X = df_filtered[feature_cols]
y = df_filtered['source_encoded']

In [None]:
# Hold out 30% of the rows for testing; stratifying on the encoded label keeps
# the class proportions the same in both splits.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Baseline forest: 500 trees, 8 candidate features per split, fixed seed.
baseline_params = {
    "n_estimators": 500,
    "max_features": 8,
    "random_state": 42,
}
rf = RandomForestClassifier(**baseline_params)
rf.fit(X_train, y_train)

In [None]:
# Accuracy of the current `rf` on the data it was fit on vs. the held-out split.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Make predictions
y_pred = rf.predict(X_test)

# Convert numeric codes back to text labels. The predictions are wrapped in a
# Series that reuses y_test's index: a bare pd.Series(y_pred) would get a fresh
# 0..n-1 index, so any index-aligned pandas operation between the two label
# Series (comparison, concat, boolean masking) would misalign rows or fail.
y_test_labels = y_test.map(source_mapping)  # Actual labels
y_pred_labels = pd.Series(y_pred, index=y_test.index).map(source_mapping)  # Predicted labels

# Print classification report
print(classification_report(y_test_labels, y_pred_labels))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Fix the class order once and pass it to confusion_matrix, so the matrix
# rows/columns and the heatmap tick labels are guaranteed to describe the same
# classes even if a class appears only in the truth or only in the predictions.
labels = sorted(set(y_test_labels) | set(y_pred_labels))
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=labels)

plt.figure(figsize=(10, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# Rank the feature columns by the fitted forest's `feature_importances_`.
feature_importance = (
    pd.DataFrame({'Feature': feature_cols, 'Importance': rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
top_ten = feature_importance[:10]

# Horizontal bar chart of the ten highest-ranked features.
plt.figure(figsize=(10, 6))
sns.barplot(data=top_ten, x='Importance', y='Feature', palette="Blues_r")
plt.title("Top 10 Important Features")
plt.xlabel("Feature Importance Score")
plt.ylabel("Top Features")
plt.show()

In [None]:
# Larger forest with a depth cap: 1000 trees, 10 features per split, depth <= 20.
deep_forest_params = {
    "n_estimators": 1000,
    "max_features": 10,
    "max_depth": 20,
    "random_state": 42,
}
rf = RandomForestClassifier(**deep_forest_params)
rf.fit(X_train, y_train)

In [None]:
# Train vs. held-out accuracy for the forest fit in the preceding cell.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Regularized forest: fewer trees, capped depth, sqrt feature sampling, and
# minimum sample counts for splitting (5) and for leaves (3).
regularized_params = dict(
    n_estimators=300,
    max_depth=15,
    max_features="sqrt",
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
)
rf = RandomForestClassifier(**regularized_params)
rf.fit(X_train, y_train)

In [None]:
# Train vs. held-out accuracy for the forest fit in the preceding cell.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Same depth cap as before, with stricter split (10) and leaf (5) minimums
# and 200 trees.
conservative_params = dict(
    n_estimators=200,
    max_depth=15,
    max_features="sqrt",
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
rf = RandomForestClassifier(**conservative_params)
rf.fit(X_train, y_train)

In [None]:
# Train vs. held-out accuracy for the forest fit in the preceding cell.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Raise the leaf minimum to 10 samples; all other settings as in the prior run.
smooth_leaf_params = dict(
    n_estimators=200,
    max_depth=15,
    max_features="sqrt",
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=42,
)
rf = RandomForestClassifier(**smooth_leaf_params)
rf.fit(X_train, y_train)

In [None]:
# Train vs. held-out accuracy for the forest fit in the preceding cell.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Lower the depth cap from 15 to 12 while keeping the 10/10 split/leaf minimums.
shallow_params = dict(
    n_estimators=200,
    max_depth=12,
    max_features="sqrt",
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=42,
)
rf = RandomForestClassifier(**shallow_params)
rf.fit(X_train, y_train)

In [None]:
# Train vs. held-out accuracy for the forest fit in the preceding cell.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Depth 12 with the leaf minimum relaxed back to 5 samples.
finer_leaf_params = dict(
    n_estimators=200,
    max_depth=12,
    max_features="sqrt",
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
rf = RandomForestClassifier(**finer_leaf_params)
rf.fit(X_train, y_train)

In [None]:
# Train vs. held-out accuracy for the forest fit in the preceding cell.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Same tree constraints as the previous run, with 300 trees instead of 200.
more_trees_params = dict(
    n_estimators=300,
    max_depth=12,
    max_features="sqrt",
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
rf = RandomForestClassifier(**more_trees_params)
rf.fit(X_train, y_train)

In [None]:
# Train vs. held-out accuracy for the forest fit in the preceding cell.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Previous configuration plus class_weight="balanced".
balanced_params = dict(
    n_estimators=300,
    max_depth=12,
    max_features="sqrt",
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=42,
)
rf = RandomForestClassifier(**balanced_params)
rf.fit(X_train, y_train)

In [None]:
# Train vs. held-out accuracy for the forest fit in the preceding cell.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Features whose importance is not strictly above this cut-off are dropped.
IMPORTANCE_THRESHOLD = 0.005

# Rank all feature columns by the importances of the currently fitted forest.
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

print(feature_importance.head(15))  # Check top 15 important features

# Drop low-importance features
important_features = feature_importance[feature_importance['Importance'] > IMPORTANCE_THRESHOLD]["Feature"]
X_train_selected = X_train[important_features]
X_test_selected = X_test[important_features]

# `rf` is deliberately NOT refit on the reduced matrix here. Doing so left `rf`
# with fewer importances than `feature_cols`, so re-running this cell raised a
# length-mismatch error; the model for the selected features is trained in the
# following cell on a freshly constructed estimator.

In [None]:
# Restrict both splits to the columns that passed the importance cut-off.
X_train_selected = X_train[important_features]
X_test_selected = X_test[important_features]

# Fresh forest trained only on the retained columns.
selected_params = dict(
    n_estimators=200,
    max_depth=12,
    max_features="sqrt",
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
rf = RandomForestClassifier(**selected_params)
rf.fit(X_train_selected, y_train)

# Held-out accuracy on the same reduced column set.
test_accuracy = rf.score(X_test_selected, y_test)
print(f"Test Accuracy (After Feature Selection): {test_accuracy:.4f}")

In [None]:
# Selected-feature forest with the depth cap lowered to 10.
selected_shallow_params = dict(
    n_estimators=200,
    max_depth=10,
    max_features="sqrt",
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
rf = RandomForestClassifier(**selected_shallow_params)
rf.fit(X_train_selected, y_train)

In [None]:
# Train vs. held-out accuracy, both measured on the reduced column set.
train_acc = rf.score(X_train_selected, y_train)
test_acc = rf.score(X_test_selected, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
from sklearn.model_selection import cross_val_score

# Unfitted forest handed to cross_val_score, which fits it once per fold.
cv_forest_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features="sqrt",
    random_state=42,
)
rf = RandomForestClassifier(**cv_forest_params)

# 5-fold cross-validation on the training split with the selected features.
cv_scores = cross_val_score(rf, X_train_selected, y_train, cv=5)
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std()
print(f"Cross-validation Accuracy: {cv_mean:.4f} ± {cv_spread:.4f}")

In [None]:
# Back to the full feature set with the depth cap at 15.
all_features_params = dict(
    n_estimators=200,
    max_depth=15,
    max_features="sqrt",
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
rf = RandomForestClassifier(**all_features_params)
rf.fit(X_train, y_train)

# Held-out accuracy for this configuration.
test_accuracy = rf.score(X_test, y_test)
print(f"Test Accuracy (All Features, max_depth=15): {test_accuracy:.4f}")

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted baseline: 200 rounds, depth 10, learning rate 0.05, with
# 80% row subsampling and 80% column subsampling per tree.
xgb_params = dict(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Held-out accuracy of the boosted model.
test_accuracy = xgb.score(X_test, y_test)
print(f"XGBoost Test Accuracy: {test_accuracy:.4f}")

In [None]:
import tensorflow as tf
from tensorflow import keras

# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable, in line with the random_state=42 used for the
# other models in this notebook.
keras.utils.set_random_seed(42)

# Define a simple MLP model. The input width is declared with keras.Input
# rather than `input_shape=` on the first Dense layer; the output layer has one
# softmax unit per distinct encoded class in the training labels.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(y_train.nunique(), activation="softmax")  # Multi-class classification
])

# Compile model (integer class labels -> sparse categorical cross-entropy)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train model
# NOTE(review): the held-out test split doubles as the validation set here, so
# the per-epoch validation metrics are not an independent estimate if they are
# ever used to choose epochs or architecture.
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search. `param_grid` was referenced but never
# defined anywhere in the notebook, so this cell raised NameError on a fresh
# kernel. NOTE(review): the original grid is not recoverable from the notebook;
# these candidates are chosen to cover the parameter names and values that the
# later "optimized" XGBoost cell hard-codes — adjust as needed.
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
}

random_search = RandomizedSearchCV(
    XGBClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=10,  # Randomly test 10 different combinations
    cv=3,  # Reduce cross-validation folds
    scoring="accuracy",
    n_jobs=-1,
    random_state=42
)

# Fit on the training split only; the test split stays untouched by the search.
random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)

In [None]:
from xgboost import XGBClassifier

# XGBoost configured with hard-coded hyperparameters (described in the notebook
# as the best parameters from RandomizedSearchCV).
optimized_params = dict(
    subsample=0.8,
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_optimized = XGBClassifier(**optimized_params)
xgb_optimized.fit(X_train, y_train)

# Held-out accuracy of the tuned model.
test_accuracy = xgb_optimized.score(X_test, y_test)
print(f"Optimized XGBoost Test Accuracy: {test_accuracy:.4f}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Make predictions
y_pred = xgb_optimized.predict(X_test)

# Convert numeric codes back to text labels. The prediction Series reuses
# y_test's index so the two label Series stay row-aligned for any pandas
# operation; a bare pd.Series(y_pred) would carry an unrelated 0..n-1 index.
y_test_labels = y_test.map(source_mapping)
y_pred_labels = pd.Series(y_pred, index=y_test.index).map(source_mapping)

# Print classification report
print(classification_report(y_test_labels, y_pred_labels))

# Compute confusion matrix with an explicit class order, so the matrix axes
# and the tick labels are guaranteed to describe the same classes even if a
# class appears only in the truth or only in the predictions.
unique_labels = sorted(set(y_test_labels) | set(y_pred_labels))
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=unique_labels)

# Plot Confusion Matrix
plt.figure(figsize=(10, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Optimized XGBoost Confusion Matrix")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each training column with the tuned XGBoost model's importance score
# and order them from most to least important.
importance_table = {
    "Feature": X_train.columns,
    "Importance": xgb_optimized.feature_importances_,
}
feature_importance = pd.DataFrame(importance_table).sort_values(
    by="Importance", ascending=False
)
top_ten = feature_importance[:10]

# Bar chart of the ten highest-ranked features.
plt.figure(figsize=(10, 6))
sns.barplot(data=top_ten, x="Importance", y="Feature")
plt.title("Top 10 Important Features (XGBoost)")
plt.xlabel("Feature Importance Score")
plt.ylabel("Top Features")
plt.show()

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier

# Base learners: an XGBoost model and a random forest, each with a fixed seed.
base_learners = [
    ('xgb', XGBClassifier(subsample=0.8, n_estimators=300, max_depth=8, learning_rate=0.1, colsample_bytree=0.8, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)),
]
# Meta-model that combines the base learners' outputs.
meta_learner = XGBClassifier(n_estimators=100, random_state=42)

stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=3,
)
stacking_model.fit(X_train, y_train)

# Held-out accuracy of the stacked ensemble.
test_accuracy_stacking = stacking_model.score(X_test, y_test)
print(f"Stacking Model Test Accuracy: {test_accuracy_stacking:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Generate confusion matrix with an explicit class order, so the matrix axes
# and the tick labels are guaranteed to describe the same classes even if a
# class appears only in the truth or only in the predictions.
# NOTE(review): this uses `y_test_labels` / `y_pred_labels` as last assigned
# earlier in the notebook; it does not call `stacking_model.predict`, so it is
# not the stacking model's confusion matrix — confirm that is intended.
labels = sorted(set(y_test_labels) | set(y_pred_labels))
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=labels)

# Plot Confusion Matrix
plt.figure(figsize=(10, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# Row-normalize the confusion matrix so each row (actual class) sums to 1.
# Rows with no samples are left as zeros instead of dividing by zero, which
# would otherwise emit a RuntimeWarning and fill the heatmap with NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype("float"),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure(figsize=(10, 6))
sns.heatmap(cm_normalized, annot=True, fmt=".2f", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Normalized Confusion Matrix")
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Rebuild the ranking of training columns by the tuned XGBoost model's
# importance scores, highest first.
ranked_columns = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": xgb_optimized.feature_importances_}
)
feature_importance = ranked_columns.sort_values(by="Importance", ascending=False)
leading_rows = feature_importance[:10]

# Bar chart limited to the first ten rows of the ranking.
plt.figure(figsize=(10, 6))
sns.barplot(data=leading_rows, x="Importance", y="Feature")
plt.ylabel("Top Features")
plt.xlabel("Feature Importance Score")
plt.title("Top 10 Important Features (XGBoost)")
plt.show()

In [None]:
import shap

# Build an explainer around the tuned XGBoost model and compute SHAP values
# for the held-out feature matrix.
explainer = shap.Explainer(xgb_optimized)
shap_values = explainer(X_test)

# Summary plot of those values against the same feature matrix.
shap.summary_plot(shap_values, features=X_test)

In [None]:
# shap.plots.beeswarm only accepts a 2-D (samples x features) Explanation and
# raises on anything with an extra dimension. For a multi-class model the
# Explanation presumably carries a third, per-class axis, so draw one beeswarm
# per class and title it with the decoded class name.
if len(shap_values.shape) == 3:
    for class_idx in range(shap_values.shape[2]):
        shap.plots.beeswarm(shap_values[:, :, class_idx], show=False)
        plt.title(f"SHAP beeswarm: {source_mapping.get(class_idx, class_idx)}")
        plt.show()
else:
    # Single-output explanation: plot directly, as before.
    shap.plots.beeswarm(shap_values)

In [None]:
import numpy as np

# Share of test predictions assigned to each class (fractions summing to 1).
predicted_labels = pd.Series(y_pred_labels)
predicted_class_counts = predicted_labels.value_counts(normalize=True)

# One bar per predicted class, in the order returned by value_counts.
plt.figure(figsize=(10, 6))
sns.barplot(x=predicted_class_counts.index, y=predicted_class_counts.values)
plt.title("Prediction Distribution Across LLMs & Humans")
plt.xlabel("Predicted Class")
plt.ylabel("Proportion")
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Project the held-out features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_test)

# Tabulate the projection and attach the true source label of each row
# (by position, via `.values`).
pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"]).assign(
    Source=y_test_labels.values
)

# Scatter of the two components, colored by source.
plt.figure(figsize=(10, 6))
sns.scatterplot(x="PC1", y="PC2", hue="Source", data=pca_df, palette="tab10", alpha=0.7)
plt.title("PCA: Classification Separation by Source")
plt.legend(bbox_to_anchor=(1, 1))
plt.show()

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Fit a 5-component PCA on the held-out features and report the share of
# variance captured by each component.
pca = PCA(n_components=5).fit(X_test)
variance_shares = pca.explained_variance_ratio_
print(variance_shares)

In [None]:
import matplotlib.pyplot as plt

# Scree plot: variance explained by each principal component of the fitted
# `pca`. The x positions are derived from the number of fitted components
# rather than hard-coded to 5, because `pca` is reassigned in several cells
# with different `n_components`; a fixed range would raise a shape mismatch
# whenever the current `pca` has a different number of components.
component_numbers = range(1, len(pca.explained_variance_ratio_) + 1)

plt.figure(figsize=(8,5))
plt.plot(component_numbers, pca.explained_variance_ratio_, marker='o', linestyle='--')
plt.xlabel("Principal Component")
plt.ylabel("Variance Explained")
plt.title("PCA Scree Plot")
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Project the held-out features onto three principal components.
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X_test)
pca_df_3d = pd.DataFrame(X_pca_3d, columns=["PC1", "PC2", "PC3"]).assign(
    Source=y_test_labels.values
)

# Integer color code per point, taken from the encoded test labels.
class_codes = y_test.astype("category").cat.codes

# 3-D scatter of the three components, colored by class code.
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    pca_df_3d["PC1"],
    pca_df_3d["PC2"],
    pca_df_3d["PC3"],
    c=class_codes,
    cmap="tab10",
    alpha=0.7,
)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
plt.title("3D PCA Classification Separation")
plt.show()

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the held-out features (perplexity 30, fixed seed).
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_test)

# Tabulate the embedding and attach the true source label of each row
# (by position, via `.values`).
tsne_df = pd.DataFrame(X_tsne, columns=["t-SNE1", "t-SNE2"]).assign(
    Source=y_test_labels.values
)

# Scatter of the embedding, colored by source.
plt.figure(figsize=(10,6))
sns.scatterplot(x="t-SNE1", y="t-SNE2", hue="Source", data=tsne_df, palette="tab10", alpha=0.7)
plt.title("t-SNE: Classification Separation by Source")
plt.legend(bbox_to_anchor=(1,1))
plt.show()