In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

# --- Load the dataset -------------------------------------------------------
# Read the PCA-transformed table from disk. The 'label' column is the target;
# every other column is passed to the model as a feature.
data_pca = pd.read_csv('pca_transformed_data.csv')

y = data_pca['label']
X = data_pca.drop(labels='label', axis='columns')
print("PCA-transformed data loaded successfully.")

# --- Hold out a test set ----------------------------------------------------
# 20% of the rows are reserved for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Fit the model ----------------------------------------------------------
# class_weight='balanced' reweights classes inversely to their frequency in
# the training labels; random_state pins the forest's randomness.
clf = RandomForestClassifier(random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)
print("Model training completed.")

# --- Score on the held-out rows ---------------------------------------------
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# target_names are display names applied to the labels in sorted order.
# NOTE(review): this assumes the smaller label value means human-written and
# the larger means AI-generated -- confirm against the dataset's encoding.
print("Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['Human-written', 'AI-generated'],
    )
)

# --- Persist the fitted model -----------------------------------------------
# The file is written with joblib (pickle-based), so it should only ever be
# loaded back from a trusted location.
joblib.dump(clf, 'random_forest_trained_model.pkl')
print("Trained Random Forest model saved as 'random_forest_trained_model.pkl'.")


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

# --- Load the dataset (top 2 principal components) --------------------------
# Read the reduced PCA table from disk. Only the 'PC1' and 'PC2' columns are
# used as features; 'label' is the target.
data_pca_top2 = pd.read_csv('pca_top2_transformed_data.csv')

y = data_pca_top2['label']
X = data_pca_top2.loc[:, ['PC1', 'PC2']]

print("PCA-transformed data (top 2 components) loaded successfully.")

# --- Hold out a test set ----------------------------------------------------
# 20% of the rows are reserved for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Fit the model ----------------------------------------------------------
# class_weight='balanced' reweights classes inversely to their frequency in
# the training labels; random_state pins the forest's randomness.
clf = RandomForestClassifier(random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)
print("Model training completed.")

# --- Score on the held-out rows ---------------------------------------------
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy with Top 2 PCA Components: {accuracy:.4f}")

# target_names are display names applied to the labels in sorted order.
# NOTE(review): this assumes the smaller label value means human-written and
# the larger means AI-generated -- confirm against the dataset's encoding.
print("Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['Human-written', 'AI-generated'],
    )
)

# --- Persist the fitted model -----------------------------------------------
# The file is written with joblib (pickle-based), so it should only ever be
# loaded back from a trusted location.
joblib.dump(clf, 'random_forest_top2_model.pkl')
print("Trained Random Forest model (top 2 PCA components) saved as 'random_forest_top2_model.pkl'.")
