In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Step 1: Load the dataset
# Read the training CSV into `df`. The rest of this cell reads its 'DATA'
# column as the text and its 'LABEL' column as the target.
# NOTE(review): the `mal_` filename prefix suggests Malayalam data — confirm.
df = pd.read_csv('/content/drive/MyDrive/mal_training_data_hum_ai.csv')

# Display first few rows of the dataset as a quick check that the load worked
print(df.head())

# Step 2: Preprocess the text data
import re
from nltk.corpus import stopwords

# Define a preprocessing function
def preprocess_text(text):
    """Clean one text sample before it is vectorised.

    Punctuation and symbols are removed, digit runs are removed, the text is
    lower-cased, and whitespace runs are collapsed to single spaces with the
    ends trimmed. Combining marks (Unicode categories Mn/Mc/Me, e.g. the
    vowel signs and virama of Indic scripts) are kept so that words written
    in those scripts stay intact.

    The same cleaning has to be applied to training and prediction texts.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; it may be empty.
    """
    import unicodedata  # function-scope import keeps the cell self-contained

    def _drop_unless_mark(match):
        # Called for each character that is neither a word character nor
        # whitespace. The regex word class follows str.isalnum(), which is
        # False for combining marks, so they arrive here and must be let
        # through explicitly; everything else is deleted.
        ch = match.group()
        return ch if unicodedata.category(ch).startswith('M') else ''

    # Remove unwanted characters (punctuation, special characters)
    text = re.sub(r'[^\w\s]', _drop_unless_mark, text)
    # Remove numbers (any Unicode decimal digits)
    text = re.sub(r'\d+', '', text)
    # Lower-case; only affects scripts that have letter case
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the DATA column.
# Missing texts (NaN) are replaced with '' first, as the other cells of this
# notebook do: preprocess_text hands its argument to re.sub, which raises
# TypeError on a float NaN.
df['DATA'] = df['DATA'].fillna('').apply(preprocess_text)

# Step 3: Split the dataset into training and testing sets
# The split is stratified on the label and seeded so it is repeatable.
X = df['DATA']  # Text data
y = df['LABEL']  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Convert text into numerical format using TF-IDF
# Character-level 1- and 2-grams, capped at 5000 features. The vectorizer is
# fitted on the training split only and then reused for the held-out split.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), analyzer='char')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Build and train a classification model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Step 6: Evaluate the model on the held-out split
y_pred = model.predict(X_test_tfidf)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, ClassLabel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import re

In [None]:
# Step 1: Load the dataset
def load_dataset(file_path):
    """Read the training CSV and check that it has the expected columns.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or None when reading fails or when either the
        'DATA' or the 'LABEL' column is absent. In the failure case the
        reason is printed rather than raised.
    """
    required_columns = ("DATA", "LABEL")
    try:
        frame = pd.read_csv(file_path)
        if any(column not in frame.columns for column in required_columns):
            raise ValueError("Dataset must contain 'DATA' and 'LABEL' columns.")
    except Exception as e:
        # Best-effort loader: report the problem and let the caller test for None.
        print(f"Error loading dataset: {e}")
        return None
    return frame

# Preprocessing function
def preprocess_text(text):
    """Clean one text sample before it is tokenised.

    Punctuation and symbols are removed, digit runs are removed, the text is
    lower-cased, and whitespace runs are collapsed to single spaces with the
    ends trimmed. Combining marks (Unicode categories Mn/Mc/Me, e.g. the
    vowel signs and virama of Indic scripts) are kept so that words written
    in those scripts stay intact.

    The same cleaning has to be applied to training and prediction texts.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; it may be empty.
    """
    import unicodedata  # function-scope import keeps the cell self-contained

    def _drop_unless_mark(match):
        # Called for each character that is neither a word character nor
        # whitespace. The regex word class follows str.isalnum(), which is
        # False for combining marks, so they arrive here and must be let
        # through explicitly; everything else is deleted.
        ch = match.group()
        return ch if unicodedata.category(ch).startswith('M') else ''

    # Remove unwanted characters (punctuation, special characters)
    text = re.sub(r'[^\w\s]', _drop_unless_mark, text)
    # Remove numbers (any Unicode decimal digits)
    text = re.sub(r'\d+', '', text)
    # Lower-case; only affects scripts that have letter case
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load and preprocess dataset
file_path = '/content/drive/MyDrive/tam_training_data_hum_ai.csv'
df = load_dataset(file_path)
if df is not None:
    df['DATA'] = df['DATA'].fillna('').apply(preprocess_text)  # Fill NaN with empty strings

    # Map labels to integers; anything outside the two known labels is rejected
    # up front so it cannot silently become NaN in the mapped column.
    label_mapping = {'AI': 0, 'HUMAN': 1}
    if not set(df['LABEL'].unique()).issubset(label_mapping.keys()):
        raise ValueError("Invalid labels in the dataset. Expected labels are: 'AI' and 'HUMAN'.")
    df['LABEL'] = df['LABEL'].map(label_mapping)

    # Convert to Hugging Face Dataset
    dataset = Dataset.from_pandas(df)

    # Cast LABEL column to ClassLabel (needed for stratify_by_column below)
    class_label = ClassLabel(num_classes=2, names=["AI", "HUMAN"])
    dataset = dataset.cast_column("LABEL", class_label)

    # Perform train-test split.
    # The seed makes the 80/20 stratified split repeatable, so metrics from
    # different runs are computed on the same held-out rows.
    train_test = dataset.train_test_split(test_size=0.2, stratify_by_column="LABEL", seed=42)
    train_test = DatasetDict({"train": train_test["train"], "test": train_test["test"]})

    # Step 3: Load the tokenizer and model
    model_name = "bert-base-multilingual-cased"  # Change to Indic-specific models if required
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Tokenization function
    def tokenize_function(examples):
        """Tokenise a batch of 'DATA' texts, padded/truncated to 256 tokens."""
        return tokenizer(examples["DATA"], padding="max_length", truncation=True, max_length=256)

    tokenized_datasets = train_test.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(["DATA"])  # Only remove the 'DATA' column
    tokenized_datasets = tokenized_datasets.rename_column("LABEL", "labels")
    tokenized_datasets.set_format("torch")

    # Step 4: Define metrics
    def compute_metrics(pred):
        """Return accuracy, F1, precision and recall for one evaluation pass.

        F1/precision/recall use scikit-learn's defaults, i.e. they are
        computed for the positive class (label 1, "HUMAN").
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds)
        precision = precision_score(labels, preds)
        recall = recall_score(labels, preds)
        return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

    # Step 5: Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",  # Evaluate at the end of each epoch
        save_strategy="epoch",  # Save the model at the end of each epoch
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=1,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,  # Load the best model at the end of training
        metric_for_best_model="f1",
        report_to="none"  # Disable integration with platforms like Weights & Biases or TensorBoard if not needed
    )

    # Step 6: Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Step 7: Evaluate on test set
    eval_results = trainer.evaluate()
    print("Evaluation Results:", eval_results)
else:
    print("Failed to load the dataset. Please check the file path and format.")

In [None]:
# Write the model and the tokenizer to the same directory (relative to the
# current working directory) so that a later cell can load both from one path.
# `model` and `tokenizer` are whatever objects hold those names in the kernel
# when this cell runs; the preceding training cell assigns them.
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the test dataset
def load_test_dataset(file_path):
    """Read the test spreadsheet and check that it has the expected columns.

    Args:
        file_path: Location of the Excel file.

    Returns:
        The loaded DataFrame, or None when reading fails or when either the
        'ID' or the 'DATA' column is absent. In the failure case the reason
        is printed rather than raised.
    """
    try:
        sheet = pd.read_excel(file_path)
        absent = [name for name in ("ID", "DATA") if name not in sheet.columns]
        if absent:
            raise ValueError("Test dataset must contain 'ID' and 'DATA' columns.")
    except Exception as e:
        # Best-effort loader: report the problem and let the caller test for None.
        print(f"Error loading test dataset: {e}")
        return None
    return sheet

# Preprocess the text
def preprocess_text(text):
    """Clean one text sample before it is tokenised.

    Punctuation and symbols are removed, digit runs are removed, the text is
    lower-cased, and whitespace runs are collapsed to single spaces with the
    ends trimmed. Combining marks (Unicode categories Mn/Mc/Me, e.g. the
    vowel signs and virama of Indic scripts) are kept so that words written
    in those scripts stay intact.

    The same cleaning has to be applied to training and prediction texts.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; it may be empty.
    """
    import re
    import unicodedata

    def _drop_unless_mark(match):
        # Called for each character that is neither a word character nor
        # whitespace. The regex word class follows str.isalnum(), which is
        # False for combining marks, so they arrive here and must be let
        # through explicitly; everything else is deleted.
        ch = match.group()
        return ch if unicodedata.category(ch).startswith('M') else ''

    text = re.sub(r'[^\w\s]', _drop_unless_mark, text)  # Remove special characters
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = text.lower()                  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

def predict_labels(test_file_path, model_path, tokenizer_path, batch_size=32):
    """Classify every row of a test spreadsheet and write the result to CSV.

    The sheet is loaded with load_test_dataset, its 'DATA' column is cleaned
    with preprocess_text, and the texts are run through the saved
    sequence-classification model in batches. Predicted class ids are mapped
    to "AI"/"HUMAN" and stored in a new 'PREDICTED_LABEL' column; the whole
    frame is then written to "predicted_test_results.csv" in the working
    directory.

    Args:
        test_file_path: Path of the Excel file with 'ID' and 'DATA' columns.
        model_path: Directory (or hub id) to load the model from.
        tokenizer_path: Directory (or hub id) to load the tokenizer from.
        batch_size: Number of texts per forward pass. Bounding it keeps
            memory use independent of the number of test rows.

    Returns:
        None. If the test file cannot be loaded, nothing is written.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    test_df = load_test_dataset(test_file_path)
    if test_df is None:
        return

    test_df['DATA'] = test_df['DATA'].fillna('').apply(preprocess_text)  # Handle missing data

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()  # Set model to evaluation mode

    texts = test_df['DATA'].tolist()
    predicted_ids = []
    with torch.no_grad():
        # One slice at a time instead of the whole sheet in a single tensor.
        for start in range(0, len(texts), batch_size):
            tokenized_inputs = tokenizer(
                texts[start:start + batch_size],
                padding="max_length",
                truncation=True,
                max_length=256,
                return_tensors="pt"
            )
            outputs = model(**tokenized_inputs)
            predicted_ids.extend(torch.argmax(outputs.logits, dim=-1).tolist())

    label_mapping = {0: "AI", 1: "HUMAN"}  # Adjust based on your training setup
    test_df['PREDICTED_LABEL'] = predicted_ids
    test_df['PREDICTED_LABEL'] = test_df['PREDICTED_LABEL'].map(label_mapping)

    output_file = "predicted_test_results.csv"
    test_df.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

In [None]:
# Run the saved model over the Tamil test sheet.
test_file_path = '/content/drive/MyDrive/tam_test_data_hum_ai.xlsx'
# Load from the same relative directory the save cell writes to
# ("./saved_model"). An absolute '/content/saved_model' only points at that
# directory when the working directory happens to be /content.
model_path = './saved_model'
tokenizer_path = './saved_model'
predict_labels(test_file_path, model_path, tokenizer_path)

In [None]:
import pandas as pd
import re

# Read the test spreadsheet into `test_df`
test_df = pd.read_excel('/content/drive/MyDrive/mal_test_data_hum_ai.xlsx')

# Stop early unless both columns the prediction cell relies on are present
if not {'ID', 'DATA'}.issubset(test_df.columns):
    raise ValueError("Test dataset must contain 'ID' and 'DATA' columns.")

In [None]:
# Step 2: Preprocess the text data
def preprocess_text(text):
    """Clean one text sample before it is vectorised.

    Punctuation and symbols are removed, digit runs are removed, the text is
    lower-cased, and whitespace runs are collapsed to single spaces with the
    ends trimmed. Combining marks (Unicode categories Mn/Mc/Me, e.g. the
    vowel signs and virama of Indic scripts) are kept so that words written
    in those scripts stay intact.

    The same cleaning has to be applied to training and prediction texts.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; it may be empty.
    """
    import unicodedata  # function-scope import keeps the cell self-contained

    def _drop_unless_mark(match):
        # Called for each character that is neither a word character nor
        # whitespace. The regex word class follows str.isalnum(), which is
        # False for combining marks, so they arrive here and must be let
        # through explicitly; everything else is deleted.
        ch = match.group()
        return ch if unicodedata.category(ch).startswith('M') else ''

    # Remove unwanted characters (punctuation, special characters)
    text = re.sub(r'[^\w\s]', _drop_unless_mark, text)
    # Remove numbers (any Unicode decimal digits)
    text = re.sub(r'\d+', '', text)
    # Lower-case; only affects scripts that have letter case
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to the 'DATA' column
test_df['DATA'] = test_df['DATA'].fillna('').apply(preprocess_text)  # Fill missing values with empty strings

# Step 3: Transform the test data using the trained TF-IDF vectorizer
# NOTE: this rebinds `X_test`, which the TF-IDF training cell also assigns.
X_test = test_df['DATA']

try:
    X_test_tfidf = vectorizer.transform(X_test)  # Use the TF-IDF vectorizer trained on training data
except NameError:
    raise ValueError("TF-IDF vectorizer is not defined. Ensure it's loaded from the training script.")

# Step 4: Predict labels for the test dataset
try:
    # The transformer cells of this notebook also assign `model`. If one of
    # them ran after the TF-IDF training cell, `model` has no predict()
    # method, so say so explicitly instead of failing with a bare
    # AttributeError.
    if not hasattr(model, "predict"):
        raise ValueError(
            "`model` has no predict() method, so it is not the TF-IDF classifier. "
            "Re-run the TF-IDF training cell before this one."
        )
    y_pred = model.predict(X_test_tfidf)  # Use the trained model
except NameError:
    raise ValueError("Model is not defined. Ensure it's loaded from the training script.")

# Step 5: Add predictions to the test dataset
test_df['PREDICTED_LABEL'] = y_pred

# Map predicted labels back to their original names if necessary (optional)
# Uncomment and modify if you used encoded labels during training
# label_mapping = {0: 'AI', 1: 'HUMAN'}
# test_df['PREDICTED_LABEL'] = test_df['PREDICTED_LABEL'].map(label_mapping)

# Step 6: Save predictions to a CSV file
output_file = '/content/drive/MyDrive/test_predictions_1.csv'  # Specify the output path
test_df.to_csv(output_file, index=False)
print(f"Predictions saved to {output_file}")

# Step 7: Display a few predictions (optional)
print(test_df[['ID', 'DATA', 'PREDICTED_LABEL']].head())

# Label distribution of `df`, i.e. of whichever training frame was most
# recently loaded into that name in this kernel.
print(df['LABEL'].value_counts())

In [None]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, ClassLabel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch

# Step 1: Load and preprocess the dataset
def load_and_preprocess_dataset(file_path, seed=42):
    """Load the labelled CSV and return a stratified train/test DatasetDict.

    The 'DATA' texts are cleaned, the 'LABEL' strings are mapped to integers
    ('AI' -> 0, 'HUMAN' -> 1), the frame is converted to a Hugging Face
    Dataset with a ClassLabel 'LABEL' column, and an 80/20 split stratified
    on the label is taken.

    Args:
        file_path: Location of the CSV file with 'DATA' and 'LABEL' columns.
        seed: Seed for the train/test split, so that the same rows end up in
            each split on every run.

    Returns:
        A DatasetDict with "train" and "test" entries, or None if any step
        fails. In the failure case the reason is printed rather than raised.
    """
    try:
        df = pd.read_csv(file_path)
        if "DATA" not in df.columns or "LABEL" not in df.columns:
            raise ValueError("Dataset must contain 'DATA' and 'LABEL' columns.")

        # Preprocess text data
        def preprocess_text(text):
            # Drop everything except word characters, whitespace and code
            # points in the Tamil block (U+0B80-U+0BFF). The explicit range
            # is what keeps Tamil vowel signs and the virama, which the word
            # class alone does not match. Non-Tamil letters are kept too.
            text = re.sub(r'[^\u0B80-\u0BFF\w\s]', '', text)
            text = re.sub(r'\d+', '', text)  # Remove numbers
            text = text.lower()  # Convert to lowercase
            text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
            return text

        df['DATA'] = df['DATA'].fillna('').apply(preprocess_text)

        # Map labels to integers. Unknown or missing labels are rejected
        # here; otherwise .map() would turn them into NaN and the failure
        # would surface later with an unrelated message.
        label_mapping = {'AI': 0, 'HUMAN': 1}
        unexpected = set(df['LABEL'].unique()) - set(label_mapping)
        if unexpected:
            raise ValueError("Invalid labels in the dataset. Expected labels are: 'AI' and 'HUMAN'.")
        df['LABEL'] = df['LABEL'].map(label_mapping)

        # Convert to Hugging Face Dataset
        dataset = Dataset.from_pandas(df)
        class_label = ClassLabel(num_classes=2, names=["AI", "HUMAN"])
        dataset = dataset.cast_column("LABEL", class_label)

        # Split into training and testing sets (seeded for repeatability)
        train_test = dataset.train_test_split(test_size=0.2, stratify_by_column="LABEL", seed=seed)
        return DatasetDict({"train": train_test["train"], "test": train_test["test"]})
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

# Step 2: Tokenization and Model Setup
def tokenize_function(examples):
    """Tokenise the 'DATA' texts of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens. The global
    `tokenizer` must be assigned before this function is called.
    """
    texts = examples["DATA"]
    return tokenizer(texts, max_length=256, truncation=True, padding="max_length")

# Step 3: Define Metrics for Evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1/precision/recall for one evaluation.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    scores = {"accuracy": accuracy_score(y_true, y_hat)}
    weighted_scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        scores[metric_name] = scorer(y_true, y_hat, average="weighted")
    return scores

# Step 4: Load the dataset, the tokenizer and the model
file_path = '/content/drive/MyDrive/tam_training_data_hum_ai.csv'
dataset = load_and_preprocess_dataset(file_path)
if dataset is None:
    # The loader prints the reason and returns None; stop the cell here.
    raise ValueError("Failed to load the dataset.")

model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize both splits, drop the raw text and expose the target as "labels"
tokenized_datasets = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["DATA"])
    .rename_column("LABEL", "labels")
)
tokenized_datasets.set_format("torch")

# Step 5: Training arguments
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch, keep one checkpoint, and
    # restore the checkpoint with the best "f1" when training ends
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Step 6: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Step 7: Train the model
trainer.train()

In [None]:
# Save the Model and Tokenizer
# Both are written to the same directory (relative to the current working
# directory) so that a later from_pretrained() call can load them from one path.
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

# Step 8: Evaluate the Model
# Uses the eval_dataset the Trainer was constructed with; the returned value
# is kept in `results` and printed.
results = trainer.evaluate()
print("Evaluation Results:", results)

# Step 9: Predict on New Data
def predict_labels(test_file_path, model_path, tokenizer_path, batch_size=32):
    """Classify every row of a test spreadsheet and write the result to CSV.

    The 'DATA' texts are cleaned the same way load_and_preprocess_dataset
    cleans the training texts, then run through the saved
    sequence-classification model in batches. Predicted class ids are mapped
    to "AI"/"HUMAN" and stored in a new 'PREDICTED_LABEL' column; the whole
    frame is then written to "predicted_test_results.csv" in the working
    directory.

    Args:
        test_file_path: Path of the Excel file with 'ID' and 'DATA' columns.
        model_path: Directory (or hub id) to load the model from.
        tokenizer_path: Directory (or hub id) to load the tokenizer from.
        batch_size: Number of texts per forward pass. Bounding it keeps
            memory use independent of the number of test rows.

    Returns:
        None.

    Raises:
        ValueError: If batch_size is smaller than 1, or if the sheet lacks
            the 'ID' or 'DATA' column.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    def _clean_text(text):
        # Kept local and identical to the cleaning inside
        # load_and_preprocess_dataset: that helper is nested there and not
        # reachable from here, and inference must see text prepared exactly
        # like the training text. The explicit Tamil block range
        # (U+0B80-U+0BFF) preserves vowel signs and the virama.
        text = re.sub(r'[^\u0B80-\u0BFF\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        text = text.lower()
        return re.sub(r'\s+', ' ', text).strip()

    test_df = pd.read_excel(test_file_path)
    if "ID" not in test_df.columns or "DATA" not in test_df.columns:
        raise ValueError("Test dataset must contain 'ID' and 'DATA' columns.")

    test_df['DATA'] = test_df['DATA'].fillna('').apply(_clean_text)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()

    texts = test_df['DATA'].tolist()
    predicted_ids = []
    with torch.no_grad():
        # One slice at a time instead of the whole sheet in a single tensor.
        for start in range(0, len(texts), batch_size):
            tokenized_inputs = tokenizer(
                texts[start:start + batch_size],
                padding="max_length",
                truncation=True,
                max_length=256,
                return_tensors="pt"
            )
            outputs = model(**tokenized_inputs)
            predicted_ids.extend(torch.argmax(outputs.logits, dim=-1).tolist())

    label_mapping = {0: "AI", 1: "HUMAN"}
    test_df['PREDICTED_LABEL'] = predicted_ids
    test_df['PREDICTED_LABEL'] = test_df['PREDICTED_LABEL'].map(label_mapping)

    output_file = "predicted_test_results.csv"
    test_df.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

In [None]:
# Run the saved model over the Tamil test sheet. Model and tokenizer are both
# loaded from "./saved_model", the directory the save cell writes to.
test_file_path = '/content/drive/MyDrive/tam_test_data_hum_ai.xlsx'
model_path = './saved_model'
tokenizer_path = './saved_model'
predict_labels(test_file_path, model_path, tokenizer_path)

In [None]:
from sklearn.metrics import classification_report
def compute_metrics(pred):
    """Compute evaluation metrics and print a per-class report.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall";
        the last three are weighted averages over the classes.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    precision = precision_score(labels, preds, average="weighted")
    recall = recall_score(labels, preds, average="weighted")

    # Print the classification report.
    # `labels=[0, 1]` pins the class list to the two names. Without it the
    # classes are inferred from the data, and the call raises ValueError
    # whenever only one class occurs among the labels and predictions.
    report = classification_report(labels, preds, labels=[0, 1], target_names=["AI", "HUMAN"])
    print("Classification Report:\n", report)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [None]:
from torch.utils.data import DataLoader

# Step 8: Run the evaluation set through the model by hand, collecting the
# predicted and the true class ids for the report in the next cell
trainer.model.eval()
eval_preds, eval_labels = [], []

# Reuse the Trainer's own evaluation DataLoader
eval_dataloader = trainer.get_eval_dataloader()

# Inference only, so gradient tracking is switched off for the whole pass
with torch.no_grad():
    for batch in eval_dataloader:
        # Separate the targets from the model inputs and move both to the
        # device the Trainer is configured for
        labels = batch['labels'].to(trainer.args.device)
        inputs = {name: tensor.to(trainer.args.device) for name, tensor in batch.items() if name != 'labels'}

        # Forward pass; the predicted class is the argmax over the logits
        outputs = trainer.model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        eval_preds.extend(preds.cpu().numpy())
        eval_labels.extend(labels.cpu().numpy())

In [None]:
# Print the classification report for the manually collected predictions.
from sklearn.metrics import classification_report
# `labels=[0, 1]` pins the class list to the two names. Without it the classes
# are inferred from the data, and the call raises ValueError whenever only one
# class occurs among eval_labels and eval_preds.
report = classification_report(eval_labels, eval_preds, labels=[0, 1], target_names=["AI", "HUMAN"])
print("Classification Report:\n", report)