In [1]:
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Load the datasets
# All three CSV files are read from one folder; change DATA_DIR to point at your copy.
DATA_DIR = Path('C:\\Users\\DHARANI SINDHU\\OneDrive\\Documents\\Codelab-docs-1')  # Update with your folder path

train_df = pd.read_csv(DATA_DIR / 'AWT_train.csv')
dev_df = pd.read_csv(DATA_DIR / 'AWT_dev.csv')
test_df = pd.read_csv(DATA_DIR / 'AWT_test_without_labels.csv')

# Preprocess training and development data: keep only rows that have both
# text and a recognised label, and encode the label as an integer id.
LABEL_TO_ID = {'Abusive': 1, 'Non-Abusive': 0}


def _prepare_labeled(frame, split_name):
    """Return a cleaned copy of a labelled split with integer class ids.

    Rows with a missing 'Text' or 'Class' value are removed, the string
    labels are encoded through LABEL_TO_ID, and rows whose label is not a
    key of LABEL_TO_ID (which map to NaN) are removed as well; a notice is
    printed when that happens.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split containing 'Text' and 'Class' columns.
    split_name : str
        Name used in the printed notice, e.g. 'train' or 'dev'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.
    """
    cleaned = (
        frame
        .dropna(subset=['Text', 'Class'])
        .assign(Class=lambda df: df['Class'].map(LABEL_TO_ID))
    )

    # Labels outside LABEL_TO_ID come back from map() as NaN.
    if cleaned['Class'].isnull().sum() > 0:
        print(f"NaN values found in 'Class' column of {split_name} dataset. Removing rows with NaN labels.")
        cleaned = cleaned.dropna(subset=['Class'])

    # map() leaves the column as float64 whenever a label was unmapped; cast
    # back to int so labels (and the model's predictions) are 0/1, not 0.0/1.0.
    return cleaned.astype({'Class': int})


train_df = _prepare_labeled(train_df, 'train')
# NOTE(review): dev_df is cleaned here but nothing later in this cell reads it;
# validation below uses a hold-out split of train_df instead.
dev_df = _prepare_labeled(dev_df, 'dev')

# Prepare the data for TF-IDF vectorization
X_all = train_df['Text']
y_all = train_df['Class']

# Split training data into train and validation sets (80% train, 20% validation).
# The full series keep their own names so the split does not overwrite its inputs.
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Vectorize text data using TF-IDF. The vocabulary and IDF weights are fitted on
# the training split only; validation and test text are transformed with them.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Unlike train/dev, test rows with missing Text are never dropped, and the
# vectorizer rejects NaN documents. Replace missing text with an empty string
# rather than dropping rows, so the predictions stay aligned with test_df.
X_test_tfidf = tfidf_vectorizer.transform(test_df['Text'].fillna(''))

# Fit the logistic-regression classifier on the TF-IDF training matrix
# (fit() returns the estimator itself, so construction and fitting are chained).
log_reg_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)

# Score the model on the held-out validation split
val_predictions = log_reg_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Per-class precision / recall / F1; target_names are listed in ascending
# label order, i.e. label 0 first and label 1 second.
print("Validation Set Classification Report:")
print(
    classification_report(
        y_true=y_val,
        y_pred=val_predictions,
        target_names=['Non-Abusive', 'Abusive'],
    )
)

# Predict on test data
test_predictions = log_reg_model.predict(X_test_tfidf)

# Attach the predictions to the test data as readable class names.
# The column is written once, already decoded, instead of being written as
# numeric ids and then overwritten.
id_to_label = {0: 'Non-Abusive', 1: 'Abusive'}
test_df['predicted_class'] = pd.Series(test_predictions, index=test_df.index).map(id_to_label)

# Save the test predictions to a CSV file
output_file_path = 'C:\\Users\\DHARANI SINDHU\\OneDrive\\Documents\\Codelab\\ATRFC001_Predictions.csv'
# to_csv does not create missing folders, and this folder differs from the one
# the inputs are read from, so make sure it exists before writing.
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to '{output_file_path}'.")


NaN values found in 'Class' column of train dataset. Removing rows with NaN labels.
Validation Accuracy: 0.6810
Validation Set Classification Report:
              precision    recall  f1-score   support

 Non-Abusive       0.67      0.72      0.70       281
     Abusive       0.69      0.64      0.67       277

    accuracy                           0.68       558
   macro avg       0.68      0.68      0.68       558
weighted avg       0.68      0.68      0.68       558

Predictions saved to 'C:\Users\DHARANI SINDHU\OneDrive\Documents\Codelab\ATRFC001_Predictions.csv'.
