In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the datasets
train_df = pd.read_csv('C:\\Users\\DHARANI SINDHU\\OneDrive\\Documents\\Codelab-docs-1\\AWT_train.csv')  # Update with your file path
dev_df = pd.read_csv('C:\\Users\\DHARANI SINDHU\\OneDrive\\Documents\\Codelab-docs-1\\AWT_dev.csv')  # Update with your file path
test_df = pd.read_csv('C:\\Users\\DHARANI SINDHU\\OneDrive\\Documents\\Codelab-docs-1\\AWT_test_without_labels.csv')  # Update with your file path

# Preprocess training and development data: Remove rows with missing values
train_df.dropna(subset=['Text', 'Class'], inplace=True)  # Remove rows with missing values in Text or Class columns
dev_df.dropna(subset=['Text', 'Class'], inplace=True)    # Same for dev dataset

# Ensure labels ('Class') are either 1 for 'Abusive' or 0 for 'Non-Abusive'
train_df['Class'] = train_df['Class'].map({'Abusive': 1, 'Non-Abusive': 0})
dev_df['Class'] = dev_df['Class'].map({'Abusive': 1, 'Non-Abusive': 0})

# Check if there are any NaN values in target variable 'Class'
if train_df['Class'].isnull().sum() > 0:
    print("NaN values found in 'Class' column of train dataset. Removing rows with NaN labels.")
    train_df.dropna(subset=['Class'], inplace=True)

if dev_df['Class'].isnull().sum() > 0:
    print("NaN values found in 'Class' column of dev dataset. Removing rows with NaN labels.")
    dev_df.dropna(subset=['Class'], inplace=True)

# Prepare the data for TF-IDF vectorization
X_train = train_df['Text']
y_train = train_df['Class']
X_dev = dev_df['Text']
y_dev = dev_df['Class']

# Vectorize text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_dev_tfidf = tfidf_vectorizer.transform(X_dev)
X_test_tfidf = tfidf_vectorizer.transform(test_df['Text'])

# Train Random Forest Classifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Evaluate on development/validation data
dev_predictions = rf_model.predict(X_dev_tfidf)

dev_accuracy = accuracy_score(y_dev, dev_predictions)
print(f"Development Set Accuracy: {dev_accuracy:.4f}")

print("Development Set Classification Report:")
print(classification_report(y_dev, dev_predictions, target_names=['Non-Abusive', 'Abusive']))

# Predict on test data
test_predictions = rf_model.predict(X_test_tfidf)

# Save predictions to the test data
test_df['predicted_class'] = test_predictions
test_df['predicted_class'] = test_df['predicted_class'].map({0: 'Non-Abusive', 1: 'Abusive'})

# Save the test predictions to a CSV file
output_file_path = 'C:\\Users\\DHARANI SINDHU\\OneDrive\\Documents\\Codelab\\ATRFC001_Predictions.csv'
test_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to '{output_file_path}'.")


NaN values found in 'Class' column of train dataset. Removing rows with NaN labels.
Development Set Accuracy: 0.6672
Development Set Classification Report:
              precision    recall  f1-score   support

 Non-Abusive       0.69      0.68      0.68       320
     Abusive       0.64      0.66      0.65       278

    accuracy                           0.67       598
   macro avg       0.67      0.67      0.67       598
weighted avg       0.67      0.67      0.67       598

Predictions saved to 'C:\Users\DHARANI SINDHU\OneDrive\Documents\Codelab\ATRFC001_Predictions.csv'.
