In [1]:
# Import required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the train / dev / unlabeled-test splits.
# All three CSVs live in the same folder; update `data_dir` with your own path.
data_dir = 'C:\\Users\\DHARANI SINDHU\\OneDrive\\Documents\\Codelab-docs-1\\'
train_df, dev_df, test_df = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('AWM_train.csv', 'AWM_dev.csv', 'AWM_test_without_labels.csv')
)

# Preprocess training and development data

# Integer encoding of the string labels: 1 for 'Abusive', 0 for 'Non-Abusive'.
LABEL_TO_ID = {'Abusive': 1, 'Non-Abusive': 0}


def _prepare_labeled_frame(df, name):
    """Return a cleaned copy of a labeled split with integer 'Class' labels.

    Rows with a missing 'Text' or 'Class' value are removed, the remaining
    labels are mapped through LABEL_TO_ID, and rows whose label is not a key
    of LABEL_TO_ID are removed as well (their count is printed so the data
    loss is visible instead of silent).

    Args:
        df: DataFrame containing 'Text' and 'Class' columns.
        name: Human-readable split name used in the printed message.

    Returns:
        A new DataFrame (original index preserved) whose 'Class' column holds
        only the integers 0 and 1.
    """
    # Work on a copy so the column assignment below never writes into a view.
    cleaned = df.dropna(subset=['Text', 'Class']).copy()

    # Series.map yields NaN for every value that is not a key of the dict.
    cleaned['Class'] = cleaned['Class'].map(LABEL_TO_ID)
    unmapped_count = int(cleaned['Class'].isna().sum())
    if unmapped_count:
        print(f"{unmapped_count} row(s) in {name} have an unrecognized 'Class' label and were removed.")

    # Any NaN produced by the mapping forces a float column; after dropping
    # those rows, cast back so labels (and later predictions) are plain ints.
    return cleaned.dropna(subset=['Class']).astype({'Class': int})


train_df = _prepare_labeled_frame(train_df, 'train_df')
dev_df = _prepare_labeled_frame(dev_df, 'dev_df')

# Prepare the data for TF-IDF vectorization
X_train = train_df['Text']
y_train = train_df['Class']
X_dev = dev_df['Text']
y_dev = dev_df['Class']

# Vectorize text data using TF-IDF
# The vocabulary (at most 5000 terms, English stop words removed) is learned
# from the training text only and then reused for the dev and test splits.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_dev_tfidf = tfidf_vectorizer.transform(X_dev)

# Unlike train/dev, the test split is never filtered for missing text, and the
# vectorizer rejects NaN documents. Treat missing text as an empty document so
# every test row is kept and still receives a prediction.
X_test_tfidf = tfidf_vectorizer.transform(test_df['Text'].fillna(''))

# Train Support Vector Machine (SVM) model
# You can also try other kernels (e.g., 'rbf', 'poly')
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Evaluate on development/validation data
dev_predictions = svm_model.predict(X_dev_tfidf)

dev_accuracy = accuracy_score(y_dev, dev_predictions)
print(f"Development Set Accuracy: {dev_accuracy:.4f}")

# Pass `labels` explicitly so each entry of `target_names` is tied to its
# encoded class (0 -> 'Non-Abusive', 1 -> 'Abusive') even if one class happens
# to be absent from both the dev labels and the predictions.
print("Development Set Classification Report:")
print(classification_report(y_dev, dev_predictions, labels=[0, 1], target_names=['Non-Abusive', 'Abusive']))

# Predict on test data
test_predictions = svm_model.predict(X_test_tfidf)

# Attach the predictions to the test data as readable class names.
# The predictions are positional, so align them to the test frame's index
# before translating the 0/1 codes back to their string labels.
class_names = {0: 'Non-Abusive', 1: 'Abusive'}
predicted_codes = pd.Series(test_predictions, index=test_df.index)
test_df['predicted_class'] = predicted_codes.map(class_names)

# Write the test rows together with their predicted class to a CSV file
# (update the path below with your desired location).
output_file_path = 'C:\\Users\\DHARANI SINDHU\\OneDrive\\Documents\\Codelab\\AMSVM001.csv'
test_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to '{output_file_path}'.")


Development Set Accuracy: 0.6343
Development Set Classification Report:
              precision    recall  f1-score   support

 Non-Abusive       0.64      0.66      0.65       326
     Abusive       0.62      0.61      0.62       303

    accuracy                           0.63       629
   macro avg       0.63      0.63      0.63       629
weighted avg       0.63      0.63      0.63       629

Predictions saved to 'C:\Users\DHARANI SINDHU\OneDrive\Documents\Codelab\AMSVM001.csv'.
