In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC  # Import Linear Support Vector Classification (SVM)
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

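# NLTK data required by this notebook: the 'stopwords' corpus and the 'punkt' tokenizer models.
# Uncomment the following lines if you haven't downloaded them yet.
# Note: recent NLTK releases make word_tokenize look up 'punkt_tab' instead of 'punkt';
# if you hit a LookupError for it, also run nltk.download('punkt_tab').
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')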

# Read the tab-separated dataset, discard rows that lack a 'text' or 'label'
# value, and shuffle the remaining rows with a fixed seed.
df = shuffle(
    pd.read_csv('./dataset.tsv', delimiter='\t').dropna(subset=['text', 'label']),
    random_state=42,
)

# Data preprocessing
# Built once at definition time: the stop-word set and the compiled pattern are
# the same for every row, so rebuilding them inside preprocess_text (which is
# applied to each row of the dataset) was repeated loop-invariant work.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def preprocess_text(text):
    """Normalise one document for TF-IDF vectorisation.

    Steps: lowercase the input, delete every character that is not an ASCII
    letter or whitespace, tokenize with NLTK's ``word_tokenize``, drop English
    stop words, and join the surviving tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values are converted with ``str()``
        first so that a stray numeric cell does not raise ``AttributeError``.

    Returns
    -------
    str
        The cleaned text; an empty string if no token survives.
    """
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    lowered = text.lower()

    # Remove everything except lowercase letters and whitespace (digits and
    # punctuation are deleted, not replaced, so "don't" becomes "dont").
    letters_only = _NON_LETTER_RE.sub('', lowered)

    # Tokenize and drop stop words; joining on a single space also collapses
    # any runs of whitespace left over from the previous step.
    kept_tokens = [
        word for word in word_tokenize(letters_only)
        if word not in _ENGLISH_STOP_WORDS
    ]

    return ' '.join(kept_tokens)

# Single seed shared by every stochastic step in this section, so that
# Restart Kernel -> Run All reproduces the same split and the same model.
RANDOM_STATE = 42

# Clean the text column and encode the labels as integer class ids
df['text'] = df['text'].apply(preprocess_text)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Features (X) - 'text', Labels (y) - 'label'
X = df['text']
y = df['label']

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# TF-IDF Vectorization: the vocabulary (capped at 5000 terms) and IDF weights
# are fitted on the training split only and then reused for the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Linear Support Vector Machine (SVM) model.
# random_state is set explicitly: the LinearSVC solver uses a random number
# generator, so an unseeded model can give slightly different results per run.
svm_model = LinearSVC(random_state=RANDOM_STATE)
svm_model.fit(X_train_tfidf, y_train)

# Predictions on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report for precision, recall, and F1-score
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.96
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96       244
           1       0.97      0.94      0.96       231

    accuracy                           0.96       475
   macro avg       0.96      0.96      0.96       475
weighted avg       0.96      0.96      0.96       475



