In [3]:
# Import necessary libraries
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
import numpy as np

# Download necessary NLTK resources.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases; 'punkt' is kept for older installs. Requesting a resource that is
# already present is a cheap no-op.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Read the tab-separated training file; it carries no header row, so the
# two columns are named explicitly.
df = pd.read_csv(
    'training_data_lowercase.csv',
    sep='\t',
    header=None,
    names=['label', 'text'],
)

# Clean and preprocess text
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_ENGLISH_STOP_WORDS = None  # filled lazily by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Lower-case *text*, strip punctuation and drop stop words.

    Args:
        text: Raw document. Anything that is not a ``str`` (for example the
            NaN/None pandas uses for a missing cell) is treated as empty.
        stop_words: Optional container of words to remove. When omitted,
            NLTK's English stop-word list is used; it is read from the corpus
            only on the first call instead of once per document.

    Returns:
        The remaining words joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on .lower()
        return ''
    if stop_words is None:
        stop_words = _english_stop_words()

    # Anything that is neither a word character nor whitespace is punctuation
    without_punctuation = _PUNCTUATION_RE.sub('', text.lower())
    return ' '.join(
        word for word in without_punctuation.split() if word not in stop_words
    )

# Normalise every document and keep the result next to the raw text
df['cleaned_text'] = df['text'].map(preprocess_text)

# Word2Vec expects one token list per document
sentences = list(map(nltk.word_tokenize, df['cleaned_text']))

# Train a Word2Vec model on the tokenised corpus.
# sg=0 selects CBOW, which is also gensim's default when `sg` is omitted;
# pass sg=1 here to train a skip-gram model instead.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=0,
)

# Function to create feature vectors by averaging word embeddings
def vectorize_text(text, model):
    """Return the mean embedding of the tokens of *text* known to *model*.

    Args:
        text: Document to embed; it is split with ``nltk.word_tokenize``.
        model: Trained Word2Vec model; its ``wv`` attribute supplies the
            vocabulary lookup and the embedding size.

    Returns:
        A 1-D array of length ``model.wv.vector_size``: the element-wise mean
        of the matching word vectors, or all zeros when no token is in the
        vocabulary.
    """
    keyed_vectors = model.wv
    word_vectors = [
        keyed_vectors[word]
        for word in nltk.word_tokenize(text)
        if word in keyed_vectors
    ]
    if word_vectors:
        return np.mean(word_vectors, axis=0)  # Average the word vectors
    # Size the fallback from the model rather than a literal, so every row has
    # the same length whatever vector_size the model was trained with.
    return np.zeros(keyed_vectors.vector_size)

# Embed each cleaned document; the per-document vectors become the rows of X
X = np.array(
    [vectorize_text(document, w2v_model) for document in df['cleaned_text']]
)

# Targets come straight from the label column
y = df['label']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def _fit_and_report(name, model, X_train, y_train, X_test, y_test):
    """Fit *model*, print its accuracy and classification report, return predictions.

    Args:
        name: Label used as the prefix of the printed headings.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Held-out data the model is scored on.

    Returns:
        The predictions made for ``X_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"{name} Accuracy:", accuracy_score(y_test, predictions))
    print(f"{name} Classification Report:")
    print(classification_report(y_test, predictions))
    return predictions


### Model 1: Support Vector Machine (SVM) ###
svm_model = SVC(kernel='linear', random_state=42)
y_pred_svm = _fit_and_report("SVM", svm_model, X_train, y_train, X_test, y_test)

### Model 2: Logistic Regression ###
logreg_model = LogisticRegression(max_iter=1000, random_state=42)
y_pred_logreg = _fit_and_report(
    "Logistic Regression", logreg_model, X_train, y_train, X_test, y_test
)

### Model 3: RandomForestClassifier ###
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf = _fit_and_report(
    "Random Forest", rf_model, X_train, y_train, X_test, y_test
)

### Model 4: XGBoost ###
xgb_model = XGBClassifier(random_state=42)
y_pred_xgb = _fit_and_report("XGBoost", xgb_model, X_train, y_train, X_test, y_test)

# NOTE: everything between the triple quotes below is a bare string literal,
# not executable code, so none of the pickle.dump calls inside it run.
# To actually persist the four fitted models, move the statements out of the
# string.
""" # Optional: Save models for future use
import pickle
with open('svm_model_w2v.pkl', 'wb') as f:
    pickle.dump(svm_model, f)

with open('logreg_model_w2v.pkl', 'wb') as f:
    pickle.dump(logreg_model, f)

with open('rf_model_w2v.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

with open('xgb_model_w2v.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)
 """


[nltk_data] Downloading package punkt to C:\Users\mktmi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mktmi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


SVM Accuracy: 0.8047138047138047
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.88      0.82      3529
           1       0.85      0.72      0.78      3302

    accuracy                           0.80      6831
   macro avg       0.81      0.80      0.80      6831
weighted avg       0.81      0.80      0.80      6831

Logistic Regression Accuracy: 0.8003220611916264
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.86      0.82      3529
           1       0.83      0.74      0.78      3302

    accuracy                           0.80      6831
   macro avg       0.80      0.80      0.80      6831
weighted avg       0.80      0.80      0.80      6831

Random Forest Accuracy: 0.8322353974527887
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      3529
     