In [1]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# NLTK setup
nltk.download('punkt')
nltk.data.path.append(r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\nltk_data")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jorda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load data
train_df = pd.read_csv("cleaned_train.csv")
test_df = pd.read_csv("cleaned_test.csv")

# Tokenize train corpus for Word2Vec
tokenized_train = train_df['text_final'].dropna().apply(word_tokenize).tolist()

In [3]:
# Train Word2Vec
w2v_model = Word2Vec(
    sentences=tokenized_train,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    seed=42
)

# Save the model
w2v_model.save("word2vec_model.model")

# Fit TF-IDF on training corpus
train_corpus = train_df['text_final'].fillna("")
tfidf = TfidfVectorizer()
tfidf.fit(train_corpus)
idf_weights = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

In [4]:
def get_weighted_w2v(text, model, idf_dict):
    tokens = word_tokenize(text)
    word_vecs = []
    weight_sum = 0
    for word in tokens:
        if word in model.wv and word in idf_dict:
            vec = model.wv[word] * idf_dict[word]
            word_vecs.append(vec)
            weight_sum += idf_dict[word]
    if word_vecs:
        return np.sum(word_vecs, axis=0) / weight_sum
    else:
        return np.zeros(model.vector_size)

# Apply to training and test sets
train_features = np.array([get_weighted_w2v(text, w2v_model, idf_weights) for text in train_corpus])
test_corpus = test_df['text_final'].fillna("")
test_features = np.array([get_weighted_w2v(text, w2v_model, idf_weights) for text in test_corpus])

In [5]:
np.save("train_features.npy", train_features)
np.save("test_features.npy", test_features)
train_df[['target']].to_csv("train_labels.csv", index=False)

print("Saved train/test features and labels.")

Saved train/test features and labels.


In [None]:
X = train_features
y = train_df['target'].values

# Split into train/validation (80/20) for evaluation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train logistic regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

# Evaluate
y_pred = lr.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))

Accuracy: 0.7275114904793172

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.92      0.79       869
           1       0.82      0.47      0.60       654

    accuracy                           0.73      1523
   macro avg       0.76      0.70      0.70      1523
weighted avg       0.75      0.73      0.71      1523


Confusion Matrix:
 [[801  68]
 [347 307]]


In [7]:
# Predict on test set (for submission or inspection)
test_preds = lr.predict(test_features)

# Save predictions
submission = pd.DataFrame({
    'id': test_df['id'] if 'id' in test_df.columns else range(len(test_preds)),
    'target': test_preds
})
submission.to_csv("logreg_submission.csv", index=False)
print("Test predictions saved to logreg_submission.csv")

Test predictions saved to logreg_submission.csv
