In [2]:
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [3]:
def compute_bow(documents, max_features):
    """
    Computes the bag-of-words matrix for the given documents.

    Parameters:
    - documents: List of text documents.
    - max_features: Maximum number of features (vocabulary size).

    Returns:
    - bow_matrix: Sparse matrix of shape (n_samples, n_features).
    """
    # TODO: Implement me!
    vectorizer = CountVectorizer(max_features=max_features)
    bow_matrix = vectorizer.fit_transform(documents)
    return bow_matrix

In [4]:
def compute_tfidf(documents, max_features):
    """
    Computes the tfidf matrix for the given documents.

    Parameters:
    - documents: List of text documents.
    - max_features: Maximum number of features to use.

    Returns:
    - tfidf_matrix: Sparse matrix of shape (n_samples, n_features).
    """
    # TODO: Implement me!
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix

In [5]:
def apply_lsa(tfidf_matrix, n_components):
    """
    Applies LSA (using TruncatedSVD) to the tfidf matrix.

    Parameters:
    - tfidf_matrix: Sparse matrix from tfidf vectorization.
    - n_components: Number of components to keep.

    Returns:
    - lsa_matrix: Dense matrix with reduced dimensions.
    """
    # TODO: Implement me!
    vectorizer = TruncatedSVD(n_components=n_components)
    lsa_matrix = vectorizer.fit_transform(tfidf_matrix)
    return lsa_matrix

In [6]:
def create_documents(df):
  documents = df['review'].tolist()
  return documents

In [7]:
def create_labels(df):
  labels = df['voted_up'].tolist()
  return labels

In [8]:
max_features = 1000

In [9]:
df = pd.read_csv('cleaned_data.csv')
documents = create_documents(df)
labels = create_labels(df)
bow_matrix = compute_bow(documents, 1000)
print("Bag of words matrix:")
print(bow_matrix)

tfidf_matrix = compute_tfidf(documents, 1000)
print("TF-IDF matrix:")
print(tfidf_matrix)

lsa_matrix = apply_lsa(tfidf_matrix, 100)
print("LSA matrix:")
print(lsa_matrix)

Bag of words matrix:
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 12240941 stored elements and shape (193174, 1000)>
  Coords	Values
  (1, 659)	1
  (1, 644)	1
  (1, 353)	1
  (1, 613)	1
  (1, 781)	1
  (2, 659)	1
  (2, 103)	1
  (2, 974)	1
  (2, 566)	1
  (2, 51)	1
  (3, 353)	1
  (3, 974)	1
  (3, 51)	3
  (3, 575)	1
  (3, 343)	1
  (3, 968)	1
  (3, 658)	1
  (3, 829)	1
  (3, 737)	1
  (3, 439)	1
  (3, 676)	1
  (3, 728)	1
  (3, 609)	1
  (3, 858)	1
  (3, 458)	1
  :	:
  (193173, 797)	2
  (193173, 321)	1
  (193173, 463)	1
  (193173, 218)	1
  (193173, 827)	1
  (193173, 304)	1
  (193173, 227)	1
  (193173, 311)	1
  (193173, 258)	1
  (193173, 241)	2
  (193173, 186)	1
  (193173, 573)	1
  (193173, 976)	1
  (193173, 717)	1
  (193173, 699)	1
  (193173, 760)	2
  (193173, 733)	1
  (193173, 223)	1
  (193173, 634)	1
  (193173, 745)	1
  (193173, 456)	1
  (193173, 457)	1
  (193173, 257)	2
  (193173, 540)	1
  (193173, 908)	1
TF-IDF matrix:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	w

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB



from xgboost import XGBClassifier



In [11]:
def evaluate_knn(features, labels, n_neighbors=5, test_size=0.2):
    """
    Splits the dataset, trains a kNN classifier, and evaluates performance.

    Parameters:
    - features: Feature matrix (can be tfidf or LSA-transformed).
    - labels: Ground truth labels.
    - n_neighbors: Number of neighbors for kNN.
    - test_size: Fraction of data to use for testing.

    Returns:
    - accuracy: Accuracy score on the test set.
    """
    trainX, testX, trainY, testY = train_test_split(features, labels, test_size=test_size)

    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
    knn.fit(trainX, trainY)

    predict = knn.predict(testX)
    accuracy = accuracy_score(testY, predict)
    print("Accuracy:", accuracy)
    print(classification_report(testY, predict))

    return accuracy, knn  # we return the trained model

In [12]:
def evaluate_decision_tree(X, y, test_size=0.2, max_depth=70):
    """
    Splits the dataset, trains a Decision Tree classifier,
    and evaluates performance.

    Parameters:
    - X: Feature matrix.
    - y: Ground truth labels.
    - test_size: Fraction of data to use for testing.
    - max_depth: Maximum depth of the decision tree.


    Returns:
    - accuracy: Accuracy score on the test set.
    - dt_model: The trained Decision Tree model.
    """
    # 1. Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    # 2. Instantiate Model 
    dt_model = DecisionTreeClassifier(
        max_depth=max_depth,
        criterion='gini', 
        min_samples_split=2, 
        min_samples_leaf=1
    )

    # 3. Train
    dt_model.fit(X_train, y_train)

    # 4. Predict
    y_pred = dt_model.predict(X_test)

    # 5. Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Decision Tree (max_depth={max_depth}) Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return accuracy, dt_model

In [13]:
def evaluate_naive_bayes(X, y, test_size=0.2):
    """
    Splits the dataset, trains a Naive Bayes classifier (MultinomialNB),
    and evaluates performance.

    Parameters:
    - X: Feature matrix (may not allow LSA features because negative values).
    - y: Ground truth labels.
    - test_size: Fraction of data to use for testing.

    Returns:
    - accuracy: Accuracy score on the test set.
    - nb_model: The trained Naive Bayes model.
    """
    # 1. Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    # 2. Instantiate Model
    nb_model = MultinomialNB()
    # nb_model = GaussianNB()        # uncomment for lsa tfidf

    # 3. Train
    nb_model.fit(X_train, y_train)

    # 4. Predict
    y_pred = nb_model.predict(X_test)

    # 5. Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print("Naive Bayes Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return accuracy, nb_model

In [14]:
def evaluate_xgboost(X, y, test_size=0.2, n_estimators=300, max_depth=6, learning_rate=0.1, subsample=0.8):
    """
    Splits the dataset, trains an XGBoost classifier,
    and evaluates performance.

    Parameters:
    - X: Feature matrix.
    - y: Ground truth labels.
    - test_size: Fraction of data to use for testing.
    - n_estimators: Number of boosting rounds.
    - max_depth: Maximum depth of each tree.
    - learning_rate: Reduce overfitting.
    
    Returns:
    - accuracy: Accuracy score on the test set.
    - xgb_model: The trained XGBoost model.
    """
    # 1. Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    # 2. Instantiate XGB model
    xgb_model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
    )

    # 3. Train
    xgb_model.fit(X_train, y_train)

    # 4. Predict
    y_pred = xgb_model.predict(X_test)

    # 5. Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print(f"XGBoost (n_estimators={n_estimators}, max_depth={max_depth}, "
          f"learning_rate={learning_rate}) Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return accuracy, xgb_model


In [None]:
# Run everything

y = create_labels(df)

bow_matrix = compute_bow(documents, max_features)
X_tfidf = compute_tfidf(documents, max_features)
X_lsa = apply_lsa(X_tfidf, 100)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(bow_matrix, labels, test_size=0.2, random_state=42)

accuracy, model = evaluate_knn(X_train, y_train, n_neighbors=5)

# acc_dt, model_dt = evaluate_decision_tree(X_tfidf, y, max_depth=70)

# acc_nb, model_nb = evaluate_naive_bayes(X_tfidf, y)

# acc_xgb, model_xgb = evaluate_xgboost(X_tfidf, y)

NameError: name 'X_tfidf' is not defined