# Problem 1: Bag-of-Words Feature Representation
In this notebook, we will explore the concept of the Bag-of-Words (BoW) representation for text data and its two popular variations:


In [85]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import pandas as pd

In [86]:
x_train_df = pd.read_csv('../data_reviews/x_train.csv')
y_train_df = pd.read_csv('../data_reviews/y_train.csv')
tr_list_of_sentences = x_train_df['text'].values.tolist()
y = y_train_df['is_positive_sentiment'].values


# Check the lengths
print(f'Length of tr_list_of_sentences: {len(tr_list_of_sentences)}')
print(f'Length of y_train: {len(y)}')

Length of tr_list_of_sentences: 2400
Length of y_train: 2400


In [67]:
# Split the data into training and validation sets
# X_train, X_test, y_train, y_test = train_test_split(
#     tr_list_of_sentences, 
#     y, 
#     test_size=0.25,  # You can adjust the size as needed (e.g., 0.2 for 20%)
#     random_state=42  # Seed for reproducibility
# )

# # Check the resulting lengths to ensure correct splitting
# print(f'Length of X_train: {len(X_train)}')
# print(f'Length of X_test: {len(X_test)}')
# print(f'Length of y_train_split: {len(y_train)}')
# print(f'Length of y_test: {len(y_test)}')

Length of X_train: 1800
Length of X_test: 600
Length of y_train_split: 1800
Length of y_test: 600


We want to only include words that have a count greater than 10 to filter out rare words and misspelled

## Unigram vs Bigram with CountVectorizer

When dealing with text data, the terms "unigram", "bigram", "trigram", and so on refer to a set of consecutive words or tokens taken as a unit. Specifically:
- **Unigram**: Single words. E.g., "sky", "blue"
- **Bigram**: Two contiguous words. E.g., "sky is", "is blue"
- **Trigram**: Three contiguous words. E.g., "The sky is"

Let's see how `CountVectorizer` can be used to extract unigrams and bigrams from our example sentences.

In [91]:
# Creating the pipeline
vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english', min_df=5)

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('tfidf', TfidfTransformer(smooth_idf=True, use_idf=True)),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [92]:
param_grid = {
    'classifier__solver': ['lbfgs', 'saga'],
    'vectorizer__max_features': [100, 500, 1000],
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2', 'elasticnet']
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='roc_auc'
)

X_train = x_train_df['text']
y_train = y_train_df.values.ravel()

grid_search.fit(X_train, y_train)

225 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/manuelpena/micromamba/envs/cs135_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/manuelpena/micromamba/envs/cs135_env/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/manuelpena/micromamba/envs/cs135_env/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  F

In [93]:
y_pred = grid_search.best_estimator_.predict(X_train)

print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

y_pred_prob = grid_search.best_estimator_.predict_proba(X_train)[:, 1]

auc = roc_auc_score(y_train, y_pred_prob)

print(f'AUC: {auc:.4f}')

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1200
           1       0.88      0.83      0.85      1200

    accuracy                           0.86      2400
   macro avg       0.86      0.86      0.86      2400
weighted avg       0.86      0.86      0.86      2400

[[1067  133]
 [ 207  993]]
AUC: 0.9301


In [70]:
# y_pred = grid_search.best_estimator_.predict(X_test)

# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test , y_pred))

# y_pred_prob = grid_search.best_estimator_.predict_proba(X_test)[:, 1]

# auc = roc_auc_score(y_test, y_pred_prob)

# print(f'AUC: {auc:.4f}')

              precision    recall  f1-score   support

           0       0.66      0.87      0.75       290
           1       0.83      0.59      0.69       310

    accuracy                           0.72       600
   macro avg       0.75      0.73      0.72       600
weighted avg       0.75      0.72      0.72       600

[[252  38]
 [128 182]]
AUC: 0.8202


In [94]:
# Load the test data
x_test_df = pd.read_csv('../data_reviews/x_test.csv')

# Get the predicted probabilities for the positive class
y_test_pred_prob = grid_search.best_estimator_.predict_proba(x_test_df['text'])[:, 1]

# Save the probabilities to a plain-text file
with open('../data_reviews/yproba1_test.txt', 'w') as f:
    for prob in y_test_pred_prob:
        f.write(f"{prob:.6f}\n")  # Formatting to six decimal places

print("Probabilistic predictions saved to '../data_reviews/yproba1_test.txt'.")

Probabilistic predictions saved to '../data_reviews/yproba1_test.txt'.
