**Readme File**

The project, titled "Weakly Supervised Sentiment Analysis for IMDb Movie Reviews: Logistic Regression and Pseudo-Labeling", includes the source code and this readme file, which provides instructions
on reproducing the experiments and generating the plots. The source code contains the implementation of the sentiment analysis model for IMDb movie reviews, leveraging weak supervision, logistic regression, and pseudo-labeling.

The code is simple to run in a Jupyter notebook (I ran it in Google Colab) in a few steps. This readme outlines how to set up the environment, preprocess the data, train the model, and evaluate its performance. It also provides guidance on generating relevant plots to visualize the results. The project aims to facilitate easy replication of the experiments and enable users to understand and utilize the sentiment analysis model effectively.

**Download and Extract the Dataset**

In [None]:
import os
import tarfile

dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
download_path = "aclImdb_v1.tar.gz"
extract_path = "/content/aclImdb"

# Download the dataset. os.system returns the command's exit status; fail
# here so a broken download does not surface later as a tarfile error.
exit_status = os.system(f"wget {dataset_url} -O {download_path}")
if exit_status != 0:
    raise RuntimeError(f"wget failed with exit status {exit_status}")

# Extract the dataset into the parent of extract_path instead of the current
# working directory, so the result does not depend on where the notebook runs.
# Assumes the archive's top-level directory is "aclImdb", as the load paths
# used later in this notebook expect.
extract_root = os.path.dirname(extract_path)
os.makedirs(extract_root, exist_ok=True)
with tarfile.open(download_path, "r:gz") as tar:
    tar.extractall(path=extract_root)

**Install Packages**

In [None]:
!pip install numpy pandas scikit-learn nltk
!pip install matplotlib scikit-learn
%config NotebookApp.iopub_data_rate_limit = 10000000

**Import Packages**

In [None]:
import nltk
import numpy as np
import os
import pickle
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.sparse import vstack
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from sklearn.metrics import roc_curve, roc_auc_score
# word_tokenize needs the Punkt tokenizer data. Older NLTK releases look it up
# as 'punkt', newer ones as 'punkt_tab', so request both; a resource name the
# installed version does not know is reported by nltk.download without raising.
nltk.download('punkt')
nltk.download('punkt_tab')

**Load Dataset**

In [None]:
def load_data(directory):
    """Read every ``.txt`` file directly inside *directory*.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        A list with the full text of each ``.txt`` file, ordered by file name.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # returned list reproducible across runs and machines.
    for file_name in sorted(os.listdir(directory)):
        if file_name.endswith('.txt'):
            file_path = os.path.join(directory, file_name)
            # Decode explicitly so the result does not depend on the
            # platform's default text encoding.
            with open(file_path, 'r', encoding='utf-8') as file:
                data.append(file.read())
    return data

In [None]:
# Training splits: labeled negative/positive reviews plus the unlabeled pool
train_neg_data, train_pos_data, uns_data = (
    load_data(split_dir)
    for split_dir in (
        '/content/aclImdb/train/neg',
        '/content/aclImdb/train/pos',
        '/content/aclImdb/train/unsup',
    )
)
# Test splits: labeled negative/positive reviews
test_neg_data, test_pos_data = (
    load_data(split_dir)
    for split_dir in (
        '/content/aclImdb/test/neg',
        '/content/aclImdb/test/pos',
    )
)

**Preprocess the Data**

In [None]:
def preprocess(text):
    """Tokenize *text* with NLTK and return the tokens joined by single spaces."""
    return ' '.join(nltk.word_tokenize(text))

In [None]:
# Tokenize the training splits (labeled and unlabeled) in place
train_neg_data = list(map(preprocess, train_neg_data))
train_pos_data = list(map(preprocess, train_pos_data))
uns_data = list(map(preprocess, uns_data))

# Tokenize the test splits in place
test_neg_data = list(map(preprocess, test_neg_data))
test_pos_data = list(map(preprocess, test_pos_data))

In [None]:
# Negative reviews come first, then positive ones, in both corpora
train_data = [*train_neg_data, *train_pos_data]
test_data = [*test_neg_data, *test_pos_data]

# Matching labels in the same order: 0 per negative review, 1 per positive review
train_labels = [0 for _ in train_neg_data] + [1 for _ in train_pos_data]
test_labels = [0 for _ in test_neg_data] + [1 for _ in test_pos_data]

In [None]:
# Number of reviews in each split
num_train_neg_data, num_train_pos_data = len(train_neg_data), len(train_pos_data)
num_test_neg_data, num_test_pos_data = len(test_neg_data), len(test_pos_data)
num_uns_data = len(uns_data)

# Report the sizes one per line: train neg/pos, test neg/pos, unlabeled
print(
    num_train_neg_data,
    num_train_pos_data,
    num_test_neg_data,
    num_test_pos_data,
    num_uns_data,
    sep='\n',
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Per-split counts shown in the chart
train_neg_count, train_pos_count = num_train_neg_data, num_train_pos_data
test_neg_count, test_pos_count = num_test_neg_data, num_test_pos_data
uns_count = num_uns_data

# One bar position per dataset
labels = ['Train', 'Test', 'Unlabeled']
x = np.arange(len(labels))

# Bar colors, indexed as: negative, positive, unlabeled
colors = ['#FF69B4', '#8A2BE2', '#FFA500']

# Bar heights per dataset; a zero means that series has no bar there
positive_heights = [train_pos_count, test_pos_count, 0]
negative_heights = [train_neg_count, test_neg_count, 0]
unlabeled_heights = [0, 0, uns_count]

fig, ax = plt.subplots()
ax.bar(x, positive_heights, color=colors[1], label='Positive')
# Negative counts are stacked on top of the positive ones
ax.bar(x, negative_heights, color=colors[0], label='Negative', bottom=positive_heights)
ax.bar(x, unlabeled_heights, color=colors[2], label='Unlabeled')

# Axis labels, title, tick names and legend
ax.set_title('Number of Data Points')
ax.set_xlabel('Dataset')
ax.set_ylabel('Count')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Category shown at each x position
labels = ['Train Negative', 'Train Positive', 'Test Negative', 'Test Positive', 'Unsupervised']

# Each series is non-zero only at the categories that belong to it
train_counts = [num_train_neg_data, num_train_pos_data, 0, 0, 0]
test_counts = [0, 0, num_test_neg_data, num_test_pos_data, 0]
uns_counts = [0, 0, 0, 0, num_uns_data]

# Draw one line per series, in the order train, test, unsupervised
for series_name, series_counts in (
    ('Train', train_counts),
    ('Test', test_counts),
    ('Unsupervised', uns_counts),
):
    plt.plot(labels, series_counts, label=series_name)

plt.title('Number of Data Points')
plt.xlabel('Dataset')
plt.ylabel('Count')
plt.legend()
plt.show()

**Bag-of-Words**

In [None]:
# Vectorize the data (bag-of-words token counts).
# The vocabulary is learned from the labeled and unlabeled training text only.
# Fitting on test_data as well would leak test-set information into the
# feature space (and into Naive Bayes smoothing via the vocabulary size).
# Test tokens that are not in the vocabulary are ignored by transform().
vectorizer = CountVectorizer()
vectorizer.fit(train_data + uns_data)

train_features = vectorizer.transform(train_data)
uns_features = vectorizer.transform(uns_data)
test_features = vectorizer.transform(test_data)

**NB Model**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

# Train the initial model on the labeled data only
nb_model = MultinomialNB().fit(train_features, train_labels)

# Predict on unlabeled data
threshold = 0.9  # Confidence threshold for pseudo-labeling
nb_probs = nb_model.predict_proba(uns_features)

# Keep only the unlabeled rows whose highest class probability exceeds the
# threshold. The mask is computed once, vectorized, instead of per row.
nb_confident = nb_probs.max(axis=1) > threshold
nb_pseudo_features = uns_features[nb_confident]
# predict_proba columns follow classes_, so map the winning column back to
# its class label rather than using the column index as the label.
nb_pseudo_labels = nb_model.classes_[nb_probs.argmax(axis=1)][nb_confident]

# Combine pseudo-labeled data with original training data
combined_nb_features = vstack([train_features, nb_pseudo_features])
combined_nb_labels = np.concatenate([train_labels, nb_pseudo_labels])

# Retrain a fresh model on labeled + pseudo-labeled data
nb_model_final = MultinomialNB().fit(combined_nb_features, combined_nb_labels)

# Evaluate the retrained model on the test set
nb_predictions = nb_model_final.predict(test_features)
nb_accuracy = accuracy_score(test_labels, nb_predictions)
nb_conf_matrix = confusion_matrix(test_labels, nb_predictions)

print(f"Naive Bayes Accuracy: {nb_accuracy}")
print("\n")
print("Naive Bayes Confusion Matrix:")
print(nb_conf_matrix)
print("\n")

# Macro averaging: unweighted mean of the per-class scores
nb_precision = precision_score(test_labels, nb_predictions, average='macro')
nb_recall = recall_score(test_labels, nb_predictions, average='macro')
nb_f1 = f1_score(test_labels, nb_predictions, average='macro')
print("NB Precision:",nb_precision)
print("NB Recall:",nb_recall)
print("NB F1:",nb_f1)

# Visualize the Confusion Matrix
plt.figure(figsize=(5, 3))
sns.heatmap(nb_conf_matrix, annot=True, fmt='d', cmap='Oranges')
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Text summary of the Naive Bayes test predictions against the true labels
nb_classification_report = classification_report(y_true=test_labels, y_pred=nb_predictions)

# Printed as one call: accuracy, blank lines, heading, then the report
print(
    f"Naive Bayes Accuracy: {nb_accuracy}",
    "\n",
    "NB Classification Report:",
    nb_classification_report,
    sep="\n",
)

**LR Model**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Train the initial model on the labeled data only
lr_model = LogisticRegression(max_iter=1000).fit(train_features, train_labels)

# Predict on unlabeled data
threshold = 0.9  # Confidence threshold for pseudo-labeling
lr_probs = lr_model.predict_proba(uns_features)

# Keep only the unlabeled rows whose highest class probability exceeds the
# threshold. The mask is computed once, vectorized, instead of per row.
lr_confident = lr_probs.max(axis=1) > threshold
lr_pseudo_features = uns_features[lr_confident]
# predict_proba columns follow classes_, so map the winning column back to
# its class label rather than using the column index as the label.
lr_pseudo_labels = lr_model.classes_[lr_probs.argmax(axis=1)][lr_confident]

# Combine pseudo-labeled data with original training data
combined_lr_features = vstack([train_features, lr_pseudo_features])
combined_lr_labels = np.concatenate([train_labels, lr_pseudo_labels])

# Retrain a fresh model on labeled + pseudo-labeled data
lr_model_final = LogisticRegression(max_iter=1000).fit(combined_lr_features, combined_lr_labels)

# Evaluate the retrained model on the test set
lr_predictions = lr_model_final.predict(test_features)
lr_accuracy = accuracy_score(test_labels, lr_predictions)
lr_conf_matrix = confusion_matrix(test_labels, lr_predictions)
print(f"Logistic Regression Accuracy: {lr_accuracy}")
print("\n")

# Macro averaging: unweighted mean of the per-class scores
lr_precision = precision_score(test_labels, lr_predictions, average='macro')
lr_recall = recall_score(test_labels, lr_predictions, average='macro')
lr_f1 = f1_score(test_labels, lr_predictions, average='macro')
# Report the LR metrics computed above (these lines previously printed the
# Naive Bayes values under the LR labels).
print("LR Precision:",lr_precision)
print("LR Recall:",lr_recall)
print("LR F1:",lr_f1)

print("Logistic Regression Confusion Matrix:")
print(lr_conf_matrix)
print("\n")

# Visualize the Confusion Matrix
plt.figure(figsize=(6, 4))
sns.heatmap(lr_conf_matrix, annot=True, fmt='d', cmap='YlGn')
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Text summary of the Logistic Regression test predictions against the true labels
lr_classification_report = classification_report(y_true=test_labels, y_pred=lr_predictions)

# Printed as one call: accuracy, blank lines, heading, then the report
print(
    f"Logistic Regression Accuracy: {lr_accuracy}",
    "\n",
    "LR Classification Report:",
    lr_classification_report,
    sep="\n",
)

**Compare Models**

In [None]:
# Compare Results: grouped bar chart with one NB/LR pair of bars per metric
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
nb_scores = [nb_accuracy, nb_precision, nb_recall, nb_f1]
lr_scores = [lr_accuracy, lr_precision, lr_recall, lr_f1]

bar_width = 0.2

plt.figure(figsize=(6, 4))
x = range(len(metrics))
plt.bar(x, nb_scores, width=bar_width, label='NB')
plt.bar([i + bar_width for i in x], lr_scores, width=bar_width, label='LR')
plt.xlabel('Metrics')
plt.ylabel('Scores')
# Bars are centered at i (NB) and i + bar_width (LR), so the middle of each
# pair is i + bar_width / 2; put the metric name there instead of under NB.
plt.xticks([i + bar_width / 2 for i in x], metrics)
plt.title('Comparison of Sentiment Analysis Approaches')
plt.legend()
plt.show()

from sklearn.metrics import roc_curve, roc_auc_score

# Probability of class 1 on the test set from each retrained model.
# Note: lr_probs / nb_probs are rebound here to 1-D test-set scores.
lr_probs = lr_model_final.predict_proba(test_features)[:, 1]
nb_probs = nb_model_final.predict_proba(test_features)[:, 1]

# ROC points and area under the curve for each model
lr_fpr, lr_tpr, _ = roc_curve(test_labels, lr_probs)
lr_auc = roc_auc_score(test_labels, lr_probs)
nb_fpr, nb_tpr, _ = roc_curve(test_labels, nb_probs)
nb_auc = roc_auc_score(test_labels, nb_probs)

# Plot LR first, then NB, then the dashed diagonal reference line
plt.plot(lr_fpr, lr_tpr, label=f'LR (AUC = {lr_auc:.2f})')
plt.plot(nb_fpr, nb_tpr, label=f'NB (AUC = {nb_auc:.2f})')
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.title('ROC Curve - LR vs NB')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Pie chart of the two test accuracies.
# NOTE(review): the wedge percentages look like each model's share of the
# summed accuracies rather than the accuracies themselves; confirm this is
# the intended reading.
accuracies = [nb_accuracy, lr_accuracy]
labels = ['Naive Bayes', 'Logistic Regression']
colors = ['#66C2A5', '#FC8D62']
plt.pie(accuracies, labels=labels, colors=colors, autopct='%1.1f%%')
plt.title('Accuracy Comparison: NB vs LR')
plt.show()
print("\n")