In [None]:
import pandas as pd

# Read the training and test splits from JSON files in the working directory
train_data = pd.read_json(path_or_buf='train.json')
test_data = pd.read_json(path_or_buf='test.json')


In [2]:
#1.2 Preprocess 
# For TF-IDF Features
# clean the text by lowercasing, removing punctuation, and removing stopwords to prepare it for TF-IDF vectorization.
import nltk
from nltk.corpus import stopwords
import string

# Characters that preprocess_text deletes from every review
punctuation = string.punctuation

# Make sure the NLTK stopword corpus is present locally, then keep the
# English list as a set for fast membership tests
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

def preprocess_text(text):
    """Normalize one review for TF-IDF vectorization.

    The text is lowercased, every character contained in the module-level
    ``punctuation`` string is deleted, and whitespace-separated tokens that
    appear in the module-level ``stop_words`` set are dropped.

    Args:
        text: Raw review text. Non-string values (``None``, or the NaN pandas
            uses for a missing cell) are treated as an empty review.

    Returns:
        The surviving tokens joined by single spaces; ``''`` if none survive.
    """
    # Missing reviews have no .lower(); return an empty document instead of
    # letting one bad row abort the whole Series.apply pass.
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    # Punctuation is deleted (not replaced by a space) before the stop-word
    # filter runs, so "don't" is compared against stop_words as "dont".
    stripped = lowered.translate(str.maketrans('', '', punctuation))
    return ' '.join(token for token in stripped.split() if token not in stop_words)

# Add a `cleaned_text` column with the normalized review to both splits
for split_frame in (train_data, test_data):
    split_frame['cleaned_text'] = split_frame['review'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LiJiaGeng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'train_data' is not defined

In [None]:
# For BERT Embeddings
# tokenize the text using BERT's tokenizer, ensuring all reviews have the same length for model input.
from transformers import BertTokenizer

# Initialize tokenizer
# Instantiated from the 'bert-base-uncased' pretrained checkpoint name; the
# resulting module-level `tokenizer` is what the tokenization code below calls.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``'review'`` entry of ``examples`` with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 so that all
    encodings share one length.

    Args:
        examples: Mapping that provides the review text under the key ``'review'``.

    Returns:
        Whatever the tokenizer call returns for these settings.
    """
    encode_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}
    return tokenizer(examples['review'], **encode_options)

# Apply tokenization
def _encode_review(review_text):
    """Encode a single review as fixed-length (128-token) PyTorch tensors.

    Calls the tokenizer directly: the Transformers documentation marks
    ``encode_plus`` as deprecated in favour of ``__call__``. The keyword
    arguments are the ones the notebook previously passed to ``encode_plus``.

    Args:
        review_text: The raw text of one review.

    Returns:
        The tokenizer's encoding for that review (includes the attention mask).
    """
    return tokenizer(
        review_text,
        add_special_tokens=True,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

# One encoding per row; both splits share identical settings through _encode_review
train_data_tokens = train_data['review'].apply(_encode_review)
test_data_tokens = test_data['review'].apply(_encode_review)


In [5]:
# Step 2: Model Selection
# 2.1 Baseline Model
# Logistic Regression with TF-IDF
# convert the cleaned text into TF-IDF vectors and train a Logistic Regression model on these features.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Targets for the baseline classifier
y_train = train_data['sentiment']

# Fit the TF-IDF vocabulary and weights on the training reviews only, then
# project the test reviews into that same feature space
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['cleaned_text'])
X_test = vectorizer.transform(test_data['cleaned_text'])

# Train the logistic-regression baseline (fixed seed, at most 1000 solver iterations)
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)


In [None]:
# 2.2 Enhanced Model: Fine-tune BERT
# fine-tune BERT by adding a classification layer on top. The Trainer class handles the training loop, optimization, and evaluation.

from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

# Wrap both pandas frames as Hugging Face Dataset objects for the Trainer
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, test_data)
)

# Tokenize datasets
def preprocess_function(examples):
    """Tokenize a batch for ``Dataset.map``.

    Reads the text stored under ``'review'`` and passes it to the module-level
    tokenizer, truncating and padding to a fixed length of 128.

    Args:
        examples: Mapping that provides the review text under the key ``'review'``.

    Returns:
        The tokenizer's output for the batch.
    """
    reviews = examples['review']
    return tokenizer(reviews, padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Trainer discards every column that is not a parameter of the model's
# forward(), and the model only computes a loss from a column named `labels`,
# so the target cannot stay under the name `sentiment`. Encode it as integer
# ids 0..K-1 in sorted order: identity for labels already stored as 0/1,
# alphabetical for text labels (e.g. 'negative' -> 0, 'positive' -> 1).
sentiment_classes = sorted(train_dataset.unique('sentiment'))
sentiment_to_id = {name: idx for idx, name in enumerate(sentiment_classes)}
train_dataset = train_dataset.map(
    lambda batch: {'labels': [sentiment_to_id[value] for value in batch['sentiment']]},
    batched=True
)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Initialize model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Define training arguments
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the best F1.
    # NOTE(review): newer transformers releases call this argument `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # Reproducibility
    seed=42
)

# Define compute_metrics function
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(p):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference labels).

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``; F1 is computed with
        ``f1_score``'s default settings.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, predicted_classes),
        'f1': f1_score(p.label_ids, predicted_classes),
    }

# Hold out 10% of the training rows for validation. Scoring the model on the
# rows it was trained on would make the per-epoch metrics, and the
# best-checkpoint choice driven by them, reward memorisation.
bert_splits = train_dataset.train_test_split(test_size=0.1, seed=42)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=bert_splits['train'],
    eval_dataset=bert_splits['test'],
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


In [None]:
# Step 3: Testing and Prediction
# Logistic Regression
# Predict sentiments; stored in a dedicated column so the BERT predictions
# below no longer overwrite the baseline's output
lr_predictions = lr_model.predict(X_test)
test_data['lr_predicted_sentiment'] = lr_predictions
# Fine-tuned BERT
# Predict sentiments
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
# `predicted_sentiment` holds the BERT class ids (argmax over the model outputs)
test_data['predicted_sentiment'] = predicted_labels


In [None]:
#3.2 Save Results
# Keep only the id and predicted-label columns and write them without the index
submission = test_data.loc[:, ['id', 'predicted_sentiment']]
submission.to_csv(path_or_buf='submission.csv', index=False)


In [None]:
# Step 4: Result Analysis
# 4.1 Evaluate Model Performance
# For Logistic Regression
from sklearn.metrics import classification_report

# Evaluate on training data
# NOTE(review): X_train / y_train are the same rows lr_model was fitted on, so
# this report measures fit to the training set, not held-out performance.
train_predictions = lr_model.predict(X_train)
print(classification_report(y_train, train_predictions))
# For Fine-tuned BERT
# Evaluate using Trainer's built-in method
# Called without arguments, so it scores the eval_dataset the Trainer was built with.
eval_results = trainer.evaluate()
print(eval_results)


In [None]:
# 4.2 Sample Analysis
# By analyzing specific cases, we can gain insights into the model's strengths and weaknesses, such as handling of sarcasm or nuanced language.
# Correct predictions
# Compare against the reference label once, then draw at most two rows per
# group. Capping n avoids the ValueError DataFrame.sample raises when a group
# has fewer than two rows; the fixed random_state makes the examples reproducible.
is_correct = test_data['predicted_sentiment'] == test_data['sentiment']

correct_rows = test_data[is_correct]
correct_preds = correct_rows.sample(n=min(2, len(correct_rows)), random_state=42)
print("Correct Predictions:")
print(correct_preds[['review', 'predicted_sentiment']])

# Incorrect predictions
incorrect_rows = test_data[~is_correct]
incorrect_preds = incorrect_rows.sample(n=min(2, len(incorrect_rows)), random_state=42)
print("\nIncorrect Predictions:")
print(incorrect_preds[['review', 'predicted_sentiment']])


Step 5: Feature Format Impact Discussion
TF-IDF Advantages:
Efficiency: Faster computation and lower resource consumption.
Simplicity: Easy to implement and interpret.
BERT Embeddings Advantages:
Contextual Understanding: Captures context and nuances in language.
Accuracy: Generally provides higher classification accuracy.
Trade-offs:
Resource Consumption: BERT requires more computational power and memory.
Processing Time: Longer training and inference times compared to TF-IDF.
Discussion: The choice between TF-IDF and BERT embeddings depends on the specific requirements and resource constraints of your project. While BERT generally offers higher classification accuracy, TF-IDF is a practical choice for scenarios with limited compute, memory, or time.

In [None]:
# Extension for Extra Credit: Emotion and Tone Classification
# Data Expansion and Re-labeling
# Load emotion dataset
# Read from a CSV expected in the working directory; a later cell counts the
# distinct values of its 'emotion' column to choose the number of labels.
emotion_data = pd.read_csv('kaggle_emotion_dataset.csv')  # Ensure this file is available


In [None]:
# Map emotions to our data (this requires careful matching)
# For demonstration, let's assume we can map sentiments to emotions
emotion_mapping = {'positive': 'joy', 'negative': 'anger'}
train_data['emotion'] = train_data['sentiment'].map(emotion_mapping)

# Series.map leaves NaN for every value that is missing from the mapping (for
# example when `sentiment` is stored as 0/1 rather than as text). Stop here
# with the offending values instead of training on NaN emotion labels later.
unmapped_sentiments = train_data.loc[train_data['emotion'].isna(), 'sentiment'].unique()
if len(unmapped_sentiments) > 0:
    raise ValueError(
        f"emotion_mapping has no entry for sentiment value(s): {list(unmapped_sentiments)}"
    )


In [None]:
# Update tokenizer and model for emotion classification
from transformers import BertTokenizer, BertForSequenceClassification

# Recreate the tokenizer for the emotion task (same pretrained checkpoint name)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Size the classification head by the number of distinct emotions in emotion_data
# NOTE(review): the training labels are produced from emotion_mapping, not from
# emotion_data — confirm the two agree on the set of classes.
num_labels = emotion_data['emotion'].nunique()
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels
)

# Reuse the existing TrainingArguments object, raising the epoch count to 4
training_args.num_train_epochs = 4


In [None]:
# Encode labels
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training emotions; `le` is reused later to encode the
# test labels and to decode predictions
le = LabelEncoder()
train_data['emotion_label'] = le.fit_transform(train_data['emotion'])

# Prepare datasets
train_dataset = Dataset.from_pandas(train_data)
train_dataset = train_dataset.map(preprocess_function, batched=True)
# The model only computes a loss from a column named `labels`, and Trainer
# discards columns that are not forward() parameters, so expose the encoded
# emotion under that name before selecting the tensor columns.
train_dataset = train_dataset.rename_column('emotion_label', 'labels')
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [None]:
# Update compute_metrics for multi-class
def compute_metrics(p):
    """Compute accuracy and weighted F1 for a multi-class evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference labels).

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``; F1 is computed with
        ``average='weighted'``.
    """
    reference = p.label_ids
    predicted_classes = np.argmax(p.predictions, axis=1)
    metrics = {}
    metrics['accuracy'] = accuracy_score(reference, predicted_classes)
    metrics['f1'] = f1_score(reference, predicted_classes, average='weighted')
    return metrics

# `training_args` requests an evaluation every epoch and reloading of the best
# checkpoint by F1; both need an eval_dataset, which this Trainer was not
# given. Hold out 10% of the emotion training rows for that purpose.
emotion_splits = train_dataset.train_test_split(test_size=0.1, seed=42)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotion_splits['train'],
    eval_dataset=emotion_splits['test'],
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


In [None]:
# Prepare test dataset
# Encode the reference emotions with the encoder fitted on the training labels
test_data['emotion_label'] = le.transform(test_data['emotion'])  # Ensure test data has emotion labels
test_dataset = Dataset.from_pandas(test_data).map(preprocess_function, batched=True)
test_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'emotion_label']
)

# Predict emotions
# Take the highest-scoring class per row, then decode the ids back to emotion names
predictions = trainer.predict(test_dataset)
predicted_emotions = np.argmax(predictions.predictions, axis=1)
test_data['predicted_emotion'] = le.inverse_transform(predicted_emotions)


In [None]:
# Save emotion predictions
# Keep only the id and decoded emotion columns and write them without the index
submission_emotion = test_data.loc[:, ['id', 'predicted_emotion']]
submission_emotion.to_csv(path_or_buf='submission_emotion.csv', index=False)


In [None]:
from sklearn.metrics import classification_report

# Pass the full list of encoded class ids so the report still lines up with
# `target_names` when some emotion never occurs in the test labels or the
# predictions; without `labels`, classification_report raises on that mismatch.
emotion_class_ids = list(range(len(le.classes_)))
print(classification_report(
    test_data['emotion_label'],
    predicted_emotions,
    labels=emotion_class_ids,
    target_names=le.classes_
))


Extended Report Section
Model Performance: Discuss metrics such as multi-class accuracy and weighted F1-score to evaluate the emotion classification model.

Enrichment of Analysis: Explain how emotion classification provides deeper insights into customer feedback, enabling more targeted strategies.

Resource Demands: Acknowledge that fine-tuning BERT for multiple classes increases computational requirements, but highlight the added value it brings to understanding complex emotional nuances.