# Customer Service Sentiment Analysis Project
This notebook implements sentiment analysis on customer service conversations following the assignment requirements:
1. WANDB Setup
2. Data Loading and EDA
3. Data Preprocessing
4. Model Architecture
5. Training and Evaluation

In [3]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer

# Single source of truth for the random seed, so every stochastic step in the
# notebook can reuse the same value instead of repeating a magic number.
SEED = 42

# Seed PyTorch's default generator and NumPy's legacy global RNG
torch.manual_seed(SEED)
np.random.seed(SEED)

## 1. WANDB Setup and Data Loading
First, we'll set up WANDB for experiment tracking, then prepare and load our dataset.

In [4]:
# Authenticate with Weights & Biases.
# SECURITY: never hardcode an API key in a notebook. wandb.login() uses the
# WANDB_API_KEY environment variable or a cached `wandb login` credential, and
# otherwise prompts for the key interactively. The key that was previously
# committed in this cell must be treated as leaked: revoke it in your W&B user
# settings and generate a new one.
wandb.login()

# Start a tracked run; `config` records the hyperparameters for this run
wandb.init(
    project="customer-service-sentiment",
    config={
        "architecture": "nanoGPT",
        "dataset": "customer_service",
        "learning_rate": 2e-5,
        "epochs": 3,
        "batch_size": 8,
        "max_length": 512
    }
)

print("WANDB initialized successfully!")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmmertalper[0m ([33mmmertalper_[0m). Use [1m`wandb login --relogin`[0m to force relogin


WANDB initialized successfully!


In [6]:
# Run the data-preparation script as a shell command. Its printed summary
# (vocabulary size, train/validation/test sample counts, and per-split
# sentiment counts) is shown in this cell's output.
# NOTE(review): presumably this script also writes the files under
# data/customer_service/ that later cells read — confirm against the script.
!python prepare_customer_service.py

Vocabulary size: 50,261
Train samples: 873
Validation samples: 97
Test samples: 30

Sentiment distribution:
Train: {'neutral': 488, 'negative': 370, 'positive': 15}
Val: {'neutral': 54, 'negative': 41, 'positive': 2}
Test: {'negative': 10, 'neutral': 10, 'positive': 10}


## 2. Exploratory Data Analysis (EDA)
Now that we've prepared our data, let's analyze:
1. Distribution of sentiment classes
2. Conversation characteristics
3. Relationships between features

In [None]:
# Read the train and test CSV splits into DataFrames for exploration
train_df = pd.read_csv('data/customer_service/train.csv')
test_df = pd.read_csv('data/customer_service/test.csv')

# Report the (rows, columns) shape of each split in a single print call
print(
    "Dataset shapes:",
    f"Training set: {train_df.shape}",
    f"Test set: {test_df.shape}\n",
    sep="\n",
)

# List the available columns, then preview the first rows with rich display
column_names = train_df.columns.to_list()
print("Training set columns:", column_names)
print("\nSample of training data:")
display(train_df.head(5))

In [None]:
# Count conversations per sentiment class in the training split
sentiment_counts = train_df['customer_sentiment'].value_counts()

# Draw the class counts as a bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, ax=ax)
ax.set_title('Distribution of Customer Sentiment in Training Set')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)

# Log the figure to wandb, then render it in the notebook
wandb.log({"sentiment_distribution": wandb.Image(fig)})
plt.show()

# Show the same distribution as percentages, rounded to two decimals
print("\nSentiment Distribution:")
print((train_df['customer_sentiment'].value_counts(normalize=True) * 100).round(2))