In [37]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [38]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [39]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [41]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [42]:
df.drop_duplicates()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [43]:
# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    text = ' '.join(tokens)
    return text

df['processed_text'] = df['review'].apply(preprocess_text)
print(df[['review', 'processed_text']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      processed_text  
0  one reviewer mentioned watching 1 oz episode y...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  


In [45]:
# Encode labels
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])
df.head()

Unnamed: 0,review,sentiment,processed_text,sentiment_encoded
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode y...,1
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1


In [48]:
# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(
    df['processed_text'], df['sentiment_encoded'], test_size=0.3, random_state=42, stratify=df['sentiment']
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)
print(f"\nTraining set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# Save splits
train_df = pd.DataFrame({'processed_text': X_train, 'sentiment_encoded': y_train})
val_df = pd.DataFrame({'processed_text': X_val, 'sentiment_encoded': y_val})
test_df = pd.DataFrame({'processed_text': X_test, 'sentiment_encoded': y_test})

train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

print("\nData splits saved as 'train_data.csv', 'val_data.csv', and 'test_data.csv'")


Training set size: 35000
Validation set size: 7500
Test set size: 7500

Data splits saved as 'train_data.csv', 'val_data.csv', and 'test_data.csv'


In [49]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Set random seed for reproducibility
torch.manual_seed(42)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Load data splits
train_df = pd.read_csv('train_data.csv')
val_df = pd.read_csv('val_data.csv')
test_df = pd.read_csv('test_data.csv')

# Tokenize data
def tokenize_data(texts, labels, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.values, dtype=torch.long)
    
    return input_ids, attention_masks, labels

# Tokenize datasets
train_inputs, train_masks, train_labels = tokenize_data(train_df['processed_text'], train_df['sentiment_encoded'], tokenizer)
val_inputs, val_masks, val_labels = tokenize_data(val_df['processed_text'], val_df['sentiment_encoded'], tokenizer)
test_inputs, test_masks, test_labels = tokenize_data(test_df['processed_text'], test_df['sentiment_encoded'], tokenizer)

# Create DataLoaders
batch_size = 16  # Default, will be tuned in training
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Save model and tokenizer
model.save_pretrained('sentiment_model')
tokenizer.save_pretrained('sentiment_model')

print("Model and tokenizer saved in 'sentiment_model' directory")

Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer saved in 'sentiment_model' directory


In [54]:
import pandas as pd
import torch
import numpy as np
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import matplotlib.pyplot as plt

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('sentiment_model')

# Load data splits
train_df = pd.read_csv('train_data.csv')
val_df = pd.read_csv('val_data.csv')

# Tokenize data
def tokenize_data(texts, labels, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.values, dtype=torch.long)
    
    return input_ids, attention_masks, labels

train_inputs, train_masks, train_labels = tokenize_data(train_df['processed_text'], train_df['sentiment_encoded'], tokenizer)
val_inputs, val_masks, val_labels = tokenize_data(val_df['processed_text'], val_df['sentiment_encoded'], tokenizer)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Hyperparameter grid
learning_rates = [2e-5, 3e-5, 5e-5]
batch_sizes = [16, 32]
best_val_accuracy = 0
best_hyperparams = {}
best_model_path = 'best_model.pt'

# Training function
def train_model(model, train_dataloader, val_dataloader, lr, epochs=5):
    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    
    early_stopping_patience = 2
    best_val_loss = float('inf')
    early_stopping_counter = 0
    
    train_loss_values = []
    val_loss_values = []
    train_accuracy_values = []
    val_accuracy_values = []
    
    for epoch in range(epochs):
        print(f'\nEpoch {epoch + 1}/{epochs}')
        model.train()
        total_train_loss = 0
        train_predictions, train_true_labels = [], []
        
        for step, batch in enumerate(train_dataloader):
            b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            logits = outputs.logits
            train_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            train_true_labels.extend(b_labels.cpu().numpy())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            if step % 100 == 0 and step != 0:
                print(f"Step {step}/{len(train_dataloader)} | Loss: {loss.item():.4f}")
        
        avg_train_loss = total_train_loss / len(train_dataloader)
        train_loss_values.append(avg_train_loss)
        avg_train_accuracy = sum(pred == true for pred, true in zip(train_predictions, train_true_labels)) / len(train_true_labels)
        train_accuracy_values.append(avg_train_accuracy)
        
        # Validation
        model.eval()
        total_val_loss = 0
        predictions, true_labels = [], []
        
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_val_loss += loss.item()
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(b_labels.cpu().numpy())
        
        avg_val_loss = total_val_loss / len(val_dataloader)
        val_loss_values.append(avg_val_loss)
        val_accuracy = sum(pred == true for pred, true in zip(predictions, true_labels)) / len(true_labels)
        val_accuracy_values.append(val_accuracy)
        
        print(f"Average Training Loss: {avg_train_loss:.4f}")
        print(f"Average Training Accuracy: {avg_train_accuracy:.4f}")
        print(f"Average Validation Loss: {avg_val_loss:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        
        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= early_stopping_patience:
                print("Early stopping triggered.")
                break
    
    return val_accuracy, train_loss_values, val_loss_values, train_accuracy_values, val_accuracy_values

# Hyperparameter tuning loop
for lr in learning_rates:
    for batch_size in batch_sizes:
        print(f"\nTraining with lr={lr}, batch_size={batch_size}")
        
        # Create DataLoaders
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
        val_sampler = SequentialSampler(val_data)
        val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
        
        # Load fresh model
        model = DistilBertForSequenceClassification.from_pretrained('sentiment_model')
        model.to(device)
        
        # Train
        val_accuracy, train_loss_values, val_loss_values, train_accuracy_values, val_accuracy_values = train_model(
            model, train_dataloader, val_dataloader, lr
        )
        
        # Save best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_hyperparams = {'lr': lr, 'batch_size': batch_size}
            torch.save(model.state_dict(), best_model_path)
            
            # Visualize training progress
            plt.figure(figsize=(12, 5))
            plt.subplot(1, 2, 1)
            plt.plot(train_loss_values, label='Training Loss')
            plt.plot(val_loss_values, label='Validation Loss')
            plt.title('Training and Validation Loss')
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.legend()
            
            plt.subplot(1, 2, 2)
            plt.plot(train_accuracy_values, label='Training Accuracy')
            plt.plot(val_accuracy_values, label='Validation Accuracy')
            plt.title('Training and Validation Accuracy')
            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.legend()
            plt.savefig(f'loss_accuracy_lr{lr}_bs{batch_size}.png')
            plt.close()

print(f"\nBest Hyperparameters: {best_hyperparams}")
print(f"Best Validation Accuracy: {best_val_accuracy:.4f}")
print("Training complete. Best model saved as 'best_model.pt'. Loss and accuracy curves saved.")

Using device: cuda

Training with lr=2e-05, batch_size=16

Epoch 1/5
Step 100/2188 | Loss: 0.4279
Step 200/2188 | Loss: 0.4273
Step 300/2188 | Loss: 0.3307
Step 400/2188 | Loss: 0.3803
Step 500/2188 | Loss: 0.1097
Step 600/2188 | Loss: 0.3935
Step 700/2188 | Loss: 0.0867
Step 800/2188 | Loss: 0.2272
Step 900/2188 | Loss: 0.1457
Step 1000/2188 | Loss: 0.4021
Step 1100/2188 | Loss: 0.6475
Step 1200/2188 | Loss: 0.3837
Step 1300/2188 | Loss: 0.2242
Step 1400/2188 | Loss: 0.3552
Step 1500/2188 | Loss: 0.0415
Step 1600/2188 | Loss: 0.3451
Step 1700/2188 | Loss: 0.1750
Step 1800/2188 | Loss: 0.1120
Step 1900/2188 | Loss: 0.4471
Step 2000/2188 | Loss: 0.2078
Step 2100/2188 | Loss: 0.3012
Average Training Loss: 0.3504
Average Training Accuracy: 0.8463
Average Validation Loss: 0.2754
Validation Accuracy: 0.8839

Epoch 2/5
Step 100/2188 | Loss: 0.0130
Step 200/2188 | Loss: 0.1277
Step 300/2188 | Loss: 0.4172
Step 400/2188 | Loss: 0.0777
Step 500/2188 | Loss: 0.4621
Step 600/2188 | Loss: 0.3629
S

In [56]:
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

import pickle
import time

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained('sentiment_model')
model.load_state_dict(torch.load('best_model.pt'))
model.to(device)
model.eval()
tokenizer = DistilBertTokenizer.from_pretrained('sentiment_model')

# Load label encoder
with open('label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)

# Load test data
test_df = pd.read_csv('test_data.csv')

# Tokenize test data
def tokenize_data(texts, labels, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.values, dtype=torch.long)
    
    return input_ids, attention_masks, labels

test_inputs, test_masks, test_labels = tokenize_data(test_df['processed_text'], test_df['sentiment_encoded'], tokenizer)
test_data = TensorDataset(test_inputs, test_masks, test_labels)
batch_size = 16
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Evaluate on test set
test_predictions, test_true_labels = [], []
inference_times = []

for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
    start_time = time.time()
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    inference_times.append(time.time() - start_time)
    logits = outputs.logits
    test_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
    test_true_labels.extend(b_labels.cpu().numpy())

# Calculate metrics
test_accuracy = accuracy_score(test_true_labels, test_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_true_labels, test_predictions, average='weighted')

avg_inference_time = np.mean(inference_times)

print("\nTest Set Evaluation:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Average Inference Time: {avg_inference_time:.4f} seconds")

# Confusion Matrix
cm = confusion_matrix(test_true_labels, test_predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix.png')
plt.close()

# Save evaluation metrics
metrics = {
    'accuracy': test_accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'avg_inference_time': avg_inference_time
}
with open('evaluation_metrics.pkl', 'wb') as f:
    pickle.dump(metrics, f)

print("Evaluation complete. Confusion matrix saved as 'confusion_matrix.png'. Metrics saved as 'evaluation_metrics.pkl'")

Using device: cuda


  model.load_state_dict(torch.load('best_model.pt'))



Test Set Evaluation:
Accuracy: 0.8983
Precision: 0.8983
Recall: 0.8983
F1-Score: 0.8983
Average Inference Time: 0.0042 seconds
Evaluation complete. Confusion matrix saved as 'confusion_matrix.png'. Metrics saved as 'evaluation_metrics.pkl'


In [57]:
def predict_sentiment(text, model, tokenizer):
    # Preprocess the text
    processed_text = preprocess_text(text)
    
    # Tokenize
    encoded_dict = tokenizer.encode_plus(
        processed_text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    
    # Move to device
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)
    
    # Set model to evaluation mode
    model.eval()
    
    # Get prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    
    # Get prediction class
    prediction = torch.argmax(logits, dim=1).item()
    
    # Map prediction to sentiment using label encoder
    sentiment = label_encoder.inverse_transform([prediction])[0]
    
    return sentiment

# Example usage
print("\nSample Predictions:")
sample_texts = [
    "I absolutely love this product! It's amazing!",
    "This is the worst experience I've ever had.",
    "The movie was fantastic, with great acting and a compelling story.",
    "I was really disappointed with the service at this restaurant."
]

model = DistilBertForSequenceClassification.from_pretrained('sentiment_model')
model.load_state_dict(torch.load('best_model.pt'))
model.to(device)
tokenizer = DistilBertTokenizer.from_pretrained('sentiment_model')
for text in sample_texts:
    sentiment = predict_sentiment(text, model, tokenizer)
    print(f"Text: {text}")
    print(f"Sentiment: {sentiment}\n")


Sample Predictions:


  model.load_state_dict(torch.load('best_model.pt'))


Text: I absolutely love this product! It's amazing!
Sentiment: positive

Text: This is the worst experience I've ever had.
Sentiment: negative

Text: The movie was fantastic, with great acting and a compelling story.
Sentiment: positive

Text: I was really disappointed with the service at this restaurant.
Sentiment: negative

