In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm
import mlflow
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from flask import Flask, request, jsonify
import os
import time
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

# Let's see if we can use GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data function
def load_data(file_path, sample_size=None):
    df = pd.read_csv(file_path)
    if sample_size:
        return df.sample(n=sample_size, random_state=42)
    return df

# Tokenize function
def tokenize_data(texts, tokenizer, max_length=128):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Train function
def train_model(model, train_dataloader, val_dataloader, epochs=3, lr=2e-5):
    optimizer = AdamW(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        
        for batch in progress_bar:
            optimizer.zero_grad()
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            
            # Update progress bar
            progress_bar.set_postfix({"loss": loss.item()})
        
        avg_train_loss = train_loss / len(train_dataloader)
        val_accuracy = evaluate_model(model, val_dataloader)
        
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Avg training loss: {avg_train_loss*100:.2f}%")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        
        # Log metrics with MLflow
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)
    
    return model

# Evaluate function
def evaluate_model(model, dataloader):
    model.eval()
    predictions = []
    actual_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    
    return accuracy_score(actual_labels, predictions)

# Monitor performance function
def monitor_performance(current_accuracy, threshold=0.85):
    if current_accuracy < threshold:
        send_email_alert(f"Model performance has degraded. Current accuracy: {current_accuracy * 100:.2f}%")
        return True
    return False

# Email alert function
def send_email_alert(message):
    # Note to self: Remember to use environment variables for these!
    sender_email = "rupeshs2103@gmail.com"
    receiver_email = "rupesh2103033@gmail.com"
    password = "bqviuuaefhfmrycc"
    
    msg = MIMEMultipart()
    msg['From'] = sender_email
    msg['To'] = receiver_email
    msg['Subject'] = "Model Performance Alert"
    
    msg.attach(MIMEText(message, 'plain'))
    
    try:
        with smtplib.SMTP('smtp.gmail.com', 587) as server:
            server.starttls()
            server.login(sender_email, password)
            server.send_message(msg)
        print("Email alert sent!")
    except Exception as e:
        print(f"Oops! Couldn't send email: {e}")

# Retrain function
def retrain_model(model, train_dataloader, val_dataloader, epochs=1):
    print("Time to retrain this bad boy...")
    mlflow.end_run()
    with mlflow.start_run():
        model = train_model(model, train_dataloader, val_dataloader, epochs=epochs)
        retrained_accuracy = evaluate_model(model, val_dataloader)
        print("Retrained model accuracy:", retrained_accuracy*100,"%")
        mlflow.log_metric("retrained_accuracy", retrained_accuracy)
        mlflow.pytorch.log_model(model, "retrained_model")
        torch.save(model.state_dict(), "text_classification_model_retrained.pth")
    return retrained_accuracy

# Batch inference function
def batch_inference(input_file, output_file, batch_size=32):
    df = pd.read_csv(input_file)
    texts = df['review'].tolist()
    predictions = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=1)
        batch_predictions = torch.argmax(probabilities, dim=1).tolist()
        predictions.extend(batch_predictions)
    
    df['prediction'] = ['positive' if p == 1 else 'negative' for p in predictions]
    df.to_csv(output_file, index=False)
    print(f"Batch predictions saved to {output_file}")

# Main execution
if __name__ == "__main__":
    # Start MLflow run
    with mlflow.start_run():
        # Load and preprocess data
        print("Loading data...")
        data = load_data("C:\\Users\\rupes\\Desktop\\Problem_1_Rupesh\\Data\\IMDB Dataset.csv", sample_size=10000)
        texts = data['review'].tolist()
        labels = (data['sentiment'] == 'positive').astype(int).tolist()
        
        # Split the data
        train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
        
        # Initialize tokenizer and model
        print("Initializing model...")
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)
        
        # Tokenize and encode the data
        train_encodings = tokenize_data(train_texts, tokenizer)
        val_encodings = tokenize_data(val_texts, tokenizer)
        
        # Create DataLoaders
        train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels))
        val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels))
        train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=16)
        
        # Train the model
        print("Training model...")
        model = train_model(model, train_dataloader, val_dataloader, epochs=1)
        
        # Evaluate the model
        final_accuracy = evaluate_model(model, val_dataloader)
        print("Final model accuracy:", final_accuracy*100,"%")
        
        # Log final metrics and model
        mlflow.log_metric("final_accuracy", final_accuracy)
        mlflow.pytorch.log_model(model, "model")
        
        # Save the model
        torch.save(model.state_dict(), "text_classification_model.pth")
        
        # Simulate performance degradation
        time.sleep(5)
        print("Simulating performance degradation...")
        degraded_accuracy = 0.80
        if monitor_performance(degraded_accuracy):
            print("Performance drop detected! Retraining...")
            new_accuracy = retrain_model(model, train_dataloader, val_dataloader)
            print("Model retrained. New accuracy:", new_accuracy*100,"%")

Using device: cpu
Loading data...
Initializing model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training model...


Epoch 1/1: 100%|██████████████████████████████████████████████████████████| 500/500 [32:49<00:00,  3.94s/it, loss=0.24]


Epoch 1/1
Avg training loss: 40.10%
Validation accuracy: 86.70%
Final model accuracy: 86.7 %




Simulating performance degradation...
Email alert sent!
Performance drop detected! Retraining...
Time to retrain this bad boy...


Epoch 1/1: 100%|██████████████████████████████████████████████████████████| 500/500 [33:01<00:00,  3.96s/it, loss=0.47]


Epoch 1/1
Avg training loss: 23.62%
Validation accuracy: 87.30%
Retrained model accuracy: 87.3 %




Model retrained. New accuracy: 87.3 %


In [5]:
from flask import Flask, request, jsonify, render_template_string
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

app = Flask(__name__)

# Initialize model and tokenizer
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

#HTML template
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sentiment Analysis</title>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            display: flex;
            justify-content: center;
            align-items: center;
            height: 100vh;
            margin: 0;
            background: linear-gradient(135deg, #ece9e6, #ffffff);
        }
        .container {
            text-align: center;
            padding: 30px;
            background-color: #ffffff;
            border-radius: 10px;
            box-shadow: 0 10px 20px rgba(0, 0, 0, 0.15);
            max-width: 550px;
            width: 95%;
            transition: transform 0.3s;
        }
        .container:hover {
            transform: translateY(-5px);
        }
        h1 {
            margin-bottom: 20px;
            color: #333;
        }
        textarea {
            width: 100%;
            height: 150px;
            padding: 12px;
            border: 1px solid #ccc;
            border-radius: 5px;
            font-size: 14px;
            resize: none;
            margin-bottom: 15px;
            transition: box-shadow 0.3s;
        }
        textarea:focus {
            box-shadow: 0 0 10px rgba(0, 123, 255, 0.3);
            border-color: #007bff;
        }
        button {
            padding: 12px 25px;
            border: none;
            border-radius: 5px;
            background-color: #007bff;
            color: #ffffff;
            font-size: 16px;
            cursor: pointer;
            transition: background-color 0.3s, box-shadow 0.3s;
        }
        button:hover {
            background-color: #0056b3;
            box-shadow: 0 4px 10px rgba(0, 91, 187, 0.3);
        }
        #result {
            margin-top: 15px;
            font-size: 16px;
            color: #555;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>Sentiment Analysis</h1>
        <form id="sentiment-form">
            <textarea id="text-input" placeholder="Enter your text here..."></textarea>
            <br>
            <button type="submit">Analyze Sentiment</button>
        </form>
        <p id="result"></p>
    </div>

    <script>
        document.getElementById('sentiment-form').addEventListener('submit', function(e) {
            e.preventDefault();
            var text = document.getElementById('text-input').value.trim();
            if (!text) {
                document.getElementById('result').textContent = 'Please enter some text.';
                return;
            }
            fetch('/predict', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({ text: text }),
            })
            .then(response => response.json())
            .then(data => {
                document.getElementById('result').textContent = 'Prediction: ' + data.prediction;
            })
            .catch(error => {
                document.getElementById('result').textContent = 'Error analyzing sentiment.';
                console.error('Error:', error);
            });
        });
    </script>
</body>
</html>
"""


# Route to serve the main page
@app.route('/')
def home():
    return render_template_string(HTML_TEMPLATE)

# Prediction route
@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    text = data.get('text', '')

    # Tokenize and predict
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()

    # Map prediction to sentiment
    sentiment = 'Positive' if prediction == 1 else 'Negative'
    return jsonify({'prediction': sentiment})

if __name__ == '__main__':
    print("Starting Flask app for live inference...")
    app.run(debug=True, use_reloader=False)


Starting Flask app for live inference...
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
