# Backdoor AI Model Trainer

This notebook trains Backdoor AI models using Google Colab's resources. It connects to Dropbox to access training data and upload trained models.

## Setup and Configuration

In [None]:
# Install the notebook's runtime dependencies. These five packages are
# unpinned, so pip resolves whatever versions are current when the cell runs.
!pip install dropbox pandas numpy nltk flask
# Pin scikit-learn to 1.5.1.
# NOTE(review): the CoreML conversion code later in this notebook patches out
# coremltools' scikit-learn version check, which suggests this pin may not be
# a version that coremltools 6.3 officially supports -- confirm.
!pip install scikit-learn==1.5.1
# joblib is used to serialize the trained (vectorizer, model) pair.
!pip install joblib
# Remove any preinstalled coremltools, then install 6.3 from source
# (--no-binary forces a source build instead of a prebuilt wheel).
!pip uninstall -y coremltools
!pip install coremltools==6.3 --no-binary coremltools

In [None]:
# Import necessary libraries
import os
import io
import json
import time
import sqlite3
import pandas as pd
import numpy as np
import tempfile
import logging
from datetime import datetime, timedelta

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('backdoor_ai_trainer')

## Dropbox Configuration - Edit these settings if needed

In [None]:
# Dropbox credentials.
# Each value is read from an environment variable of the same name when one is
# set; the literals below are only fallbacks so existing runs keep working.
# SECURITY NOTE(review): these fallback secrets are committed in plain text.
# Anyone with read access to this notebook can act as the app -- rotate them
# and supply the new values through the environment instead.
DROPBOX_APP_KEY = os.environ.get("DROPBOX_APP_KEY", "2bi422xpd3xd962")  # Default from your config
DROPBOX_APP_SECRET = os.environ.get("DROPBOX_APP_SECRET", "j3yx0b41qdvfu86")  # Default from your config
DROPBOX_REFRESH_TOKEN = os.environ.get(
    "DROPBOX_REFRESH_TOKEN",
    "RvyL03RE5qAAAAAAAAAAAVMVebvE7jDx8Okd0ploMzr85c6txvCRXpJAt30mxrKF",  # Default from your config
)

# Configuration for folders and files
# Name of the SQLite database file; it is downloaded from the Dropbox root.
DROPBOX_DB_FILENAME = "backdoor_ai_db.db"
# Root-level Dropbox folders created at startup. The models folder also gets
# "trained" and "uploaded" subfolders.
DROPBOX_MODELS_FOLDER = "backdoor_models"
DROPBOX_BASE_MODEL_FOLDER = "base_model"
# Folder scanned for "*training_needed.json" trigger files.
DROPBOX_TRIGGERS_FOLDER = "training_triggers"

# Training configuration
# Prefix placed in front of the Unix timestamp to form a model version string.
MODEL_VERSION_PREFIX = "1.0."
# Passed to TfidfVectorizer as max_features / ngram_range.
MAX_FEATURES = 5000
NGRAM_RANGE = (1, 2)
# Soft-voting weights used when building the ensemble: one weight for the base
# model, one for each uploaded (user) model.
BASE_MODEL_WEIGHT = 2.0
USER_MODEL_WEIGHT = 1.0
# NOTE(review): the next two values are not referenced in the code visible
# here; presumably a minimum row count before training and a retention limit
# for stored models -- confirm against their users.
MIN_TRAINING_DATA = 50
MAX_MODELS_TO_KEEP = 5

# Retraining thresholds
# Consulted by should_retrain():
#   pending_models            - this many pending uploads trigger retraining
#   hours_since_last_training - age of the last training run, in hours
#   new_interactions          - interactions recorded since the last training
RETRAINING_THRESHOLDS = {
    'pending_models': 3,
    'hours_since_last_training': 12,
    'new_interactions': 100
}

## Connect to Dropbox

In [None]:
# Initialize Dropbox connection
def init_dropbox():
    """Exchange the configured refresh token for an access token and connect.

    Posts the refresh token, app key and app secret to Dropbox's OAuth2 token
    endpoint, builds a ``dropbox.Dropbox`` client from the returned access
    token, and verifies the connection by fetching the current account.

    Returns:
        A connected ``dropbox.Dropbox`` client, or ``None`` when the token
        request fails (network error, non-200 status, or a response without
        an ``access_token``).

    Raises:
        Whatever ``users_get_current_account`` raises if the freshly issued
        token is rejected; that error is not caught here.
    """
    import dropbox
    import requests

    # Get access token using refresh token
    token_url = "https://api.dropboxapi.com/oauth2/token"
    payload = {
        "grant_type": "refresh_token",
        "refresh_token": DROPBOX_REFRESH_TOKEN,
        "client_id": DROPBOX_APP_KEY,
        "client_secret": DROPBOX_APP_SECRET
    }

    try:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.post(token_url, data=payload, timeout=30)
    except requests.RequestException as e:
        print(f"Token refresh failed: {e}")
        return None

    if response.status_code != 200:
        print(f"Token refresh failed: {response.status_code} - {response.text}")
        return None

    try:
        access_token = response.json()["access_token"]
    except (ValueError, KeyError) as e:
        # ValueError: body was not JSON; KeyError: JSON without a token.
        print(f"Token refresh failed: unexpected response - {e}")
        return None
    print("Successfully obtained access token")

    # Initialize Dropbox client
    dbx = dropbox.Dropbox(
        oauth2_access_token=access_token,
        app_key=DROPBOX_APP_KEY,
        app_secret=DROPBOX_APP_SECRET
    )

    # Test connection
    account = dbx.users_get_current_account()
    print(f"Connected to Dropbox as {account.name.display_name}")
    return dbx

# Connect to Dropbox. Every later cell uses the module-level `dbx` client, so
# stop here with a clear error if the connection could not be established.
dbx = init_dropbox()
if dbx is None:
    # RuntimeError is still an Exception subclass, so broad handlers keep working.
    raise RuntimeError("Failed to connect to Dropbox. Check your credentials.")

## Dropbox File Operations

In [None]:
# Dropbox utility functions
from dropbox.files import WriteMode
from dropbox.exceptions import ApiError

def ensure_folder_exists(path):
    """Make sure `path` exists in Dropbox, creating the folder when missing.

    Returns True when the metadata lookup succeeds or the folder was created,
    and False when the lookup fails for any reason other than "not found".
    """
    try:
        dbx.files_get_metadata(path)
    except ApiError as e:
        not_found = e.error.is_path() and e.error.get_path().is_not_found()
        if not not_found:
            print(f"Error checking folder: {e}")
            return False
        # Lookup said the path does not exist yet, so create it.
        dbx.files_create_folder_v2(path)
        print(f"Created folder: {path}")
    return True

def download_file(path, local_path=None):
    """Download a Dropbox file to disk or into memory.

    With `local_path` the file is written there and the result carries
    'path'; otherwise the content is returned as a BytesIO under 'buffer'
    together with its 'size'. Any failure is printed and reported as
    {'success': False, 'error': <message>} instead of being raised.
    """
    try:
        if not local_path:
            _metadata, response = dbx.files_download(path)
            payload = response.content
            return {
                'success': True,
                'buffer': io.BytesIO(payload),
                'size': len(payload),
            }
        dbx.files_download_to_file(local_path, path)
        return {'success': True, 'path': local_path}
    except Exception as e:
        print(f"Error downloading file {path}: {e}")
        return {'success': False, 'error': str(e)}

def upload_file(data_or_path, path):
    """Upload to Dropbox at `path`, overwriting any existing file.

    `data_or_path` may be the path of an existing local file, a file-like
    object (rewound first when it supports seek), or the raw content itself.
    Failures are printed and reported as {'success': False, 'error': ...}.
    """
    try:
        is_local_file = isinstance(data_or_path, str) and os.path.exists(data_or_path)
        if is_local_file:
            with open(data_or_path, 'rb') as f:
                payload = f.read()
        elif hasattr(data_or_path, 'read'):
            if hasattr(data_or_path, 'seek'):
                data_or_path.seek(0)
            payload = data_or_path.read()
        else:
            # Anything else is handed to the SDK unchanged.
            payload = data_or_path
        dbx.files_upload(payload, path, mode=WriteMode.overwrite)
        return {'success': True, 'path': path}
    except Exception as e:
        print(f"Error uploading to {path}: {e}")
        return {'success': False, 'error': str(e)}

def list_folder(path):
    """List every entry of a Dropbox folder, following pagination cursors.

    Returns {'success': True, 'items': [...]} or, after printing the error,
    {'success': False, 'error': <message>}.
    """
    try:
        page = dbx.files_list_folder(path)
        entries = page.entries
        # Keep fetching pages until Dropbox reports there are no more.
        while page.has_more:
            page = dbx.files_list_folder_continue(page.cursor)
            entries.extend(page.entries)
    except Exception as e:
        print(f"Error listing folder {path}: {e}")
        return {'success': False, 'error': str(e)}
    else:
        return {'success': True, 'items': entries}

# Create required folders. The three root folders come first so that the
# models folder exists before its "trained" and "uploaded" subfolders.
for folder in (
    f"/{DROPBOX_MODELS_FOLDER}",
    f"/{DROPBOX_BASE_MODEL_FOLDER}",
    f"/{DROPBOX_TRIGGERS_FOLDER}",
    *(f"/{DROPBOX_MODELS_FOLDER}/{sub}" for sub in ("trained", "uploaded")),
):
    ensure_folder_exists(folder)

## Download and Prepare Database

In [None]:
# Get the database from Dropbox
def setup_database():
    """Download the SQLite database from Dropbox and load it into memory.

    The file is saved in the system temp directory, then copied into an
    in-memory SQLite database with the backup API so later queries do not
    touch the disk.

    Returns:
        An open ``sqlite3.Connection`` to the in-memory copy, or ``None`` when
        the download fails or the downloaded file cannot be read by SQLite.
    """
    db_path = f"/{DROPBOX_DB_FILENAME}"
    temp_db_path = os.path.join(tempfile.gettempdir(), DROPBOX_DB_FILENAME)

    # Download database file
    result = download_file(db_path, temp_db_path)
    if not result['success']:
        print(f"Failed to download database: {result.get('error')}")
        return None

    # Create in-memory database for faster operations
    conn_memory = sqlite3.connect(':memory:')
    try:
        conn_disk = sqlite3.connect(temp_db_path)
        try:
            conn_disk.backup(conn_memory)
        finally:
            # Always release the on-disk handle, even if the copy fails.
            conn_disk.close()
    except sqlite3.Error as e:
        # e.g. the download is not a valid SQLite file; report it the same
        # way as a failed download so the caller's `if db_conn:` check works.
        conn_memory.close()
        print(f"Failed to load database from {temp_db_path}: {e}")
        return None

    print(f"Database loaded into memory from {temp_db_path}")
    return conn_memory

# Connect to database. setup_database() returns None when the download fails,
# so db_conn must be checked before it is used for queries.
db_conn = setup_database()

## Check Training Criteria

In [None]:
# Determine if training is needed
def should_retrain(conn):
    """Decide whether a new training run is due.

    Retraining is triggered when any of these holds:
      * the number of pending uploaded models reaches the 'pending_models'
        threshold;
      * at least one model is pending and the last training is older than
        'hours_since_last_training' hours;
      * at least one model is pending and 'new_interactions' or more
        interactions were created since the last training.

    Args:
        conn: open SQLite connection providing the uploaded_models,
            model_versions and interactions tables.

    Returns:
        True when retraining should run, otherwise False.
    """
    cursor = conn.cursor()

    # Check pending models
    cursor.execute("SELECT COUNT(*) FROM uploaded_models WHERE incorporation_status = 'pending'")
    pending_models_count = cursor.fetchone()[0]

    if pending_models_count >= RETRAINING_THRESHOLDS['pending_models']:
        print(f"Retraining triggered: {pending_models_count} pending uploaded models")
        return True

    # Both remaining triggers also require at least one pending model, so
    # there is nothing left to check (or query) when none are pending.
    if pending_models_count <= 0:
        return False

    # Check time since last training
    cursor.execute("SELECT MAX(training_date) FROM model_versions")
    last_training = cursor.fetchone()[0]
    if not last_training:
        return False

    last_training_date = datetime.fromisoformat(last_training)
    # Compare like with like: a stored timestamp that carries a UTC offset
    # cannot be subtracted from a naive now(). tzinfo is None for naive
    # timestamps, which keeps the original local-time behavior.
    time_since_training = datetime.now(last_training_date.tzinfo) - last_training_date

    if time_since_training > timedelta(hours=RETRAINING_THRESHOLDS['hours_since_last_training']):
        print("Retraining triggered by time threshold")
        return True

    # Check new interactions
    cursor.execute("SELECT COUNT(*) FROM interactions WHERE created_at > ?", (last_training,))
    new_interactions = cursor.fetchone()[0]

    if new_interactions >= RETRAINING_THRESHOLDS['new_interactions']:
        print("Retraining triggered by new interactions")
        return True

    return False

# Get pending models
def get_pending_uploaded_models(conn):
    """Return uploaded models awaiting incorporation, oldest upload first.

    Rows with status 'pending' or 'processing' are returned as dicts keyed by
    column name. For rows whose file_path starts with 'dropbox:', the file is
    downloaded and attached under 'model_buffer'; rows whose download fails
    are still returned, just without that key.

    Args:
        conn: open SQLite connection providing the uploaded_models table.

    Returns:
        A list of dicts, one per matching row.
    """
    cursor = conn.cursor()
    cursor.execute("""
        SELECT id, device_id, file_path, upload_date, app_version, description, file_size, original_filename 
        FROM uploaded_models
        WHERE incorporation_status IN ('pending', 'processing')
        ORDER BY upload_date ASC
    """)

    columns = [col[0] for col in cursor.description]
    models = [dict(zip(columns, row)) for row in cursor.fetchall()]

    # Download model files from Dropbox
    for model in models:
        file_path = model['file_path']
        # file_path can be NULL in the database; such rows have nothing to fetch.
        if not isinstance(file_path, str) or not file_path.startswith('dropbox:'):
            continue

        path = file_path.split(':', 1)[1]
        try:
            result = download_file(path)
            if result['success']:
                model['model_buffer'] = result['buffer']
            else:
                # Say which model is affected instead of failing silently here.
                print(f"Could not download model {model['id']}: {result.get('error')}")
        except Exception as e:
            print(f"Error loading model {model['id']}: {e}")

    return models

# Check for training triggers
def check_for_triggers():
    """Find the first readable training trigger file in the triggers folder.

    A trigger is any entry whose name ends with 'training_needed.json'.
    Returns {'path': ..., 'data': <parsed JSON>} for the first one that
    downloads and parses, or None when the folder cannot be listed or no
    usable trigger is found.
    """
    triggers_path = f"/{DROPBOX_TRIGGERS_FOLDER}"
    listing = list_folder(triggers_path)
    if not listing['success']:
        return None

    for entry in listing['items']:
        is_trigger = hasattr(entry, 'name') and entry.name.endswith('training_needed.json')
        if not is_trigger:
            continue

        trigger_path = entry.path_display
        downloaded = download_file(trigger_path)
        if not downloaded['success']:
            continue

        raw = downloaded['buffer']
        raw.seek(0)
        try:
            trigger_data = json.loads(raw.read().decode('utf-8'))
            print(f"Found trigger file: {trigger_path}")
            return {'path': trigger_path, 'data': trigger_data}
        except Exception as e:
            # An unreadable trigger is reported and the scan moves on.
            print(f"Error parsing trigger file: {e}")

    return None

# Update trigger status
def update_trigger_status(trigger_path, status, message=None, model_info=None):
    """Rewrite a trigger file in Dropbox with a new status.

    Downloads the JSON at `trigger_path`, sets 'status' and 'updated_at',
    adds 'message' / 'model_info' when they are truthy, and uploads the
    result back to the same path. Returns True when the upload succeeds and
    False on any download, parse or upload failure.
    """
    downloaded = download_file(trigger_path)
    if not downloaded['success']:
        return False

    try:
        raw = downloaded['buffer']
        raw.seek(0)
        trigger_data = json.loads(raw.read().decode('utf-8'))

        trigger_data['status'] = status
        trigger_data['updated_at'] = datetime.now().isoformat()

        # Optional fields are only written when a value was supplied.
        for key, value in (('message', message), ('model_info', model_info)):
            if value:
                trigger_data[key] = value

        serialized = json.dumps(trigger_data).encode('utf-8')
        outcome = upload_file(io.BytesIO(serialized), trigger_path)
        return outcome['success']
    except Exception as e:
        print(f"Error updating trigger: {e}")
        return False

# Check if training is needed.
# Defaults are assigned first so these names exist even when the database
# could not be loaded (db_conn is None) and the block below is skipped;
# otherwise any later use of them would fail with NameError.
needs_training = False
pending_models = []
trigger = None
force_training = False

if db_conn:
    needs_training = should_retrain(db_conn)
    pending_models = get_pending_uploaded_models(db_conn)
    print(f"Training needed: {needs_training}")
    print(f"Found {len(pending_models)} pending models")

    # Check for trigger files; force_training records that one was found.
    trigger = check_for_triggers()
    if trigger:
        print("Found training trigger file")
        update_trigger_status(trigger['path'], 'processing', "Training started in Google Colab")
        force_training = True
    else:
        force_training = False

## Text Preprocessing and Model Definitions

In [None]:
# Import ML libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import sys
import coremltools as ct

# Download NLTK resources.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads in recent
# NLTK releases; nltk is installed unpinned, so request it alongside the
# legacy 'punkt' package. A resource id an older NLTK does not know is
# reported as a failed download rather than raising.
for resource in ['punkt', 'punkt_tab', 'stopwords', 'wordnet']:
    nltk.download(resource, quiet=True)

# Text preprocessing functions
import functools


@functools.lru_cache(maxsize=1)
def _english_stopwords():
    """Return the English stop-word set, read from the NLTK corpus only once."""
    return frozenset(stopwords.words('english'))


@functools.lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance reused across calls."""
    return WordNetLemmatizer()


def preprocess_text(text, remove_stopwords=True, lemmatize=True):
    """Normalize a message into a space-separated string of tokens.

    The text is lowercased and tokenized with NLTK; tokens that are not
    purely alphanumeric are dropped, English stop words are optionally
    removed, and the remaining tokens are optionally lemmatized.

    Args:
        text: the raw message. Anything that is empty or not a ``str``
            produces an empty result.
        remove_stopwords: drop English stop words when True.
        lemmatize: reduce tokens with the WordNet lemmatizer when True.

    Returns:
        The processed tokens joined by single spaces ("" for unusable input).
    """
    if not text or not isinstance(text, str):
        return ""

    # Lowercase and tokenize
    tokens = nltk.word_tokenize(text.lower())

    # This function runs once per training row, so the stop-word set and the
    # lemmatizer are cached instead of being rebuilt on every call.
    stop_words = _english_stopwords() if remove_stopwords else frozenset()
    lemmatizer = _shared_lemmatizer() if lemmatize else None

    # Process tokens
    processed_tokens = []
    for token in tokens:
        if not token.isalnum() or token in stop_words:
            continue
        if lemmatizer is not None:
            token = lemmatizer.lemmatize(token)
        processed_tokens.append(token)

    return ' '.join(processed_tokens)

# Define IntentClassifier class
class IntentClassifier:
    def __init__(self):
        """Create an untrained classifier with all state cleared."""
        # Fitted artefacts: nothing is available until train() has run.
        self.model = self.vectorizer = self.classes = None
        # Ensemble bookkeeping.
        self.component_models = {}
        self.is_ensemble = False
        # Metadata describing the most recent training run.
        self.accuracy = self.training_date = self.model_version = None
        self.training_data_size = 0
    
    @property
    def is_trained(self):
        """True once both a model and a vectorizer are present."""
        return not (self.model is None or self.vectorizer is None)
    
    def train(self, data, user_message_col='user_message', intent_col='detected_intent',
             weight_col=None, test_size=0.2):
        """Fit a TF-IDF + random-forest intent classifier on `data`.

        Args:
            data: pandas DataFrame holding the training rows. A
                'processed_message' column is added to it as a side effect.
            user_message_col: column with the raw message text.
            intent_col: column with the intent label.
            weight_col: optional column of per-row sample weights; ignored
                when None or when the column is absent.
            test_size: fraction of rows held out for evaluation.

        Returns:
            A dict with 'accuracy', 'report' (classification report as a
            dict), 'classes', 'model_version' and 'training_data_size'.
        """
        # Preprocess text
        data['processed_message'] = data[user_message_col].apply(preprocess_text)

        X = data['processed_message']
        y = data[intent_col]
        has_weights = weight_col is not None and weight_col in data.columns

        # Stratified splitting needs at least two rows of every class;
        # fall back to a plain random split instead of crashing on a
        # rarely-seen intent.
        class_counts = y.value_counts()
        can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
        split_options = {
            'test_size': test_size,
            'random_state': 42,
            'stratify': y if can_stratify else None,
        }

        # Only pass a weights array to the splitter when one exists: the
        # default weight_col=None previously sent None through
        # train_test_split, which cannot index it.
        if has_weights:
            X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
                X, y, data[weight_col].values, **split_options
            )
        else:
            X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
            w_train = w_test = None

        # Extract features
        self.vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE)
        X_train_vec = self.vectorizer.fit_transform(X_train)
        X_test_vec = self.vectorizer.transform(X_test)

        # Train model
        self.model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        self.model.fit(X_train_vec, y_train, sample_weight=w_train)
        # A fresh fit replaces any ensemble built earlier on this instance.
        self.is_ensemble = False
        self.component_models = {}

        # Evaluate; the report uses the same sample weights as the accuracy
        # so the two figures agree.
        y_pred = self.model.predict(X_test_vec)
        self.accuracy = accuracy_score(y_test, y_pred, sample_weight=w_test)
        report = classification_report(
            y_test, y_pred, sample_weight=w_test, output_dict=True, zero_division=0
        )

        # Set metadata; one clock reading keeps the training date and the
        # version timestamp consistent with each other.
        trained_at = datetime.now()
        self.classes = self.model.classes_.tolist()
        self.training_data_size = len(X_train)
        self.training_date = trained_at.isoformat()
        self.model_version = f"{MODEL_VERSION_PREFIX}{int(trained_at.timestamp())}"

        print(f"Model trained with accuracy: {self.accuracy:.4f}")
        return {
            'accuracy': self.accuracy,
            'report': report,
            'classes': self.classes,
            'model_version': self.model_version,
            'training_data_size': self.training_data_size
        }
    
    def create_ensemble(self, uploaded_models, base_weight=BASE_MODEL_WEIGHT):
        """Wrap the trained model and placeholder user models in a soft-voting ensemble.

        Args:
            uploaded_models: dicts describing uploaded models; only 'id' and
                'device_id' are read. One placeholder estimator is added for
                each entry.
            base_weight: voting weight of the base model. Each placeholder
                gets USER_MODEL_WEIGHT.

        Returns:
            True when the ensemble was built and installed as self.model;
            False when the classifier is untrained or no models were given.

        NOTE(review): the per-user estimators are random forests fitted on
        random dummy data, not the uploaded models themselves, so their votes
        carry no real signal. Confirm this placeholder behavior is intended.
        """
        if not self.is_trained or not uploaded_models:
            return False

        # LabelEncoder is only needed here and is not imported at file level.
        from sklearn.preprocessing import LabelEncoder

        # Initialize ensemble with base model
        estimators = [('base', self.model)]
        self.component_models = {'base': 'Base model'}

        n_features = self.vectorizer.get_feature_names_out().shape[0]
        class_labels = np.asarray(self.classes)
        # Every placeholder must be fitted on all classes; otherwise its
        # predict_proba has fewer columns than the base model's and soft
        # voting cannot average them.
        n_dummy = max(10, len(class_labels))

        # Add placeholder models for each user model
        for idx, model in enumerate(uploaded_models):
            model_id = model.get('id', f'user{idx}')

            # Create compatible model and fit it with dummy data
            user_model = RandomForestClassifier(n_estimators=50, random_state=42)
            X_dummy = np.random.random((n_dummy, n_features))
            y_dummy = np.resize(class_labels, n_dummy)  # cycles through every class
            user_model.fit(X_dummy, y_dummy)

            # Add to ensemble
            estimator_name = f'user{idx}'
            estimators.append((estimator_name, user_model))
            self.component_models[estimator_name] = {
                'id': model_id,
                'device_id': model.get('device_id', 'unknown')
            }

        # Create VotingClassifier
        weights = [base_weight] + [USER_MODEL_WEIGHT] * len(uploaded_models)
        ensemble = VotingClassifier(estimators=estimators, voting='soft', weights=weights)

        # Nothing in the visible code calls fit() on the ensemble, and an
        # unfitted VotingClassifier refuses to predict. All members are
        # already fitted, so set the attributes fit() would have produced.
        # NOTE(review): this relies on the attribute names scikit-learn's
        # VotingClassifier uses internally -- re-verify when upgrading.
        ensemble.estimators_ = [estimator for _, estimator in estimators]
        ensemble.le_ = LabelEncoder().fit(class_labels)
        ensemble.classes_ = ensemble.le_.classes_

        self.model = ensemble
        self.is_ensemble = True
        return True
    
    def _convert_to_coreml(self, output_path):
        """Convert the classifier to CoreML and save it at `output_path`.

        A full conversion (intent plus probabilities) is attempted first; if
        it raises, a reduced conversion that outputs only the intent is
        tried. On success the model version, training date and accuracy are
        stored as user-defined metadata before saving.

        Returns:
            True when a model was converted and saved, False when both
            conversion attempts failed.

        NOTE(review): both attempts hand a plain Python function to
        ct.convert. Confirm that coremltools accepts a callable as the source
        model; if it does not, this method always returns False.
        """
        # Define prediction function
        def predict_intent(text):
            processed_text = preprocess_text(text)
            vec_text = self.vectorizer.transform([processed_text])
            intent = self.model.predict(vec_text)[0]
            probabilities = self.model.predict_proba(vec_text)[0]
            return intent, probabilities

        # Replace coremltools' third-party version checks with no-ops so a
        # version mismatch does not abort the conversion up front.
        dependency_check = getattr(ct, '_dependency_check', None)
        if dependency_check is not None:
            for check_name in ('verify_scikit_learn_version', 'verify_tensorflow_version',
                               'verify_torch_version', 'verify_xgboost_version'):
                if hasattr(dependency_check, check_name):
                    setattr(dependency_check, check_name, lambda *args, **kwargs: True)

        # Warn early when the native extension is missing; the conversion
        # attempts below are made either way.
        try:
            import coremltools.libcoremlpython  # noqa: F401
        except ImportError:
            print("Warning: Missing libcoremlpython - using simplified conversion")

        try:
            # Standard conversion with complete outputs
            coreml_model = ct.convert(
                predict_intent,
                inputs=[ct.TensorType(shape=(1,), dtype=str)],
                outputs=[
                    ct.TensorType(name='intent'),
                    ct.TensorType(name='probabilities', dtype=np.float32)
                ],
                classifier_config=ct.ClassifierConfig(self.classes)
            )
        except Exception as e:
            print(f"Standard conversion failed: {e}")
            # Try alternative simplified conversion
            try:
                coreml_model = ct.convert(
                    predict_intent,
                    inputs=[ct.TensorType(shape=(1,), dtype=str)],
                    outputs=[ct.TensorType(name='intent')]
                )
                print("Used simplified conversion due to errors")
            except Exception as e2:
                print(f"All conversion attempts failed: {e2}")
                return False

        # Add metadata
        coreml_model.user_defined_metadata['version'] = self.model_version
        coreml_model.user_defined_metadata['training_date'] = self.training_date
        coreml_model.user_defined_metadata['accuracy'] = str(self.accuracy)

        # Save model
        coreml_model.save(output_path)
        return True
    
    def save(self, output_dir=None):
        """Write the model artefacts to `output_dir` and describe them.

        Three files are produced, each named after the model version: a
        joblib dump of the (vectorizer, model) pair, a CoreML model, and a
        JSON file with the model metadata.

        Args:
            output_dir: target directory; created when missing. A fresh
                temporary directory is used when None.

        Returns:
            A dict with 'sklearn_path', 'coreml_path', 'info_path' and
            'model_info', plus 'coreml_converted', which is False when the
            CoreML conversion failed and no file exists at 'coreml_path'.
        """
        # Create temp dir if not provided
        if output_dir is None:
            output_dir = tempfile.mkdtemp()
        else:
            os.makedirs(output_dir, exist_ok=True)

        # Define paths
        sklearn_path = os.path.join(output_dir, f"intent_classifier_{self.model_version}.joblib")
        info_path = os.path.join(output_dir, f"model_info_{self.model_version}.json")
        coreml_path = os.path.join(output_dir, f"model_{self.model_version}.mlmodel")

        # Save sklearn model
        joblib.dump((self.vectorizer, self.model), sklearn_path)

        # Create and save CoreML model. The conversion reports failure
        # through its return value, so surface it instead of ignoring it.
        coreml_converted = bool(self._convert_to_coreml(coreml_path))
        if not coreml_converted:
            print(f"Warning: CoreML conversion failed; no file written to {coreml_path}")

        # Save model info
        model_info = {
            'version': self.model_version,
            'accuracy': self.accuracy,
            'training_data_size': self.training_data_size,
            'training_date': self.training_date,
            'is_ensemble': self.is_ensemble,
            'component_models': len(self.component_models) if self.component_models else 1,
            'classes': self.classes
        }

        with open(info_path, 'w') as f:
            json.dump(model_info, f)

        return {
            'sklearn_path': sklearn_path,
            'coreml_path': coreml_path,
            'info_path': info_path,
            'model_info': model_info,
            'coreml_converted': coreml_converted
        }