In [1]:
# Import necessary libraries
import requests  # For downloading dataset
import pandas as pd  # For handling data
import torch  # For model operations
import transformers  # For using a pre-trained model from HuggingFace
import numpy as np
import sklearn.metrics  # For evaluating model performance
import zipfile
import io
from sklearn.preprocessing import LabelEncoder
import joblib
from huggingface_hub import hf_hub_download
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. DATA INGESTION COMPONENT
class DataIngestion:
    """Fetch a zipped dataset over HTTP and load it as one merged DataFrame.

    The archive is expected to contain the members 'ml-100k/u.data',
    'ml-100k/u.item' and 'ml-100k/u.user'.

    Args:
        dataset_url: URL of the zip archive to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.
    """

    # Column names for the header-less archive members, in file order.
    _RATING_COLUMNS = ['user_id', 'item_id', 'rating', 'timestamp']
    _ITEM_COLUMNS = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    _USER_COLUMNS = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

    def __init__(self, dataset_url, timeout=60):
        self.dataset_url = dataset_url
        self.timeout = timeout

    def download_data(self):
        """Download the archive and return ratings joined with items and users.

        Returns:
            A DataFrame with one row per rating, carrying the columns of the
            rated item (joined on 'item_id') and of the rating user (joined on
            'user_id'). Both joins are pandas' default inner joins.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            zipfile.BadZipFile: If the downloaded payload is not a zip archive.
            KeyError: If an expected member is missing from the archive.
        """
        response = requests.get(self.dataset_url, timeout=self.timeout)
        # Fail here on an error status; otherwise the error page would be
        # handed to ZipFile and reported as a confusing BadZipFile.
        response.raise_for_status()

        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            # User ratings: tab-separated.
            with z.open('ml-100k/u.data') as f:
                ratings = pd.read_csv(f, sep='\t', header=None, names=self._RATING_COLUMNS)

            # Movie information: pipe-separated, latin-1 encoded.
            with z.open('ml-100k/u.item') as f:
                items = pd.read_csv(f, sep='|', header=None, encoding='latin-1', names=self._ITEM_COLUMNS)

            # User information: pipe-separated, latin-1 encoded.
            with z.open('ml-100k/u.user') as f:
                users = pd.read_csv(f, sep='|', header=None, encoding='latin-1', names=self._USER_COLUMNS)

        # Everything is in memory now, so the joins can run after the archive
        # has been closed.
        merged_data = pd.merge(ratings, items, on='item_id')
        data = pd.merge(merged_data, users, on='user_id')

        return data

In [2]:
# 2. DATA PREPROCESSING COMPONENT
class DataPreprocessor:
    """Reduce the merged frame to numeric model features plus an encoded title.

    Args:
        data: DataFrame containing at least 'title', 'age', 'gender' and the
            genre flag columns listed in clean_data().

    Attributes:
        data: The raw frame passed in; it is never modified.
        label_encoder: The LabelEncoder fitted by the most recent clean_data()
            call (None before the first call). Keeping it lets callers map
            integer title ids back to title strings.
    """

    def __init__(self, data):
        self.data = data
        self.label_encoder = None

    def clean_data(self, max_title_id=100):
        """Select, clean and encode the columns used for modelling.

        Args:
            max_title_id: Keep only rows whose encoded title id is less than
                or equal to this value. Pass None to keep every title.

        Returns:
            A new DataFrame with 'title' label-encoded to integers, 'gender'
            encoded as F -> 0 / M -> 1, 'age', and the genre flags. The
            'unknown' genre flag and every other input column are not kept.
        """
        relevant_columns = ['title', 'age', 'gender', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
        # Work on an explicit copy so the column assignments below neither
        # touch self.data nor trigger pandas' SettingWithCopyWarning.
        cleaned_data = self.data[relevant_columns].copy()

        # Encode gender BEFORE dropping nulls: any value other than 'F'/'M'
        # maps to NaN and is then removed together with genuinely missing
        # rows, so the returned frame is guaranteed to be null-free.
        cleaned_data['gender'] = cleaned_data['gender'].map({'F': 0, 'M': 1})
        cleaned_data = cleaned_data.dropna()
        # map() yields floats whenever a NaN was present; restore integers.
        cleaned_data['gender'] = cleaned_data['gender'].astype('int64')

        # Label-encode the title and keep the fitted encoder on the instance.
        self.label_encoder = LabelEncoder()
        cleaned_data['title'] = self.label_encoder.fit_transform(cleaned_data['title'])

        # Encoded ids start at 0, so only the upper bound needs checking.
        if max_title_id is not None:
            cleaned_data = cleaned_data[cleaned_data['title'] <= max_title_id]

        return cleaned_data

In [3]:
# 3. MODEL COMPONENT
class PretrainedModel:
    """Wrap a joblib-serialised estimator loaded from a local file.

    Args:
        model_name: Path of the serialised model file. Despite the name it is
            opened as a file path, not looked up as a model identifier.
    """

    def __init__(self, model_name):
        # SECURITY NOTE(review): joblib.load unpickles the file, which can
        # execute arbitrary code embedded in it. Only load artifacts from a
        # source you trust.
        with open(model_name, 'rb') as f:
            self.model = joblib.load(f)

    def predict(self, inputs):
        """Return the wrapped model's predictions for a feature frame.

        Args:
            inputs: DataFrame of features. A 'title' column (the target), if
                present, is removed before prediction; frames that have
                already had it removed are accepted as well.

        Returns:
            Whatever the wrapped model's predict() returns for the features.
        """
        # errors='ignore' lets callers pass either the full cleaned frame or
        # an already target-free feature frame without hitting a KeyError.
        features = inputs.drop(columns=['title'], errors='ignore')
        return self.model.predict(features)

In [4]:
# 4. SYSTEM COMPONENT - PIPELINE
class MLPipeline:
    """Chain ingestion, preprocessing and prediction into one call.

    Args:
        dataset_url: Passed to DataIngestion to locate the dataset archive.
        model_name: Passed to PretrainedModel to load the serialised model.
    """

    def __init__(self, dataset_url, model_name):
        # Collaborators available up front.
        self.data_ingestion = DataIngestion(dataset_url)
        # Created lazily by build_pipeline(), once data is available.
        self.preprocessor = None
        self.model = PretrainedModel(model_name)
        # Raw merged frame; filled in by build_pipeline().
        self.data = None

    def build_pipeline(self):
        """Run download -> clean -> predict and return the predictions.

        The raw frame and the preprocessor are kept on the instance as
        `data` and `preprocessor`.
        """
        # Stage 1: fetch the raw data.
        raw_frame = self.data_ingestion.download_data()
        self.data = raw_frame

        # Stage 2: build the preprocessor around it and clean.
        self.preprocessor = DataPreprocessor(raw_frame)
        model_input = self.preprocessor.clean_data()

        # Stage 3: hand the cleaned frame to the model.
        return self.model.predict(model_input)


In [5]:
# 5. FEEDBACK COMPONENT
class FeedbackLoop:
    """Decide whether a freshly retrained model beats the deployed one.

    Args:
        predictions: Predictions of the deployed model, one per row of the
            cleaned dataset.
        true_labels: Ground-truth labels matching `predictions` row for row.
        dataset_url: Passed to DataIngestion to fetch the training data.

    Attributes:
        retrained_model: The classifier fitted by simulate_user_feedback()
            (None before it runs), kept so it can be published afterwards.
        baseline_accuracy: Accuracy of the deployed model's predictions.
        retrained_accuracy: Hold-out accuracy of the retrained model.
    """

    def __init__(self, predictions, true_labels, dataset_url):
        self.predictions = predictions
        self.true_labels = true_labels
        self.data_ingestion = DataIngestion(dataset_url)
        # Results of the latest simulate_user_feedback() run.
        self.retrained_model = None
        self.baseline_accuracy = None
        self.retrained_accuracy = None

    def simulate_user_feedback(self):
        """Retrain on fresh data and compare against the deployed model.

        Returns:
            "improve model" when the deployed model's accuracy is strictly
            lower than the retrained model's, otherwise
            "model is performing well".
        """
        # Step 1: Download and load data for training
        self.data = self.data_ingestion.download_data()

        # Step 2: Preprocess data
        self.preprocessor = DataPreprocessor(self.data)
        df = self.preprocessor.clean_data()

        # Features are everything except the encoded title, which is the target.
        X = df.drop(columns=['title'])
        y = df['title']

        predictions = np.asarray(self.predictions)
        true_labels = np.asarray(self.true_labels)

        if len(predictions) == len(true_labels) == len(df):
            # Split the deployed model's predictions and labels together with
            # X and y so both models are scored on the SAME held-out rows.
            # Scoring the deployed model on all rows (which include the rows
            # the retrained model is fitted on) would make the two accuracies
            # incomparable. train_test_split applies one set of indices to
            # every array it is given, so X/y are split exactly as before.
            (X_train, X_test, y_train, y_test,
             _, held_out_predictions,
             _, held_out_labels) = train_test_split(
                X, y, predictions, true_labels, test_size=0.2, random_state=42
            )
            accuracy = accuracy_score(held_out_labels, held_out_predictions)
        else:
            # Predictions do not line up with the cleaned frame row for row;
            # fall back to scoring them over whatever rows they cover.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
            accuracy = accuracy_score(self.true_labels, self.predictions)

        # Initialize and train the Gradient Boosting Classifier
        model = GradientBoostingClassifier(random_state=42)
        model.fit(X_train, y_train)

        # Evaluate the retrained model on the held-out rows
        y_pred = model.predict(X_test)
        accuracy_trained = accuracy_score(y_test, y_pred)

        # Keep the outcome so the adjustment step has something to publish.
        self.retrained_model = model
        self.baseline_accuracy = accuracy
        self.retrained_accuracy = accuracy_trained

        feedback = "improve model" if accuracy < accuracy_trained else "model is performing well"
        return feedback

    def adjust_model_based_on_feedback(self, feedback):
        """Report the action implied by `feedback`.

        Args:
            feedback: The string returned by simulate_user_feedback().
        """
        # Simulate adjustments in the system based on user feedback
        if feedback == 'improve model':
            print("Feedback received: Retraining model is performing better, upload retrained model to huggingface")
            # This is where the retrained model (self.retrained_model) should
            # be pushed to HuggingFace so later predictions use the update.
        else:
            print("Feedback received: No major changes needed")

In [6]:
def get_true_labels(dataset_url):
    """Return the encoded 'title' column of the cleaned dataset as an array.

    Args:
        dataset_url: Passed to DataIngestion to locate the dataset archive.

    Returns:
        A NumPy array holding the 'title' values produced by
        DataPreprocessor.clean_data(), in row order.
    """
    # Download, then clean, exactly as the prediction pipeline does.
    raw_frame = DataIngestion(dataset_url).download_data()
    prepared_frame = DataPreprocessor(raw_frame).clean_data()

    # The encoded title is the ground truth the predictions are scored against.
    return np.array(prepared_frame['title'])

In [7]:
# 6. SYSTEM EXECUTION
def main():
    """Run prediction, gather ground truth, then execute the feedback loop."""
    # Resolve the serialised model from the HuggingFace Hub to a local path.
    model_name = hf_hub_download(
        repo_id='bnamazci/gradient-boosting-model2',
        filename='gradient_boosting_model.joblib',
    )
    # MovieLens 100K archive used for both prediction and retraining.
    dataset_url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"

    # Step 1: build the pipeline and obtain the deployed model's predictions.
    predictions = MLPipeline(dataset_url, model_name).build_pipeline()

    # Step 2: collect the matching ground-truth labels and set up the loop.
    feedback_loop = FeedbackLoop(
        predictions,
        get_true_labels(dataset_url),
        dataset_url,
    )

    # Step 3: derive the feedback and act on it.
    feedback_loop.adjust_model_based_on_feedback(
        feedback_loop.simulate_user_feedback()
    )

# Run the whole system end to end when this cell executes; the feedback
# message printed by the run appears as this cell's output.
main()

Feedback received: No major changes needed
