In [None]:
import os
import pandas as pd

# Data loading function with error handling
def load_data(file_path, columns, sep=';', encoding='utf-8-sig'):
    """Read a delimited text file into a DataFrame of strings.

    Each line is stripped of surrounding whitespace and split on ``sep``.
    Lines whose field count does not match ``len(columns)`` are reported on
    stdout and skipped; all other lines become one row each.

    Args:
        file_path: Path of the text file to read.
        columns: Column names; also defines the expected field count per line.
        sep: Field delimiter (defaults to ';', the format used by the data files).
        encoding: Text encoding used to open the file. The default
            'utf-8-sig' reads plain UTF-8 and also drops a leading BOM, so
            the first field of the first row is not polluted by it.

    Returns:
        pandas.DataFrame with the given columns; every value is a ``str``.
        An empty file yields an empty frame that still has ``columns``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    expected_fields = len(columns)
    rows = []
    # An explicit encoding keeps the result independent of the OS locale.
    with open(file_path, 'r', encoding=encoding) as file:
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            parts = stripped.split(sep)
            if len(parts) == expected_fields:
                rows.append(parts)
            else:
                print(f"Skipping line {line_number}: {stripped}")
    return pd.DataFrame(rows, columns=columns)

# Product id -> category label, one row per well-formed line of the file.
product_categories = load_data(
    file_path='data/Product_Categories.txt',
    columns=['product_id', 'category'],
)

# Product id -> free-text description, one row per well-formed line of the file.
product_explanation = load_data(
    file_path='data/Product_Explanation.txt',
    columns=['product_id', 'description'],
)



In [None]:

# basic function for text preprocessing

def preprocess_text(text):
    """Lower-case ``text`` and drop every character that is neither
    alphanumeric nor whitespace.

    Whitespace is kept exactly as it appears (no collapsing or trimming), and
    removed characters are not replaced, so "well-known" becomes "wellknown".

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    kept_chars = [
        ch for ch in text.lower()
        if ch.isalnum() or ch.isspace()
    ]
    return ''.join(kept_chars)


In [None]:
# Build the frame that is persisted for training. The original cell wrote
# `data` without defining it anywhere in the notebook, so a fresh
# Restart & Run All stopped here with a NameError.
# NOTE(review): the join below is reconstructed from the columns the training
# cells read ('category' and 'processed_description'); confirm that an inner
# join on 'product_id' is the intended way to combine the two files.
data = product_explanation.merge(product_categories, on='product_id', how='inner')
data['processed_description'] = data['description'].apply(preprocess_text)

data.to_csv('data/processed_data.csv', index=False)
print("Data preprocessing completed and saved to 'data/processed_data.csv'.")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
import joblib

# Read the preprocessed CSV and replace missing values in one chained step:
# - an empty description becomes '' so the vectorizer receives a string
# - a missing category becomes the placeholder label 'unknown'
data = (
    pd.read_csv('data/processed_data.csv')
    .assign(
        processed_description=lambda frame: frame['processed_description'].fillna(''),
        category=lambda frame: frame['category'].fillna('unknown'),
    )
)


In [None]:
# Keep only categories with at least `min_samples` rows.
# A single filter is enough: the former "more than 1 sample" pass used the same
# counts and was therefore fully implied by this stricter threshold.
min_samples = 5
category_counts = data['category'].value_counts()
data = data[data['category'].isin(category_counts[category_counts >= min_samples].index)]


In [None]:
# Count the distinct category labels still present in the data
num_classes = data['category'].nunique()
print(f"Number of classes: {num_classes}")

# Size thresholds derived from the class count: one row per class for the
# training side, and twice that for the dataset as a whole.
min_train_size = num_classes
min_data_points = 2 * num_classes

# Stop early when the dataset is smaller than the required minimum
if not len(data) >= min_data_points:
    raise ValueError(f"Insufficient data: The dataset must have at least {min_data_points} samples to ensure proper splitting.")


In [None]:
# Target labels
y = data['category']

# TF-IDF features learned from, and computed for, every processed description
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['processed_description'])


In [None]:
# Split the data into training and testing sets.
# The previous formula, 1 - num_classes / len(data), shrank the TRAINING set to
# roughly one sample per class and put everything else in the test set.
# Hold out a fixed fraction instead, expressed as an absolute row count so it
# can be clamped: a stratified split needs room for every class on both sides.
TEST_FRACTION = 0.2
n_samples = X.shape[0]
n_classes = y.nunique()
test_size = max(n_classes, int(round(TEST_FRACTION * n_samples)))
test_size = min(test_size, n_samples - n_classes)  # leave >= n_classes rows for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

# Check the distribution of categories
print("Training set category distribution:")
print(y_train.value_counts())
print("\nTesting set category distribution:")
print(y_test.value_counts())


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate estimators. Each one gets the same fixed seed that the train/test
# split and the CV folds already use, so repeated runs give the same models.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

# Hyperparameter grids searched per estimator; keys must match `models`.
param_grids = {
    'LogisticRegression': {
        'C': [0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear']
    },
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20]
    },
    'GradientBoosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import joblib

# Tune every candidate with a grid search and report it on the held-out set
best_models = {}

# Fold count follows the rarest training class: capped at 5, never below 2
smallest_class_size = min(y_train.value_counts())
n_splits = max(2, min(5, smallest_class_size))
stratified_k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for model_name, estimator in models.items():
    print(f"\nTraining {model_name}...")
    grid_search = GridSearchCV(
        estimator,
        param_grids[model_name],
        cv=stratified_k_fold,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Keep the best estimator found for this model
    tuned_model = grid_search.best_estimator_
    best_models[model_name] = tuned_model
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

    # zero_division=0 reports 0 instead of warning for classes never predicted
    y_pred = tuned_model.predict(X_test)
    print(classification_report(y_test, y_pred, zero_division=0))


In [None]:
# Save the chosen model and the vectorizer.
# The pick below is hard-coded, not computed: revisit it after comparing the
# classification reports printed during training.
best_model = best_models['GradientBoosting']

# joblib.dump does not create missing directories, so make sure it exists.
os.makedirs('models', exist_ok=True)
joblib.dump(best_model, 'models/model.pkl')
joblib.dump(vectorizer, 'models/vectorizer.pkl')

print("Model training completed and saved to 'models/' directory.")


In [None]:
from flask import Flask, request, jsonify
import joblib
import os

# Restore the fitted artifacts written by the training step.
# joblib.load unpickles the file, so only load files you produced yourself.
vectorizer = joblib.load('models/vectorizer.pkl')
model = joblib.load('models/model.pkl')

# Initialize the Flask application
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Predict the category for a product description.

    Expects a JSON object body with a string field ``description``.

    Returns:
        200 with ``{"description": <input>, "category": <prediction>}``, or
        400 with ``{"error": ...}`` when the body is not a JSON object or
        lacks a string ``description``.
    """
    # silent=True turns unparsable JSON into None so it gets the same 400 reply
    payload = request.get_json(force=True, silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('description'), str):
        return jsonify({'error': "Request body must be a JSON object with a string 'description' field."}), 400
    description = payload['description']

    # Apply the same normalisation as the notebook's preprocess_text (lower-case,
    # drop characters that are neither alphanumeric nor whitespace) before
    # vectorizing. It is inlined so this script does not depend on the notebook.
    # NOTE(review): assumes the training column 'processed_description' was
    # produced by preprocess_text; confirm against the preprocessing step.
    normalised = ''.join(ch for ch in description.lower() if ch.isalnum() or ch.isspace())
    features = vectorizer.transform([normalised])
    prediction = model.predict(features)

    # Convert numpy scalars to plain Python values so jsonify can serialise them
    category = prediction[0]
    if hasattr(category, 'item'):
        category = category.item()

    response = {
        'description': description,
        'category': category
    }
    return jsonify(response)

if __name__ == '__main__':
    # Debug mode enables Werkzeug's interactive debugger, which lets anyone who
    # can reach the port run code. The server binds to every interface, so
    # debug stays off unless FLASK_DEBUG=1 is set explicitly.
    debug_enabled = os.environ.get('FLASK_DEBUG', '0') == '1'
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)


In [None]:
%%writefile Dockerfile
# Use an official Python runtime as a parent image
# NOTE(review): Python 3.8 reached end-of-life in October 2024; move to a
# supported tag once requirements.txt has been verified against it.
FROM python:3.8-slim

# Set the working directory in the container
WORKDIR /app

# Install dependencies before copying the rest of the sources, so this layer
# is rebuilt only when requirements.txt changes
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the current directory contents into the container at /app
COPY . /app

# Document that the service listens on port 5000 (publish it with -p at run time)
EXPOSE 5000

# Define environment variable (key=value form; the space-separated form is legacy)
ENV NAME=World

# Run inference.py when the container launches
CMD ["python", "inference.py"]
