In [6]:
import pandas as pd

# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

In [7]:
# Display the training frame as the cell output to inspect its columns and a few rows.
train_data

Unnamed: 0,year,month,day,order,country,session_id,page1_main_category,page2_clothing_model,colour,location,model_photography,price,price_2,page
0,2008,6,22,21,29,15648,3,C20,13,1,2,48,1,2
1,2008,5,19,6,29,10018,2,B26,13,3,1,57,1,2
2,2008,7,15,2,29,19388,3,C13,9,5,1,48,1,1
3,2008,5,2,2,29,7181,2,B11,2,4,1,43,2,1
4,2008,6,9,16,29,13493,2,B31,9,5,1,57,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132374,2008,7,4,3,29,17622,4,P19,2,1,1,48,1,2
132375,2008,6,19,9,29,15165,3,C26,14,3,1,28,2,2
132376,2008,7,15,4,29,19359,1,A4,3,2,2,38,2,1
132377,2008,7,28,16,29,21454,3,C50,9,5,2,20,2,3


## Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

# Learn the clothing-model vocabulary from the training data only and reuse
# that single mapping for the test data. Fitting a second encoder on the test
# set can assign a different integer code to the same model ID whenever the
# two splits do not contain exactly the same set of categories, which would
# silently corrupt the feature the models are evaluated on.
le=LabelEncoder()
train_data['page2_clothing_model']=le.fit_transform(train_data['page2_clothing_model'])

# `transform` raises ValueError if the test set contains a model ID that was
# never seen during training, rather than mis-encoding it.
test_data['page2_clothing_model']=le.transform(test_data['page2_clothing_model'])

# Alias kept so that later cells referencing `le1` continue to work; it is
# the same train-fitted encoder.
le1=le

## Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

# Predictor columns shared by both splits; 'price' is the regression target.
feature_columns = ['page1_main_category', 'page2_clothing_model', 'colour']

train_target = train_data['price']
test_target = test_data['price']

# Fit the scaler on the training predictors only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(train_data[feature_columns])
train_features = scaler.transform(train_data[feature_columns])
test_features = scaler.transform(test_data[feature_columns])

# REGRESSION

In [10]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV



# Seed for the stochastic estimators so that repeated runs of the notebook
# select the same hyper-parameters and report the same metrics.
RANDOM_STATE = 42

# Candidate regressors and the hyper-parameter grid searched for each one.
# An empty 'params' dict means the estimator is fitted as-is, without a search.
model_params = {
    'linear_regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10, 100]
        }
    },
    'ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10, 100]
        }
    },
    'random_forest': {
        # Bootstrap sampling is random; fix the seed for reproducibility.
        'model': RandomForestRegressor(random_state=RANDOM_STATE),
        'params': {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5, 10]
        }
    },
    'gradient_boosting': {
        # Seeded for the same reason as the random forest.
        'model': GradientBoostingRegressor(random_state=RANDOM_STATE),
        'params': {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5, 7]
        }
    }
}

In [11]:
# One entry per model: (name, fitted estimator, chosen params, RMSE, MAE, R2).
reports = []

for name, config in model_params.items():
    model, param_grid = config['model'], config['params']

    if not param_grid:
        # Nothing to tune: fit the estimator exactly as configured.
        best_model = model.fit(train_features, train_target)
        best_params = "Default Parameters"
    else:
        # 5-fold cross-validated grid search, selecting on R2.
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring="r2",
            cv=5,
            n_jobs=-1,
        ).fit(train_features, train_target)
        best_model, best_params = grid_search.best_estimator_, grid_search.best_params_

    # Score the selected estimator on the test split.
    predictions = best_model.predict(test_features)
    rmse, mae, r2 = (
        metric(test_target, predictions)
        for metric in (root_mean_squared_error, mean_absolute_error, r2_score)
    )

    reports.append((name, best_model, best_params, rmse, mae, r2))

In [12]:
# Print the chosen parameters and test-set metrics of every model in `reports`.
for name, model, best_params, rmse, mae, r2 in reports:
    summary_lines = (
        f"Model: {name}",
        f"Best Parameters: {best_params}",
        f"RMSE: {rmse:.4f}",
        f"MAE: {mae:.4f}",
        f"R2 Score: {r2:.4f}",
        "\n",  # trailing blank lines separating consecutive models
    )
    print(*summary_lines, sep="\n")

Model: linear_regression
Best Parameters: Default Parameters
RMSE: 11.6929
MAE: 10.0456
R2 Score: 0.1364


Model: lasso
Best Parameters: {'alpha': 0.01}
RMSE: 11.6932
MAE: 10.0402
R2 Score: 0.1364


Model: ridge
Best Parameters: {'alpha': 10}
RMSE: 11.6929
MAE: 10.0454
R2 Score: 0.1364


Model: random_forest
Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
RMSE: 2.6927
MAE: 0.4475
R2 Score: 0.9542


Model: gradient_boosting
Best Parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300}
RMSE: 2.6024
MAE: 0.4879
R2 Score: 0.9572




# MLflow

In [13]:
import mlflow

In [16]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Check if the experiment exists and is active
experiment_name = "Price Prediction (Regression)"
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is None:
    # Create a new experiment if it doesn't exist
    mlflow.create_experiment(experiment_name)
elif experiment.lifecycle_stage == "deleted":
    # Restore the experiment if it exists in the deleted state
    mlflow.tracking.MlflowClient().restore_experiment(experiment.experiment_id)

mlflow.set_experiment(experiment_name)

# Artifact path under which each model is stored, keyed by the names actually
# used in `reports` ('linear_regression', 'lasso', ...). The previous if/elif
# chain compared against display names such as "Linear Regressor" that never
# occur, so no model was ever logged.
artifact_paths = {
    "linear_regression": "linear_model",
    "ridge": "ridge_model",
    "lasso": "lasso_model",
    "gradient_boosting": "gradient_boosting_model",
    "random_forest": "random_forest_model",
}

for name, model, best_params, rmse, mae, r2 in reports:
    with mlflow.start_run(run_name=name) as run:
        # Untuned models carry the placeholder string "Default Parameters",
        # so only log real hyper-parameter dictionaries.
        if isinstance(best_params, dict):
            mlflow.log_params(best_params)

        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)

        # Fall back to "<name>_model" for any model not listed above so that
        # every run gets its estimator logged.
        mlflow.sklearn.log_model(model, artifact_paths.get(name, f"{name}_model"))

🏃 View run linear_regression at: http://127.0.0.1:5000/#/experiments/360554687977783716/runs/9ebe523c620e4daa9a777dc1bc41974a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/360554687977783716
🏃 View run lasso at: http://127.0.0.1:5000/#/experiments/360554687977783716/runs/dd8a9a4fc9b24558be201d4aa7b8e40a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/360554687977783716
🏃 View run ridge at: http://127.0.0.1:5000/#/experiments/360554687977783716/runs/a1d8f43ee7d7474dbb7930ac5587e250
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/360554687977783716
🏃 View run random_forest at: http://127.0.0.1:5000/#/experiments/360554687977783716/runs/c877c08f745b4314ace2291268e9dba7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/360554687977783716
🏃 View run gradient_boosting at: http://127.0.0.1:5000/#/experiments/360554687977783716/runs/58c01ddc97fa42a0a0fbfbbcaac24673
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/360554687977783716


In [17]:
import mlflow

# Register the gradient boosting model in the MLflow Model Registry.
model_name = "Gradient Boosting Regressor"

# Look up the most recent 'gradient_boosting' run in the active experiment
# instead of hard-coding a run ID: a pasted ID goes stale as soon as the
# logging cell is re-run and new runs are created.
matching_runs = mlflow.search_runs(
    filter_string="attributes.run_name = 'gradient_boosting'",
    order_by=["attributes.start_time DESC"],
    max_results=1,
    output_format="list",
)
if not matching_runs:
    raise RuntimeError(
        "No MLflow run named 'gradient_boosting' found in the active experiment; "
        "run the logging cell first."
    )

run_id = matching_runs[0].info.run_id
model_uri = f"runs:/{run_id}/gradient_boosting_model"

# Registration only needs the model URI, so the finished run is not re-opened.
mlflow.register_model(model_uri=model_uri, name=model_name)

print(f"Model '{model_name}' registered successfully!")


Successfully registered model 'Gradient Boosting Regressor'.
2025/03/24 11:00:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Gradient Boosting Regressor, version 1


🏃 View run gradient_boosting at: http://127.0.0.1:5000/#/experiments/360554687977783716/runs/58c01ddc97fa42a0a0fbfbbcaac24673
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/360554687977783716
Model 'Gradient Boosting Regressor' registered successfully!


Created version '1' of model 'Gradient Boosting Regressor'.


In [None]:
import pickle

# Select the tuned gradient boosting estimator explicitly from `reports`.
# The bare name `model` is only whatever the last loop iteration in an earlier
# cell left behind, so relying on it depends on cell execution order.
fitted_models = {report[0]: report[1] for report in reports}

# Output file name -> object to serialize.
artifacts = {
    'le1_clothing_model.pkl': le,
    'le2_clothing_model.pkl': le1,
    'regression_standard_scaler.pkl': scaler,
    'gradient_boosting_model.pkl': fitted_models['gradient_boosting'],
}

for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)