# Pre-training a model for the app

## Pre-training a model for regression tasks using PyCaret

In [None]:
# 📌 Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pycaret.classification import setup as cls_setup, compare_models as cls_compare, save_model as cls_save, pull as cls_pull
from pycaret.regression import setup as reg_setup, compare_models as reg_compare, save_model as reg_save, pull as reg_pull


In [7]:
# 📌 Step 2: Load Your Dataset (replace the path with your own file)
# The sample file is an Excel workbook (.xlsx), so it is read with pd.read_excel;
# use pd.read_csv instead if your data is a CSV file.
df = pd.read_excel("/datasets/Sample_Superstore.xlsx")  # e.g., sales data or business data
# Preview the first rows to sanity-check that the load worked.
df.head()

Unnamed: 0,A`,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country/Region,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,4962,CA-2016-156587,2016-03-07,2016-03-08,First Class,AB-10015,Aaron Bergman,Consumer,United States,Seattle,...,98103.0,West,FUR-CH-10004477,Furniture,Chairs,"Global Push Button Manager's Chair, Indigo",48.712,1,0.2,5.4801
1,4963,CA-2016-156587,2016-03-07,2016-03-08,First Class,AB-10015,Aaron Bergman,Consumer,United States,Seattle,...,98103.0,West,OFF-AR-10001427,Office Supplies,Art,Newell 330,17.94,3,0.0,4.6644
2,4964,CA-2016-156587,2016-03-07,2016-03-08,First Class,AB-10015,Aaron Bergman,Consumer,United States,Seattle,...,98103.0,West,OFF-ST-10002344,Office Supplies,Storage,"Carina 42""Hx23 3/4""W Media Storage Unit",242.94,3,0.0,4.8588
3,8223,CA-2016-152905,2016-02-18,2016-02-24,Standard Class,AB-10015,Aaron Bergman,Consumer,United States,Arlington,...,76017.0,Central,OFF-ST-10000321,Office Supplies,Storage,Akro Stacking Bins,12.624,2,0.2,-2.5248
4,8802,CA-2018-140935,2018-11-10,2018-11-12,First Class,AB-10015,Aaron Bergman,Consumer,United States,Oklahoma City,...,73120.0,Central,TEC-PH-10000562,Technology,Phones,Samsung Convoy 3,221.98,2,0.0,62.1544


In [8]:
# Summarise each column's dtype and non-null count to spot missing values
# (e.g. 'Postal Code' shows 9983 non-null values out of 9994 rows in this run).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   A`              9994 non-null   int64         
 1   Order ID        9994 non-null   object        
 2   Order Date      9994 non-null   datetime64[ns]
 3   Ship Date       9994 non-null   datetime64[ns]
 4   Ship Mode       9994 non-null   object        
 5   Customer ID     9994 non-null   object        
 6   Customer Name   9994 non-null   object        
 7   Segment         9994 non-null   object        
 8   Country/Region  9994 non-null   object        
 9   City            9994 non-null   object        
 10  State           9994 non-null   object        
 11  Postal Code     9983 non-null   float64       
 12  Region          9994 non-null   object        
 13  Product ID      9994 non-null   object        
 14  Category        9994 non-null   object        
 15  Sub-

In [None]:
# 📦 Step 1 (optional): Install PyCaret
# Left commented out; uncomment the next line if PyCaret is not installed in this environment.
# !pip install pycaret --quiet


In [None]:
# 🧪 Step 2: Import libraries
import pandas as pd
from pycaret.regression import setup, compare_models, save_model, pull

In [14]:
# 📥 Step 3: Load your dataset
# This re-reads the same Excel file that was loaded earlier in the notebook,
# resetting `df` to the full table before the column selection in the next step.
df = pd.read_excel('/datasets/Sample_Superstore.xlsx')  # Replace with the path to your own Excel file


In [15]:
# 🧹 Step 4: Restrict the frame to the model inputs plus the 'Sales' target
selected_columns = [
    'Country/Region', 'City', 'State',
    'Category', 'Sub-Category', 'Product Name',
    'Sales',
]
# Rows with a missing value in any selected column are dropped in the same chain.
df = df[selected_columns].dropna()


In [17]:
# 🧠 Step 5: Initialise the PyCaret regression experiment
# All six predictor columns are declared as categorical features; 'Sales' is the target.
reg_categorical_features = [
    'Country/Region', 'City', 'State', 'Category', 'Sub-Category', 'Product Name'
]
reg = setup(
    data=df,
    target='Sales',
    categorical_features=reg_categorical_features,
    normalize=True,
    session_id=42,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Sales
2,Target type,Regression
3,Original data shape,"(9994, 7)"
4,Transformed data shape,"(9994, 25)"
5,Transformed train set shape,"(6995, 25)"
6,Transformed test set shape,"(2999, 25)"
7,Categorical features,6
8,Preprocess,True
9,Imputation type,simple


In [19]:
%%time
# 🚀 Step 6: Train and select the best regression model
best_model = compare_models()
results = pull()
print(results)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
par,Passive Aggressive Regressor,145.0732,211772.1242,428.2635,0.538,1.1691,2.8211,0.374
gbr,Gradient Boosting Regressor,135.101,211231.6501,427.1187,0.5373,0.9501,1.6672,1.261
huber,Huber Regressor,142.2021,213808.1282,430.5726,0.5318,1.1264,2.6626,0.424
en,Elastic Net,158.0628,215177.3712,433.0336,0.5309,1.3316,3.8601,0.363
lasso,Lasso Regression,160.7472,212245.4525,428.81,0.5303,1.2895,3.6951,0.427
llar,Lasso Least Angle Regression,160.75,212245.9475,428.8111,0.5303,1.2895,3.6953,0.404
br,Bayesian Ridge,161.5685,212493.1286,429.1021,0.5295,1.3,3.7453,0.422
ridge,Ridge Regression,161.6082,212541.9839,429.158,0.5294,1.3002,3.7462,0.376
lr,Linear Regression,162.0159,212721.7742,429.3096,0.5291,1.3017,3.7819,1.795
rf,Random Forest Regressor,137.0031,215625.8801,430.3965,0.5285,0.8938,1.4801,4.738


                                    Model           MAE           MSE  \
par          Passive Aggressive Regressor  1.450732e+02  2.117721e+05   
gbr           Gradient Boosting Regressor  1.351010e+02  2.112317e+05   
huber                     Huber Regressor  1.422021e+02  2.138081e+05   
en                            Elastic Net  1.580628e+02  2.151774e+05   
lasso                    Lasso Regression  1.607472e+02  2.122455e+05   
llar         Lasso Least Angle Regression  1.607500e+02  2.122459e+05   
br                         Bayesian Ridge  1.615685e+02  2.124931e+05   
ridge                    Ridge Regression  1.616082e+02  2.125420e+05   
lr                      Linear Regression  1.620159e+02  2.127218e+05   
rf                Random Forest Regressor  1.370031e+02  2.156259e+05   
omp           Orthogonal Matching Pursuit  1.582357e+02  2.154262e+05   
knn                 K Neighbors Regressor  1.371480e+02  2.183109e+05   
et                  Extra Trees Regressor  1.399693

In [20]:
# 💾 Step 7: Persist the selected model (with its preprocessing pipeline) to disk
# Only the base name is passed; the confirmation message refers to the file as 'sales_model.pkl'.
model_name = 'sales_model'
save_model(best_model, model_name)

print("✅ Model training complete. Model saved as 'sales_model.pkl'")


Transformation Pipeline and Model Successfully Saved
✅ Model training complete. Model saved as 'sales_model.pkl'


## Pre-training a model for a classification task using scikit-learn

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import joblib


In [None]:
# Narrow the frame to the classifier inputs plus the 'Product Name' label
label_col = 'Product Name'
input_cols = ['Country/Region', 'City', 'State', 'Category', 'Sub-Category', 'Sales']
df = df[input_cols + [label_col]].copy()

# Features are every column except the label; the label is the product name
X = df.drop(columns=label_col)
y = df[label_col]


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns grouped by how they are preprocessed
numeric_cols = ['Sales']
categorical_cols = ['Country/Region', 'City', 'State', 'Category', 'Sub-Category']

# Numeric branch: fill missing values with the column mean
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' avoids errors on unseen categories
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch (categorical columns first)
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numeric_transformer, numeric_cols),
])


In [51]:
%%time
# Combine preprocessing with Random Forest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# Hyperparameters to tune
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [10, 20, None],
    'clf__min_samples_split': [2, 5],
    'clf__min_samples_leaf': [1, 2]
}

# Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 24 candidates, totalling 72 fits


**Note:** Training a model with hyperparameter tuning takes a long time — the grid search above runs 72 fits (24 candidates × 3 folds).

In [52]:
# Persist the best pipeline found by the grid search
best_pipeline = grid_search.best_estimator_
joblib.dump(best_pipeline, 'rf_product_classifier.pkl')
print("✅ Tuned Random Forest model saved as 'rf_product_classifier.pkl'")


✅ Tuned Random Forest model saved as 'rf_product_classifier.pkl'
