In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the supply chain dataset
supply_chain_df = pd.read_csv('../supply_chain_data.csv')

# Drop every row that has a missing value in any column (kept simple on
# purpose; imputation would be the alternative if too many rows are lost)
supply_chain_df = supply_chain_df.dropna()

# Input columns and the regression target ('Shipping costs')
features = ['Lead times', 'Stock levels', 'Order quantities', 'Shipping times', 'Shipping carriers']
target = 'Shipping costs'

# 'Shipping carriers' is the only categorical input; every other feature is
# treated as numeric. Deriving the numeric list from `features` keeps the two
# lists from drifting apart when a feature is added or removed.
categorical_features = ['Shipping carriers']
numeric_features = [col for col in features if col not in categorical_features]

# Preprocessing: one-hot encode the categorical column, standardize the numeric ones
column_transformer = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a carrier that only shows up in the test
        # split (or in later data) is encoded as all zeros instead of making
        # predict() raise ValueError, which the encoder's default setting does.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numeric_features),
    ],
    remainder='passthrough'  # No effect today: every selected feature is listed above
)

# Split data into features (X) and target variable (y)
X = supply_chain_df[features]
y = supply_chain_df[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Chain preprocessing and the model so the transformers are fitted on the
# training rows only and then re-applied unchanged at prediction time
pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model using the pipeline
pipeline.fit(X_train, y_train)

# Predict shipping costs for the held-out rows
y_pred = pipeline.predict(X_test)

# Evaluate the model's performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE) for Shipping Costs Prediction: {mse}')

Mean Squared Error (MSE) for Shipping Costs Prediction: 7.167927578356152


In [37]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import joblib

# Read the raw supply chain table from disk
df = pd.read_csv('../supply_chain_data.csv')

# The model predicts 'Revenue generated'; every remaining column is an input
target = 'Revenue generated'
y = df[target]
features = df.drop(columns=[target])
X = features

# Partition the input columns by dtype: object columns are treated as
# categorical, everything else as numerical
numerical = list(X.select_dtypes(exclude='object').columns)
categorical = list(X.select_dtypes(include='object').columns)

# Preprocessing: standardize the numerical columns and one-hot encode the
# categorical ones (categories not seen during fit are ignored at transform time)
scale_step = ('num', StandardScaler(), numerical)
encode_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
preprocessor = ColumnTransformer(transformers=[scale_step, encode_step])

# Full pipeline: preprocessing followed by a seeded 100-tree random forest
forest = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', forest),
])

# Fit on all rows (no hold-out split in this cell) and persist the fitted
# pipeline; joblib.dump stays the last expression so the cell still displays
# the list of written file names
pipeline.fit(X, y)
joblib.dump(pipeline, 'supply_chain_model.pkl')


['supply_chain_model.pkl']