In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error

In [2]:
file_path = '/content/DS_ML Coding Challenge Dataset (1).xlsx'
train_df = pd.read_excel(file_path, sheet_name='Training Dataset')
test_df = pd.read_excel(file_path, sheet_name='Test Dataset')

In [3]:
train_df['Month of Sourcing'] = pd.to_datetime(train_df['Month of Sourcing'])
test_df['Month of Sourcing'] = pd.to_datetime(test_df['Month of Sourcing'])

In [4]:
X_train = train_df.drop(['Sourcing Cost'], axis=1)
y_train = train_df['Sourcing Cost']
X_test = test_df.drop(['Sourcing Cost'], axis=1)
y_test = test_df['Sourcing Cost']

In [5]:
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [6]:
categorical_features = X_train.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [8]:
lr_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
rmse_lr = mean_squared_error(y_test, lr_pred, squared=False)
print("RMSE for Linear Regression:", rmse_lr)

RMSE for Linear Regression: 41.35721899199446


In [9]:
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor())])
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rmse_rf = mean_squared_error(y_test, rf_pred, squared=False)
print("RMSE for Random Forest:", rmse_rf)

RMSE for Random Forest: 38.52652718116699


In [10]:
ets_model = ExponentialSmoothing(y_train, trend=None, seasonal=None, seasonal_periods=None, damped=False)
ets_fit = ets_model.fit(smoothing_level=0.6)
ets_pred = ets_fit.forecast(len(X_test))
rmse_ets = mean_squared_error(y_test, ets_pred, squared=False)
print("RMSE for Exponential Smoothing:", rmse_ets)

  ets_model = ExponentialSmoothing(y_train, trend=None, seasonal=None, seasonal_periods=None, damped=False)


RMSE for Exponential Smoothing: 61.89972058287732
