In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import joblib
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the Excel workbook into a DataFrame.
# `skiprows=4` tells pandas to skip the first 4 rows of the sheet before reading the header
# (presumably the logo/title rows above the table -- verify against the workbook).
# NOTE(review): the path literal below is truncated in this copy (no file name and no
# closing quote), so this cell will not parse until the full path is restored.
file_path = '/data/
data = pd.read_excel(file_path, skiprows=4)  # skiprows=4 -> the first 4 rows are skipped

# Preview the first rows to confirm the header was picked up correctly
data.head()

In [None]:
# Model inputs and prediction target (adjust the column names to your dataset)
feature_columns = ['Price per Unit', 'Units Sold', 'Operating Profit']
target_column = 'Total Sales'

X = data[feature_columns]
y = data[target_column]

In [None]:
# Standardize the features (zero mean, unit variance per column) with StandardScaler.
# NOTE(review): the scaler is fit on every row of X here, including rows that are
# later assigned to the test split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split the RAW features first, then fit the scaler on the training rows only, so that
# test-set statistics never influence preprocessing (avoids data leakage).
# The same random_state keeps the row assignment of the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# `scaler` now holds statistics learned from the training rows only; reuse it for any new data.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Show the shapes instead of dumping the full arrays into the cell output
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Carve a validation split out of the TRAINING data for hyperparameter selection,
# so the held-out test set is never used to choose parameters and the final test
# score remains an unbiased estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)


def objective(trial):
    """Optuna objective: validation MSE of a LinearRegression for this trial's settings.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `fit_intercept` from {True, False}.

    Returns
    -------
    float
        Mean squared error on the validation split (lower is better).
    """
    # Model parameter to optimize
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])

    # Fit on the reduced training portion only
    model = LinearRegression(fit_intercept=fit_intercept)
    model.fit(X_fit, y_fit)

    # Score on the validation portion, never on the test set
    return mean_squared_error(y_val, model.predict(X_val))

# `fit_intercept` is the only tunable and has two possible values, so an exhaustive
# grid is both sufficient and reproducible (no redundant or randomly ordered trials).
search_space = {'fit_intercept': [True, False]}

# Create the Optuna study and evaluate every grid point exactly once
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.GridSampler(search_space),
)
study.optimize(objective, n_trials=len(search_space['fit_intercept']))

# Best parameters from Optuna
best_params = study.best_params
print("Best Parameters: ", best_params)

In [None]:
# Refit a LinearRegression on the training split using the parameters chosen by the study
final_model = LinearRegression(**best_params).fit(X_train, y_train)
final_model

In [None]:
# Evaluate the final model: mean squared error of its predictions on the test split
test_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, test_predictions)
print(f"Final Mean Squared Error (MSE): {final_mse}")

In [None]:
# Persist the fitted estimator to disk with joblib.
# NOTE(review): only `final_model` is written here; the fitted StandardScaler is not
# saved, so anything loading this file must scale its inputs the same way itself.
model_file_path = 'linear_regression_model.pkl'
joblib.dump(value=final_model, filename=model_file_path)

In [None]:
def compare_predictions(model, features, targets, actual_label, predicted_label):
    """Pair observed targets with the model's predictions in one DataFrame.

    Parameters
    ----------
    model : object exposing ``predict(features)``
        Fitted estimator used to generate the predictions.
    features : array-like
        Feature rows to predict on; must have as many rows as `targets`.
    targets : pandas.Series or array-like
        Observed values for the same rows.
    actual_label, predicted_label : str
        Column names for the observed and predicted values.

    Returns
    -------
    pandas.DataFrame
        Two columns (`actual_label`, `predicted_label`); when `targets` is a Series
        its index is kept, so rows remain traceable to the source data.
    """
    return pd.DataFrame({
        actual_label: targets,
        predicted_label: model.predict(features),
    })


# Load the saved model.
# NOTE: joblib files are pickle-based -- only load files you created or trust.
saved_model = joblib.load('linear_regression_model.pkl')

# First five rows of the test split (`.iloc` makes the positional slicing explicit)
example_data = X_test[:5]
actual_values = y_test.iloc[:5]

# Comparing the actual vs predicted values
comparison_df = compare_predictions(
    saved_model, example_data, actual_values, 'Actual Values', 'Predicted Values'
)
predicted_values = comparison_df['Predicted Values'].to_numpy()

# Display the comparison
print("Comparison of Actual vs Predicted Values:")
print(comparison_df)


# Further analysis: a second, fixed slice of the test split (positions 10-14).
# This is not a fresh random draw; the rows are simply taken by position.
sample_test = X_test[10:15]
sample_actual = y_test.iloc[10:15]

sample_comparison_df = compare_predictions(
    saved_model, sample_test, sample_actual, 'Actual Sales', 'Predicted Sales'
)
sample_predicted = sample_comparison_df['Predicted Sales'].to_numpy()

print("Sample Comparison for Random Test Data:")
print(sample_comparison_df)