# MSA 2024 Phase 2 - Part 2

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## 1. Load and split preprocessed data

In [2]:
# Read the preprocessed store-sales CSV into a DataFrame
data = pd.read_csv('D:/Download/store_sales_utf8.csv', encoding='utf-8')

# Quick inspection: first ten rows, summary statistics, then column dtypes
for overview in (data.head(10), data.describe(), data.dtypes):
    print(overview)

# Name of the column the model is trained to predict
target_column = 'Profit'

# Target vector, plus the feature matrix (every other column) with
# categorical columns one-hot encoded; drop_first=True omits the first
# level of each encoded column
y = data[target_column]
X = pd.get_dummies(data.drop(columns=[target_column]), drop_first=True)

# Hold out 30% of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

## 2. Choose an algorithm

In [None]:
# Initialize the linear regression model.
# No arguments are passed, so the estimator's default settings are used.
model = LinearRegression()


## 3. Train and test a model

In [None]:
# Fit the regressor on the training partition
model.fit(X_train, y_train)

# Predictions for both partitions
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Report the estimator's score() on each partition, training first
for score_label, features, target in (
    ("Training set accuracy (R^2):", X_train, y_train),
    ("Test set accuracy (R^2):", X_test, y_test),
):
    print(score_label, model.score(features, target))


## 4. Evaluate the model 

In [None]:
# Compute each metric for the training and test partitions
mse_train, mse_test = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
mae_train, mae_test = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_test_pred),
)
r2_train, r2_test = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)

# Print the metrics, train before test for each one
metric_report = [
    ("Train MSE:", mse_train),
    ("Test MSE:", mse_test),
    ("Train MAE:", mae_train),
    ("Test MAE:", mae_test),
    ("Train R^2:", r2_train),
    ("Test R^2:", r2_test),
]
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

# Side-by-side scatter plots of actual vs predicted values
plt.figure(figsize=(10, 5))

panels = [
    ("Train set", y_train, y_train_pred),
    ("Test set", y_test, y_test_pred),
]
for position, (panel_title, actual, predicted) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(actual, predicted)
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title(panel_title)

plt.tight_layout()
plt.show()


## 5. Summary

In this notebook, we performed the following steps:

1. Loaded the preprocessed dataset and split it into training and test sets.
2. Chose a linear regression algorithm for our model.
3. Trained the model on the training set and made predictions on both the training and test sets.
4. Evaluated the model using metrics such as MSE, MAE, and R^2.
5. Visualized the actual vs. predicted values for both the training and test sets.

Our linear regression model performed reasonably well on the test set, as indicated by the evaluation metrics. The R^2 score suggests that the model explains a significant portion of the variance in the target variable. However, there is room for improvement, which can be achieved by experimenting with different algorithms, tuning hyperparameters, and incorporating more features or data.

### Next Steps

1. Experiment with different machine learning algorithms, such as decision trees, random forests, or gradient boosting.
2. Perform hyperparameter tuning to optimize the model's performance.
3. Explore feature engineering techniques to create more informative features.
4. Gather more data to train the model, if possible.
5. Regularly validate the model's performance on new data to ensure its generalizability.

