In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw regional sales data; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("US_Regional_Sales_Data.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Convert each of the four date columns to pandas datetimes in place so the
# `.dt` accessor can be used on them afterwards.
for date_col in ("ProcuredDate", "OrderDate", "ShipDate", "DeliveryDate"):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Inspect the frame's column summary after the datetime conversion.
df.info()

In [None]:
# Derive a weekday-name feature (e.g. "Monday") from every date column; the new
# columns are named `day_of_week_<source column>`.
for date_col in ('ProcuredDate', 'OrderDate', 'ShipDate', 'DeliveryDate'):
    df[f'day_of_week_{date_col}'] = df[date_col].dt.day_name()


In [None]:
# Preview the frame with the new day-of-week columns.
df.head()

In [None]:
# Re-inspect the column summary now that the day-of-week columns exist.
df.info()

In [None]:
# Both money columns are handled as strings here: strip the commas and convert
# the result to numbers.
for money_col in ('Unit Cost', 'Unit Price'):
    without_commas = df[money_col].str.replace(',', '')
    df[money_col] = pd.to_numeric(without_commas)

In [None]:
# Confirm that 'Unit Cost' and 'Unit Price' are numeric after the conversion.
df.info()

In [None]:
# The order identifier and the raw date columns are not used as model inputs
# (the weekday features derived from the dates are kept), so remove them from
# `df` in place.
drop_columns = [
    'OrderNumber',
    'ProcuredDate',
    'OrderDate',
    'ShipDate',
    'DeliveryDate',
]

df.drop(columns=drop_columns, inplace=True)


In [None]:
# List the columns that remain after the drop.
df.columns

In [None]:
columns_to_encode = ['Sales Channel', 'WarehouseCode','CurrencyCode','day_of_week_ProcuredDate', 'day_of_week_OrderDate',
       'day_of_week_ShipDate', 'day_of_week_DeliveryDate']

# Fit a separate LabelEncoder per column and keep each one, so every integer
# code can be mapped back to its original category later via
# `label_encoders[column].inverse_transform(...)`. Re-fitting a single shared
# encoder would discard every mapping except the last column's.
# After the loop `label_encoder` still refers to the encoder fitted on the last
# column, as before.
label_encoders = {}
for column in columns_to_encode:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [None]:
# Preview the frame after label encoding.
df.head()

In [None]:
# Check the column summary once more after encoding, before plotting/modelling.
df.info()

In [None]:
# Histogram of the target. `sns.distplot` is deprecated in seaborn and slated
# for removal; `sns.histplot` is its replacement and, with kde=False, draws the
# same count histogram.
sns.histplot(df['Unit Price'], kde=False, bins=30)

In [None]:
# Annotated heatmap of the pairwise column correlations; the wide figure keeps
# the per-cell annotations legible.
plt.figure(figsize=(30, 10))
sns.heatmap(
    df.corr(),
    annot=True,
    cmap='coolwarm',
)


In [None]:
# Bar height is the standard deviation (not the mean) of 'Unit Price' within
# each 'Sales Channel', because the estimator is np.std.
sns.barplot(
    data=df,
    x='Sales Channel',
    y='Unit Price',
    estimator=np.std,
)


## Model

In [None]:
# Target: the unit price. Features: every other remaining column.
y = df['Unit Price']
X = df.drop(columns=['Unit Price'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a depth-limited random forest on the training split (`fit` returns the
# estimator itself), then predict the held-out test rows.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [None]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred) 

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

In [None]:
# `accuracy_score` is a classification metric and raises ValueError when given
# continuous targets such as 'Unit Price'. For this regression model the
# goodness-of-fit is reported as R-squared instead (`r2_score` is already
# imported at the top of the notebook), matching how the later train/validation
# "accuracy" values are computed.
y_pred = rf_model.predict(X_test)
accuracy = r2_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")


### Estimating bias and variance

In [None]:
# Predict both splits with the fitted model so that training and test error can
# be compared.
y_train_pred, y_test_pred = rf_model.predict(X_train), rf_model.predict(X_test)

In [None]:
# Train and test MSE are NOT the statistical bias and variance of the model;
# they are only rough proxies: a high training error hints at high bias
# (underfitting), and a large gap between test and training error hints at high
# variance (overfitting). The `bias` / `variance` names and values are kept
# unchanged for any later cell that reads them; only the printed labels are
# corrected and the gap is reported explicitly.
bias = mean_squared_error(y_train, y_train_pred)      # training MSE
variance = mean_squared_error(y_test, y_test_pred)    # test MSE
generalization_gap = variance - bias

print(f'Train MSE (bias proxy): {bias}')
print(f'Test MSE: {variance}')
print(f'Generalization gap, test - train MSE (variance proxy): {generalization_gap}')


### Compare training and test accuracy to detect overfitting or underfitting

In [None]:
# Plotting actual vs. predicted values
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.show()

### Training when a separate validation set is available

In [None]:
# Three-way split: 80% train, then the remaining 20% is halved into validation
# and test (10% each).
# NOTE: this rebinds `y_train`, `y_test` and `rf_model` from the earlier cells;
# the upper-case `X_train` / `X_test` from the first split are left untouched.
x_train, x_fold, y_train, y_fold = train_test_split(
    X, y, test_size=0.2, random_state=40
)
x_val, x_test, y_val, y_test = train_test_split(
    x_fold, y_fold, test_size=0.5, random_state=40
)

# Fit on the training portion only (no depth limit this time).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(x_train, y_train)

# Score the model on the validation portion; the test portion stays untouched.
y_val_pred = rf_model.predict(x_val)

r2 = r2_score(y_val, y_val_pred)
mae = mean_absolute_error(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)

# Report the validation metrics.
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')

In [None]:
# Recompute the training predictions with the current model on the current
# training split. The `y_train_pred` left over from the earlier cell was
# produced by the previous model on a different split (random_state=42), so
# scoring it against the re-split `y_train` would pair predictions with the
# wrong rows and yield a meaningless training score.
y_train_pred = rf_model.predict(x_train)

# R-squared is used as the "accuracy" of this regression model. A training
# score far above the validation score indicates overfitting.
train_accuracy = r2_score(y_train, y_train_pred)
val_accuracy = r2_score(y_val, y_val_pred)

print(f"Training Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {val_accuracy}")

In [None]:
# When the training and test data come as two separate files, no
# train_test_split is needed: load each file and split each frame into its
# features and target.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")


# Sketch of the intended mapping (kept as comments: the original `-->` lines
# are not valid Python and raised a SyntaxError):
# df_train --> X_train , y_train
# df_test --> X_test , y_test
