In [None]:
import pandas as pd

# Path to the sales CSV that this notebook analyses
file_path = "/content/sales.csv"

# Read the CSV into a DataFrame using pandas' default parsing options
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview: the first five rows plus the column labels, shown together as a tuple
(df.head(n=5), df.columns)


(   SaleID    SaleDate  SalesAmount  NumberOfProductsSold  \
 0       1  2023-01-01       357694                  4488   
 1       2  2023-01-02       284989                  4556   
 2       3  2023-02-01       266333                  3949   
 3       4  2023-03-03       288180                  3954   
 4       5  2023-04-02       306633                  3945   
 
    MarketingExpenditure   Region  
 0                 13220    North  
 1                 14884     East  
 2                 15465    South  
 3                 15487     West  
 4                 15055  Central  ,
 Index(['SaleID', 'SaleDate', 'SalesAmount', 'NumberOfProductsSold',
        'MarketingExpenditure', 'Region'],
       dtype='object'))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
# Convert SaleDate to pandas datetimes, then derive the calendar month as a separate column
df['SaleDate'] = df['SaleDate'].pipe(pd.to_datetime)
df['Month'] = df['SaleDate'].dt.month

In [None]:
# Columns used as model inputs
features = [
    'NumberOfProductsSold',
    'MarketingExpenditure',
    'Month',
    'Region',
]
# Column the model is trained to predict
target = 'SalesAmount'

In [None]:
# Separate the frame into the predictor columns (X) and the response column (y)
X, y = df[features], df[target]

In [None]:
# One-hot encode 'Region' (dropping its first category as the baseline level);
# every other column is forwarded to the next pipeline step unchanged.
# NOTE(review): the encoder keeps its default unknown-category handling, so a Region
# value not seen during fit is expected to raise at transform time — confirm this is desired.
preprocessor = ColumnTransformer(
    [('onehot', OneHotEncoder(drop='first'), ['Region'])],
    remainder='passthrough',
)


In [None]:
# Chain the column preprocessing and a linear regression into a single estimator
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit the preprocessing step and the regressor on the training split
model.fit(X=X_train, y=y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Predict the target for the held-out rows using the fitted pipeline
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error between actual and predicted values
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)


RMSE: 32378.32001825837
