In [13]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Read the raw (not yet cleaned) sales prediction data; the path is relative to the working directory
SALES_CSV_PATH = 'sales.csv'
sales_data = pd.read_csv(SALES_CSV_PATH)

In [4]:
# Remove fully duplicated rows, then collapse the spelling variants of the
# fat-content labels onto the two canonical values 'Low Fat' and 'Regular'
sales_data.drop_duplicates(inplace=True)
fat_content_aliases = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
sales_data['Item_Fat_Content'] = sales_data['Item_Fat_Content'].replace(fat_content_aliases)

In [5]:
# Separate the predictors (X) from the prediction target (y).
# The two identifier columns are dropped from X together with the target.
target_col = 'Item_Outlet_Sales'
id_cols = ['Item_Identifier', 'Outlet_Identifier']
X = sales_data.drop(columns=[target_col, *id_cols])
y = sales_data[target_col]

In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the shuffle reproducible
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [7]:
# Create a preprocessing object to prepare the dataset for Machine Learning.
# The numeric columns are identified in this cell because it needs them: on a
# fresh kernel (Restart & Run All) nothing above has defined `numeric_cols` yet.
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Imputation and scaling are chained in ONE pipeline. Listing them as two separate
# ColumnTransformer entries would apply each to the raw columns independently and
# concatenate both results: an imputed-but-unscaled copy next to a scaled copy that
# still contains NaN (StandardScaler keeps missing values in transform).
numeric_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                   ('scaler', StandardScaler())])

# List of (name, transformer, columns) tuples, as expected by ColumnTransformer(transformers=...)
preprocessing = [('numeric', numeric_pipeline, numeric_cols)]

In [8]:
# Make sure your imputation of missing values occurs after the train test split using SimpleImputer
from sklearn.compose import ColumnTransformer

In [9]:
# Split the training feature names by dtype: 64-bit float/int columns vs. object columns
numeric_dtypes = ['float64', 'int64']
numeric_cols = list(X_train.select_dtypes(include=numeric_dtypes).columns)
categorical_cols = list(X_train.select_dtypes(include=['object']).columns)

In [10]:
# Build the ColumnTransformer from the (name, transformer, columns) entries in `preprocessing`.
# remainder='passthrough' forwards every column not named in an entry unchanged;
# n_jobs=-1 lets the transformers run in parallel on all available processors.
# NOTE(review): the non-numeric columns therefore reach the output without encoding
# or imputation -- confirm this is intended before feeding the result to a model.
column_transformer_options = {'remainder': 'passthrough', 'n_jobs': -1}
preprocessor = ColumnTransformer(preprocessing, **column_transformer_options)

In [11]:
# Learn the preprocessing parameters from the training split only, and transform it in the same call
X_train_prep = preprocessor.fit_transform(X=X_train)

In [12]:
# Apply the already-fitted preprocessor to the test split (transform only, no refitting)
X_test_prep = preprocessor.transform(X=X_test)

In [14]:
# Sanity check: count the rows of sales_data that are exact duplicates of an earlier row
sales_data.duplicated().sum()

0

In [15]:
# Count the missing values in each column of sales_data
sales_data.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [16]:
# Preview the first rows of the feature matrix (target and identifier columns already removed)
X.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.3,Low Fat,0.016047,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1
1,5.92,Regular,0.019278,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2
2,17.5,Low Fat,0.01676,Meat,141.618,1999,Medium,Tier 1,Supermarket Type1
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,1998,,Tier 3,Grocery Store
4,8.93,Low Fat,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1
