In [None]:
# Importing required libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder



In [None]:
# Importing the dataset
# Replace 'dataset.csv' with the path to your dataset
data = pd.read_csv('supermarket_sales.csv')

In [None]:
# Displaying basic details
print("Basic Details of the Dataset:")
print("Number of rows:", data.shape[0])
print("Number of columns:", data.shape[1])
print("Data types:\n", data.dtypes)
print("\nFirst 5 rows of the dataset:\n", data.head())


Basic Details of the Dataset:
Number of rows: 1000
Number of columns: 17
Data types:
 Invoice ID                  object
Branch                      object
City                        object
Customer type               object
Gender                      object
Product line                object
Unit price                 float64
Quantity                     int64
Tax 5%                     float64
Total                      float64
Date                        object
Time                        object
Payment                     object
cogs                       float64
gross margin percentage    float64
gross income               float64
Rating                     float64
dtype: object

First 5 rows of the dataset:
     Invoice ID Branch       City Customer type  Gender  \
0  750-67-8428      A     Yangon        Member  Female   
1  226-31-3081      C  Naypyitaw        Normal  Female   
2  631-41-3108      A     Yangon        Normal    Male   
3  123-19-1176      A     Yangon        Me

In [None]:
# Separating numerical and categorical columns
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

print("\nNumerical Columns:", list(numerical_columns))
print("Categorical Columns:", list(categorical_columns))



Numerical Columns: ['Unit price', 'Quantity', 'Tax 5%', 'Total', 'cogs', 'gross margin percentage', 'gross income', 'Rating']
Categorical Columns: ['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Date', 'Time', 'Payment']


In [None]:
# Applying Min-Max Scaling (Normalization)
scaler = MinMaxScaler()
data_scaled = data.copy()  # Making a copy of the dataset for scaled values
data_scaled[numerical_columns] = scaler.fit_transform(data[numerical_columns])

print("\nDataset after applying Min-Max Scaling (first 5 rows):")
print(data_scaled.head())



Dataset after applying Min-Max Scaling (first 5 rows):
    Invoice ID Branch       City Customer type  Gender  \
0  750-67-8428      A     Yangon        Member  Female   
1  226-31-3081      C  Naypyitaw        Normal  Female   
2  631-41-3108      A     Yangon        Normal    Male   
3  123-19-1176      A     Yangon        Member    Male   
4  373-73-7910      A     Yangon        Normal    Male   

             Product line  Unit price  Quantity    Tax 5%     Total  \
0       Health and beauty    0.718847  0.666667  0.521616  0.521616   
1  Electronic accessories    0.057855  0.444444  0.067387  0.067387   
2      Home and lifestyle    0.403316  0.666667  0.319628  0.319628   
3       Health and beauty    0.535603  0.777778  0.463549  0.463549   
4       Sports and travel    0.848131  0.666667  0.604377  0.604377   

        Date   Time      Payment      cogs  gross margin percentage  \
0   1/5/2019  13:08      Ewallet  0.521616                      0.0   
1   3/8/2019  10:29       

In [None]:
# Ensure categorical columns are correctly extracted as a DataFrame
categorical_data = data_scaled[categorical_columns]

# Applying One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False, drop='first')
# Avoid multicollinearity by dropping the first category
encoded = encoder.fit_transform(categorical_data)

# Creating a DataFrame for encoded features
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=categorical_data.index)

# Concatenating encoded features with the scaled dataset
data_preprocessed = pd.concat([data_scaled.drop(categorical_columns, axis=1), encoded_df], axis=1)

print("\nDataset after applying One-Hot Encoding (first 5 rows):")
print(data_preprocessed.head())



Dataset after applying One-Hot Encoding (first 5 rows):
   Unit price  Quantity    Tax 5%     Total      cogs  \
0    0.718847  0.666667  0.521616  0.521616  0.521616   
1    0.057855  0.444444  0.067387  0.067387  0.067387   
2    0.403316  0.666667  0.319628  0.319628  0.319628   
3    0.535603  0.777778  0.463549  0.463549  0.463549   
4    0.848131  0.666667  0.604377  0.604377  0.604377   

   gross margin percentage  gross income    Rating  Invoice ID_101-81-4070  \
0                      0.0      0.521616  0.850000                     0.0   
1                      0.0      0.067387  0.933333                     0.0   
2                      0.0      0.319628  0.566667                     0.0   
3                      0.0      0.463549  0.733333                     0.0   
4                      0.0      0.604377  0.216667                     0.0   

   Invoice ID_102-06-2002  ...  Time_20:48  Time_20:50  Time_20:51  \
0                     0.0  ...         0.0         0.0       

In [None]:
print("\nOriginal Dataset (first 5 rows):")
print(data.head())

print("\nPreprocessed Dataset (first 5 rows):")
print(data_preprocessed.head())



Original Dataset (first 5 rows):
    Invoice ID Branch       City Customer type  Gender  \
0  750-67-8428      A     Yangon        Member  Female   
1  226-31-3081      C  Naypyitaw        Normal  Female   
2  631-41-3108      A     Yangon        Normal    Male   
3  123-19-1176      A     Yangon        Member    Male   
4  373-73-7910      A     Yangon        Normal    Male   

             Product line  Unit price  Quantity   Tax 5%     Total       Date  \
0       Health and beauty       74.69         7  26.1415  548.9715   1/5/2019   
1  Electronic accessories       15.28         5   3.8200   80.2200   3/8/2019   
2      Home and lifestyle       46.33         7  16.2155  340.5255   3/3/2019   
3       Health and beauty       58.22         8  23.2880  489.0480  1/27/2019   
4       Sports and travel       86.31         7  30.2085  634.3785   2/8/2019   

    Time      Payment    cogs  gross margin percentage  gross income  Rating  
0  13:08      Ewallet  522.83                 4.761

In [None]:
# Checking memory usage before and after preprocessing
original_memory = data.memory_usage(deep=True).sum()
preprocessed_memory = data_preprocessed.memory_usage(deep=True).sum()

print("\nMemory Usage Analysis:")
print(f"Memory Usage Before Preprocessing: {original_memory} bytes")
print(f"Memory Usage After Preprocessing: {preprocessed_memory} bytes")
print(f"Difference in Memory Usage: {preprocessed_memory - original_memory} bytes")



Memory Usage Analysis:
Memory Usage Before Preprocessing: 647207 bytes
Memory Usage After Preprocessing: 12904128 bytes
Difference in Memory Usage: 12256921 bytes
