In [57]:
# Import libraries
import pandas as pd
import numpy as np
import re
import datetime as dt


In [58]:
# Import data
# Load the raw sales export; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('supermarket_sales.csv')  
# Display the last row as a quick sanity check of the load.
df.tail(1)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
999,849-09-3807,A,Yangon,Member,Female,Fashion accessories,88.34,7,30.919,649.299,2/18/2019,13:28,Cash,618.38,4.761905,30.919,6.6


In [59]:
# Row count of the freshly loaded frame, followed by its column labels
row_count = len(df)
print(row_count)
df.columns

1000


Index(['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender',
       'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date',
       'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income',
       'Rating'],
      dtype='object')

In [60]:
# Quick overview of data
# Lists each column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Invoice ID               1000 non-null   object 
 1   Branch                   1000 non-null   object 
 2   City                     1000 non-null   object 
 3   Customer type            1000 non-null   object 
 4   Gender                   1000 non-null   object 
 5   Product line             1000 non-null   object 
 6   Unit price               1000 non-null   float64
 7   Quantity                 1000 non-null   int64  
 8   Tax 5%                   1000 non-null   float64
 9   Total                    1000 non-null   float64
 10  Date                     1000 non-null   object 
 11  Time                     1000 non-null   object 
 12  Payment                  1000 non-null   object 
 13  cogs                     1000 non-null   float64
 14  gross margin percentage  

In [61]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Unit price,Quantity,Tax 5%,Total,cogs,gross margin percentage,gross income,Rating
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,55.67213,5.51,15.379369,322.966749,307.58738,4.761905,15.379369,6.9727
std,26.494628,2.923431,11.708825,245.885335,234.17651,6.131498e-14,11.708825,1.71858
min,10.08,1.0,0.5085,10.6785,10.17,4.761905,0.5085,4.0
25%,32.875,3.0,5.924875,124.422375,118.4975,4.761905,5.924875,5.5
50%,55.23,5.0,12.088,253.848,241.76,4.761905,12.088,7.0
75%,77.935,8.0,22.44525,471.35025,448.905,4.761905,22.44525,8.5
max,99.96,10.0,49.65,1042.65,993.0,4.761905,49.65,10.0


In [62]:
# Print the frequency of every distinct value, one column at a time
for column_name in df.columns:
    print(f"Column: {column_name}")
    frequency = df[column_name].value_counts()
    print(frequency)
    print("\n")


Column: Invoice ID
750-67-8428    1
642-61-4706    1
816-72-8853    1
491-38-3499    1
322-02-2271    1
              ..
633-09-3463    1
374-17-3652    1
378-07-7001    1
433-75-6987    1
849-09-3807    1
Name: Invoice ID, Length: 1000, dtype: int64


Column: Branch
A    340
B    332
C    328
Name: Branch, dtype: int64


Column: City
Yangon       340
Mandalay     332
Naypyitaw    328
Name: City, dtype: int64


Column: Customer type
Member    501
Normal    499
Name: Customer type, dtype: int64


Column: Gender
Female    501
Male      499
Name: Gender, dtype: int64


Column: Product line
Fashion accessories       178
Food and beverages        174
Electronic accessories    170
Sports and travel         166
Home and lifestyle        160
Health and beauty         152
Name: Product line, dtype: int64


Column: Unit price
83.77    3
39.62    2
24.74    2
19.15    2
73.47    2
        ..
57.95    1
47.65    1
42.82    1
48.09    1
88.34    1
Name: Unit price, Length: 943, dtype: int64


Colum

In [63]:
# Rename columns to Python-friendly names (lowercase, underscores, no "%")
# Keep a reference to the original labels; they are passed to clean_column_names() below.
columns = df.columns

def clean_column_names(columns):
    """Return lowercase, underscore-separated versions of the given column labels.

    Each label is lowercased, every space becomes an underscore, and every
    "%" is spelled out as "pct" (e.g. "Tax 5%" -> "tax_5pct").

    Parameters
    ----------
    columns : iterable of str
        Original column labels.

    Returns
    -------
    list of str
        Cleaned labels, in the same order as the input.
    """
    return [
        label.lower().replace(" ", "_").replace("%", "pct")
        for label in columns
    ]


# Replace the DataFrame's labels with the cleaned ones, then display them to confirm.
df.columns = clean_column_names(columns) 
df.columns

Index(['invoice_id', 'branch', 'city', 'customer_type', 'gender',
       'product_line', 'unit_price', 'quantity', 'tax_5pct', 'total', 'date',
       'time', 'payment', 'cogs', 'gross_margin_percentage', 'gross_income',
       'rating'],
      dtype='object')

In [64]:
# Count missing values per column. Nothing is removed here: this cell only
# reports the counts (the recorded output shows zero nulls in every column).
df.isnull().sum()

invoice_id                 0
branch                     0
city                       0
customer_type              0
gender                     0
product_line               0
unit_price                 0
quantity                   0
tax_5pct                   0
total                      0
date                       0
time                       0
payment                    0
cogs                       0
gross_margin_percentage    0
gross_income               0
rating                     0
dtype: int64

In [65]:
# Check df before more transformations
# Show the first record with the renamed columns, before any date/time parsing.
df.iloc[0]

invoice_id                       750-67-8428
branch                                     A
city                                  Yangon
customer_type                         Member
gender                                Female
product_line               Health and beauty
unit_price                             74.69
quantity                                   7
tax_5pct                             26.1415
total                               548.9715
date                                1/5/2019
time                                   13:08
payment                              Ewallet
cogs                                  522.83
gross_margin_percentage             4.761905
gross_income                         26.1415
rating                                   9.1
Name: 0, dtype: object

In [66]:
# Record the dtypes of the raw date/time columns before parsing them
initial_type = df[["date", "time"]].dtypes
print(f"initial dtype is = {initial_type}")

# Parse the month/day/year strings once, then derive the calendar fields
# used for visualization from the parsed values
parsed_dates = pd.to_datetime(df['date'], format='%m/%d/%Y')
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month_name()
df['day'] = parsed_dates.dt.day
df['weekday'] = parsed_dates.dt.day_name()

# Store 'date' as plain calendar dates (no time-of-day component)
df['date'] = parsed_dates.dt.date

# Function to handle inconsistent time format
def parse_time(time_str):
    """Convert a clock-time string into a ``datetime.time``.

    The string is first read as ``HH:MM:SS``; if that raises ``ValueError``
    it is read as ``HH:MM`` instead.

    Parameters
    ----------
    time_str : str
        Time of day, e.g. ``"13:08"`` or ``"13:08:00"``.

    Returns
    -------
    datetime.time

    Raises
    ------
    ValueError
        If the value matches neither format.
    """
    try:
        clock_time = pd.to_datetime(time_str, format='%H:%M:%S').time()
    except ValueError:
        # No seconds component: fall back to the shorter format.
        clock_time = pd.to_datetime(time_str, format='%H:%M').time()
    return clock_time

# Turn each entry of the 'time' column into a datetime.time object
df['time'] = df['time'].apply(parse_time)

# Hour of day, obtained by re-parsing the HH:MM:SS text form of each time value
time_as_text = df['time'].astype(str)
df['hour'] = pd.to_datetime(time_as_text, format='%H:%M:%S').dt.hour

# Bucket the hour into left-closed six-hour windows:
# [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, [18, 24) Evening
hour_edges = [0, 6, 12, 18, 24]
hour_labels = ['Night', 'Morning', 'Afternoon', 'Evening']
df['timeOfDay'] = pd.cut(df['hour'], bins=hour_edges, labels=hour_labels, right=False)

# Bucket the invoice total into revenue bands; intervals are right-closed
# (the pd.cut default): (0, 200], (200, 500], (500, 1000], (1000, inf)
revenue_edges = [0, 200, 500, 1000, float('inf')]
revenue_labels = ['Low', 'Medium', 'High', 'Very High']
df['revenueRange'] = pd.cut(df['total'], bins=revenue_edges, labels=revenue_labels)


initial dtype is = date    object
time    object
dtype: object


In [67]:
# Round the price, tax, total, cost, income and rating columns to 2 decimal places.
# Columns are selected by name rather than by position so this cell keeps
# working if columns are added or reordered.
# NOTE(review): gross_margin_percentage is not rounded; it was not part of the
# positional selection this replaces (which listed position 15 twice). Confirm
# whether it should be rounded as well.
ROUNDED_COLUMNS = ['unit_price', 'tax_5pct', 'total', 'cogs', 'gross_income', 'rating']
df[ROUNDED_COLUMNS] = df[ROUNDED_COLUMNS].round(2)

# Display the first row of the DataFrame
first_row = df.iloc[0]
first_row

invoice_id                       750-67-8428
branch                                     A
city                                  Yangon
customer_type                         Member
gender                                Female
product_line               Health and beauty
unit_price                             74.69
quantity                                   7
tax_5pct                               26.14
total                                 548.97
date                              2019-01-05
time                                13:08:00
payment                              Ewallet
cogs                                  522.83
gross_margin_percentage             4.761905
gross_income                           26.14
rating                                   9.1
year                                    2019
month                                January
day                                        5
weekday                             Saturday
hour                                      13
timeOfDay 

In [69]:
# Column dtypes after the date/time parsing and rounding steps.
# NOTE(review): the recorded output lists `date` as datetime64[ns], but the
# parsing cell stores `.dt.date` values (object dtype), and the execution
# counter skips 68. The output may come from an out-of-order run or a deleted
# cell; confirm with Restart Kernel -> Run All.
df.dtypes

invoice_id                         object
branch                             object
city                               object
customer_type                      object
gender                             object
product_line                       object
unit_price                        float64
quantity                            int64
tax_5pct                          float64
total                             float64
date                       datetime64[ns]
time                               object
payment                            object
cogs                              float64
gross_margin_percentage           float64
gross_income                      float64
rating                            float64
year                                int64
month                              object
day                                 int64
weekday                            object
hour                                int64
timeOfDay                        category
revenueRange                     c

In [78]:
# Changing dtypes for better Tableau experience:
# Descriptive fields and calendar parts are converted to pandas categoricals.
# NOTE(review): the export cell writes a CSV, which is plain text and does not
# carry pandas dtypes; confirm this conversion is needed for the Tableau workflow.
CATEGORY_COLUMNS = [
    'branch', 'city', 'customer_type', 'gender', 'product_line', 'payment',
    'weekday', 'month', 'year', 'day', 'hour',
]
for column_name in CATEGORY_COLUMNS:
    df[column_name] = df[column_name].astype('category')

# Print the first record straight from df instead of relying on a snapshot
# variable captured in an earlier cell, so this cell only depends on df.
print(df.iloc[0])
df.dtypes

invoice_id                       750-67-8428
branch                                     A
city                                  Yangon
customer_type                         Member
gender                                Female
product_line               Health and beauty
unit_price                             74.69
quantity                                   7
tax_5pct                               26.14
total                                 548.97
date                              2019-01-05
time                                13:08:00
payment                              Ewallet
cogs                                  522.83
gross_margin_percentage             4.761905
gross_income                           26.14
rating                                   9.1
year                                    2019
month                                January
day                                        5
weekday                             Saturday
hour                                      13
timeOfDay 

invoice_id                         object
branch                           category
city                             category
customer_type                    category
gender                           category
product_line                     category
unit_price                        float64
quantity                            int64
tax_5pct                          float64
total                             float64
date                       datetime64[ns]
time                               object
payment                          category
cogs                              float64
gross_margin_percentage           float64
gross_income                      float64
rating                            float64
year                             category
month                            category
day                              category
weekday                          category
hour                             category
timeOfDay                        category
revenueRange                     c

In [82]:
# Find rows whose gross_margin_percentage is not a valid number:
# errors='coerce' turns unparseable entries into NaN, which isnull() then flags
margin_as_number = pd.to_numeric(df['gross_margin_percentage'], errors='coerce')
is_not_numeric = margin_as_number.isnull()
non_numeric_rows = df[is_not_numeric]

# Show the offending rows (an empty frame means every value is numeric)
print(non_numeric_rows)


Empty DataFrame
Columns: [invoice_id, branch, city, customer_type, gender, product_line, unit_price, quantity, tax_5pct, total, date, time, payment, cogs, gross_margin_percentage, gross_income, rating, year, month, day, weekday, hour, timeOfDay, revenueRange]
Index: []

[0 rows x 24 columns]


In [87]:
# Lift pandas' limit on the number of columns shown when displaying a frame.
pd.set_option('display.max_columns', None)
# Number of records per month.
df.month.value_counts()

January     352
March       345
February    303
Name: month, dtype: int64

In [77]:
# Write the cleaned frame to CSV without the row index.
# NOTE(review): this cell's execution counter (77) is lower than that of the
# cells above it (78, 82, 87), so the saved file may predate them; re-run the
# notebook top to bottom before relying on the export.
df.to_csv('clean_data_supermarket.csv', index=False)