# Chapter 27 - Project: Filling missing product names in the sales data

In [2]:
import pandas as pd

In [3]:
# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame},
# one entry per worksheet in the workbook
monthly_sheets = pd.read_excel('data/Q1Sales.xlsx', sheet_name=None)

# Stack all worksheets into a single frame with a fresh 0..n-1 index
sales_df = pd.concat(monthly_sheets, ignore_index=True)

# Product catalogue used later to look up product names by 'ProductID'
products_df = pd.read_csv('data/products.csv')

sales_df.head()

Unnamed: 0,InvoiceNo,Channel,Product Name,ProductID,Account,AccountNo,Date,Deadline,Currency,Unit Price,Quantity,Total
0,1532,Shoppe.com,Cannon Water Bomb Balloons 100 Pack,T&G/CAN-97509,Sales,5004,2020-01-01,11/23/19,USD,20.11,14,281.54
1,1533,Walcart,LEGO Ninja Turtles Stealth Shell in Pursuit 79102,T&G/LEG-37777,Sales,5004,2020-01-01,06/15/20,USD,6.7,1,6.7
2,1534,Bullseye,,T&G/PET-14209,Sales,5004,2020-01-01,05/07/20,USD,11.67,5,58.35
3,1535,Bullseye,Transformers Age of Extinction Generations Del...,T&G/TRA-20170,Sales,5004,2020-01-01,12/22/19,USD,13.46,6,80.76
4,1535,Bullseye,Transformers Age of Extinction Generations Del...,T&G/TRA-20170,Sales,5004,2020-01-01,12/22/19,USD,13.46,6,80.76


In [4]:
# Preview the first rows of the product catalogue loaded from data/products.csv
products_df.head()

Unnamed: 0,ProductID,Product Name,Brand,Category
0,MI/SNA-81654,"Snark SN-5 Tuner for Guitar, Bass and Violin",Snark,Musical Instruments
1,MI/STU-67796,Studio Microphone Mic Wind Screen Pop Filter/ ...,Generic,Musical Instruments
2,MI/MUS-73312,Musician's Gear Tubular Guitar Stand Black,Musician's Gear,Musical Instruments
3,MI/STR-01505,String Swing CC01K Hardwood Home & Studio Guit...,String Swing,Musical Instruments
4,MI/DUN-82082,"Dunlop 5005 Pick Holder, Each (5005)",Jim Dunlop,Musical Instruments


In [5]:
# Count the missing values in each column of the sales data
sales_df.isna().sum()

InvoiceNo          0
Channel            0
Product Name    4566
ProductID          0
Account            0
AccountNo          0
Date               0
Deadline           0
Currency           0
Unit Price         0
Quantity           0
Total              0
dtype: int64

In [6]:
# Count the missing values in each column of the product catalogue
products_df.isna().sum()

ProductID       0
Product Name    0
Brand           6
Category        0
dtype: int64

In [7]:
# Only the 'ProductID' -> 'Product Name' lookup is needed from here on,
# so keep those two columns and leave 'Brand' and 'Category' behind
products_df = products_df.loc[:, ['ProductID', 'Product Name']]

In [8]:
# For every sales row, flag whether its 'ProductID' also appears in products_df
# (the result is a boolean Series with one entry per sales row)
sales_df['ProductID'].isin(products_df['ProductID'])

0        True
1        True
2        True
3        True
4        True
         ... 
37703    True
37704    True
37705    True
37706    True
37707    True
Name: ProductID, Length: 37708, dtype: bool

In [9]:
# Collapse the per-row check into a single flag: True only if every
# 'ProductID' in sales_df is also present in products_df
sales_df['ProductID'].isin(products_df['ProductID']).all()

True

In [10]:
# Peek at the first three sales rows before merging
sales_df.head(3)

Unnamed: 0,InvoiceNo,Channel,Product Name,ProductID,Account,AccountNo,Date,Deadline,Currency,Unit Price,Quantity,Total
0,1532,Shoppe.com,Cannon Water Bomb Balloons 100 Pack,T&G/CAN-97509,Sales,5004,2020-01-01,11/23/19,USD,20.11,14,281.54
1,1533,Walcart,LEGO Ninja Turtles Stealth Shell in Pursuit 79102,T&G/LEG-37777,Sales,5004,2020-01-01,06/15/20,USD,6.7,1,6.7
2,1534,Bullseye,,T&G/PET-14209,Sales,5004,2020-01-01,05/07/20,USD,11.67,5,58.35


In [11]:
# Peek at the trimmed product catalogue ('ProductID' and 'Product Name' only)
products_df.head(3)

Unnamed: 0,ProductID,Product Name
0,MI/SNA-81654,"Snark SN-5 Tuner for Guitar, Bass and Violin"
1,MI/STU-67796,Studio Microphone Mic Wind Screen Pop Filter/ ...
2,MI/MUS-73312,Musician's Gear Tubular Guitar Stand Black


In [12]:
# Attach the catalogue product name to every sales row through the shared
# 'ProductID' key. Both frames have a 'Product Name' column, so the suffixes
# tell them apart: '-Sales' for the left frame, '-Products' for the right.
# validate='many_to_one' makes pandas raise if a 'ProductID' is not unique
# in products_df.
sales_df = sales_df.merge(
    products_df,
    on='ProductID',
    suffixes=('-Sales', '-Products'),
    validate='many_to_one',
)

sales_df.head(3)

Unnamed: 0,InvoiceNo,Channel,Product Name-Sales,ProductID,Account,AccountNo,Date,Deadline,Currency,Unit Price,Quantity,Total,Product Name-Products
0,1532,Shoppe.com,Cannon Water Bomb Balloons 100 Pack,T&G/CAN-97509,Sales,5004,2020-01-01,11/23/19,USD,20.11,14,281.54,Cannon Water Bomb Balloons 100 Pack
1,1533,Walcart,LEGO Ninja Turtles Stealth Shell in Pursuit 79102,T&G/LEG-37777,Sales,5004,2020-01-01,06/15/20,USD,6.7,1,6.7,LEGO Ninja Turtles Stealth Shell in Pursuit 79102
2,1534,Bullseye,,T&G/PET-14209,Sales,5004,2020-01-01,05/07/20,USD,11.67,5,58.35,Pete the Cat and His Four Groovy Buttons Puppet


To combine the two columns into a single one, let’s define a custom function that chooses a valid product name from the two options:

In [14]:
# Re-count missing values per column now that the merge has added
# 'Product Name-Products' next to 'Product Name-Sales'
sales_df.isna().sum()

InvoiceNo                   0
Channel                     0
Product Name-Sales       4566
ProductID                   0
Account                     0
AccountNo                   0
Date                        0
Deadline                    0
Currency                    0
Unit Price                  0
Quantity                    0
Total                       0
Product Name-Products       0
dtype: int64

In [15]:
def combine_product_names(row):
    """Pick the product name to keep for a single merged sales row.

    Parameters
    ----------
    row : mapping-like (for example a pandas Series)
        Must provide the keys 'Product Name-Sales' and 'Product Name-Products'.

    Returns
    -------
    The 'Product Name-Sales' value when it is not missing (as judged by
    pandas), otherwise the 'Product Name-Products' value.
    """
    sales_name = row['Product Name-Sales']

    # Fall back to the catalogue name only when the sales record has none
    if pd.isna(sales_name):
        return row['Product Name-Products']

    return sales_name

In [16]:
# Run the function once per row (axis='columns' hands each row to it) to
# preview the combined names; sales_df itself is not modified here
sales_df.apply(combine_product_names, axis='columns')

0                      Cannon Water Bomb Balloons 100 Pack
1        LEGO Ninja Turtles Stealth Shell in Pursuit 79102
2          Pete the Cat and His Four Groovy Buttons Puppet
3        Transformers Age of Extinction Generations Del...
4        Transformers Age of Extinction Generations Del...
                               ...                        
37703    Nature's Bounty Garlic, 2000mg, Odor-Free, 120...
37704                        Funko Wonder Woman POP Heroes
37705    MONO GS1 GS1-BTY-BLK-L Betty Long Guitar Strap...
37706    Magic: the Gathering - Striking Sliver - Magic...
37707             3 Collapsible Bowl Set 32oz | 16oz | 4oz
Length: 37708, dtype: object

The last step in combining product names is to assign the output
above to a new column in `sales_df` and remove its previous
product name columns:

In [18]:
# Build the combined names row by row with the custom function
combined_names = sales_df.apply(combine_product_names, axis='columns')

# Add them as a single 'Product Name' column (appended at the end) and
# drop the two suffixed columns it replaces
sales_df = (
    sales_df
    .assign(**{'Product Name': combined_names})
    .drop(columns=['Product Name-Sales', 'Product Name-Products'])
)

sales_df.head(3)

Unnamed: 0,InvoiceNo,Channel,ProductID,Account,AccountNo,Date,Deadline,Currency,Unit Price,Quantity,Total,Product Name
0,1532,Shoppe.com,T&G/CAN-97509,Sales,5004,2020-01-01,11/23/19,USD,20.11,14,281.54,Cannon Water Bomb Balloons 100 Pack
1,1533,Walcart,T&G/LEG-37777,Sales,5004,2020-01-01,06/15/20,USD,6.7,1,6.7,LEGO Ninja Turtles Stealth Shell in Pursuit 79102
2,1534,Bullseye,T&G/PET-14209,Sales,5004,2020-01-01,05/07/20,USD,11.67,5,58.35,Pete the Cat and His Four Groovy Buttons Puppet


In [19]:
# Confirm that no missing values remain after combining the name columns
sales_df.isna().sum()

InvoiceNo       0
Channel         0
ProductID       0
Account         0
AccountNo       0
Date            0
Deadline        0
Currency        0
Unit Price      0
Quantity        0
Total           0
Product Name    0
dtype: int64

In [20]:
# Write the cleaned sales data to a new workbook, one worksheet per month.

# Work out each row's month name once, up front, rather than recomputing it
# on every loop iteration. Doing it before the writer is opened also means a
# problem with the 'Date' column surfaces before the output file is touched.
month_names = sales_df['Date'].dt.month_name()

# Opens an ExcelWriter object to write to a file
with pd.ExcelWriter('data/Q1SalesClean.xlsx') as outfile:

    # Split the rows by month name; sort=False keeps the sheets in the order
    # the months first appear in the data instead of alphabetical order
    for month_name, sheet_df in sales_df.groupby(month_names, sort=False):

        # Writes that month's rows to the sheet named after the month
        sheet_df.to_excel(outfile, sheet_name=month_name, index=False)