In [1]:
import pandas as pd
import os

from ucimlrepo import fetch_ucirepo 
import re

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

os.makedirs('data', exist_ok=True)

### **Fetching the Dataset**

In [2]:
# Fetch the Online Retail dataset from UCI Machine Learning Repository
online_retail = fetch_ucirepo(id=352) 

In [3]:
# dataset keys
online_retail.keys()

dict_keys(['data', 'metadata', 'variables'])

In [4]:
# variables information
online_retail.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,InvoiceNo,ID,Categorical,,"a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation",,no
1,StockCode,ID,Categorical,,a 5-digit integral number uniquely assigned to each distinct product,,no
2,Description,Feature,Categorical,,product name,,no
3,Quantity,Feature,Integer,,the quantities of each product (item) per transaction,,no
4,InvoiceDate,Feature,Date,,the day and time when each transaction was generated,,no
5,UnitPrice,Feature,Continuous,,product price per unit,sterling,no
6,CustomerID,Feature,Categorical,,a 5-digit integral number uniquely assigned to each customer,,no
7,Country,Feature,Categorical,,the name of the country where each customer resides,,no


In [5]:
# metadata 
online_retail.metadata

{'uci_id': 352,
 'name': 'Online Retail',
 'repository_url': 'https://archive.ics.uci.edu/dataset/352/online+retail',
 'data_url': 'https://archive.ics.uci.edu/static/public/352/data.csv',
 'abstract': 'This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.',
 'area': 'Business',
 'tasks': ['Classification', 'Clustering'],
 'characteristics': ['Multivariate', 'Sequential', 'Time-Series'],
 'num_instances': 541909,
 'num_features': 6,
 'feature_types': ['Integer', 'Real'],
 'demographics': [],
 'target_col': None,
 'index_col': ['InvoiceNo', 'StockCode'],
 'has_missing_values': 'no',
 'missing_values_symbol': None,
 'year_of_dataset_creation': 2015,
 'last_updated': 'Mon Oct 21 2024',
 'dataset_doi': '10.24432/C5BW33',
 'creators': ['Daqing Chen'],
 'intro_paper': {'ID': 361,
  'type': 'NATIVE',
  'title': 'Data mining for the online retail industry: A case study of RFM model

In [6]:
# data keys
online_retail.data.keys()

dict_keys(['ids', 'features', 'targets', 'original', 'headers'])

In [7]:
# column headers
online_retail.data.headers

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [8]:
# original data
df = online_retail.data.original
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


### **Checking Datatypes**

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [10]:
# Convert CustomerID to object type
df = df.astype({'CustomerID': 'object'})
# Convert InvoiceDate to datetime type
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  object        
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 33.1+ MB


In [11]:
# Statistical summary of numerical columns
df.select_dtypes(include=['int64', 'float64']).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,541909.0,9.55225,218.081158,-80995.0,1.0,3.0,10.0,80995.0
UnitPrice,541909.0,4.611114,96.759853,-11062.06,1.25,2.08,4.13,38970.0


In [12]:
# Statistical summary of categorical columns
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
InvoiceNo,541909.0,25900.0,573585,1114.0
StockCode,541909.0,4070.0,85123A,2313.0
Description,540455.0,4223.0,WHITE HANGING HEART T-LIGHT HOLDER,2369.0
CustomerID,406829.0,4372.0,17841.0,7983.0
Country,541909.0,38.0,United Kingdom,495478.0


### **Cleaning Dataset**

#### *1. Handling Missing Data*

In [13]:
# Count the number of missing values per column
missing_count = df.isnull().sum()

# Calculate the percentage of missing values per column
missing_percentage = round((df.isnull().sum() / len(df)) * 100, 2)

In [14]:
print("Missing Data Count:")
print(missing_count)

print("\nMissing Data Percentage:")
print(missing_percentage)

Missing Data Count:
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

Missing Data Percentage:
InvoiceNo       0.00
StockCode       0.00
Description     0.27
Quantity        0.00
InvoiceDate     0.00
UnitPrice       0.00
CustomerID     24.93
Country         0.00
dtype: float64


**Note:**  
*It is important to have accurate Customer IDs for customer segmentation and recommendation system because clustering is based upon the behavior of each unique customer. Hence, instead of imputing, we decided to remove rows with missing Customer IDs.*

In [15]:
print(f"Number of rows before removing missing CustomerID: {df.shape[0]}")

# Remove rows where CustomerID is missing
df = df.dropna(subset=['CustomerID'])

print(f"Number of rows after removing missing CustomerID: {df.shape[0]}")

Number of rows before removing missing CustomerID: 541909
Number of rows after removing missing CustomerID: 406829


In [16]:
# Verifying the removal of missing values
missing_count_after = df.isnull().sum()
print("Missing Data Count After Removal:")
print(missing_count_after)

Missing Data Count After Removal:
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64


#### *2. Handling Duplicates*

In [17]:
# Count the number of duplicate rows
duplicate_count = df.duplicated().sum()

print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 5225


**Note:**  
*These are completely identical rows, including unique identifiers like InvoiceNo, StockCode, and CustomerID and dates like InvoiceDate. Hence, they are most likely be data recording errors that we should remove.*

In [18]:
print(f"Number of rows before removing duplicates: {df.shape[0]}")

# Remove duplicate rows
df.drop_duplicates(inplace=True)

print(f"Number of rows after removing duplicates: {df.shape[0]}")

Number of rows before removing duplicates: 406829
Number of rows after removing duplicates: 401604


#### *3. Checking InvoiceNo*

##### *3.a. Checking Number of Characters in InvoiceNo*

In [19]:
# Count the number of numeric characters in each InvoiceNo
df_play = df.copy()
df_play['InvoiceNo_NumericLength'] = df_play['InvoiceNo'].apply(lambda x: sum(c.isdigit() for c in str(x)))

# Count unique InvoiceNo based on their numeric length
numeric_length_counts = df_play.groupby('InvoiceNo_NumericLength')['InvoiceNo'].nunique()

print("Unique InvoiceNo counts based on numeric length:")
display(numeric_length_counts)

Unique InvoiceNo counts based on numeric length:


InvoiceNo_NumericLength
6    22190
Name: InvoiceNo, dtype: int64

In [20]:
# Count the character length of each InvoiceNo
df_play['InvoiceNo_Length'] = df_play['InvoiceNo'].apply(lambda x: len(str(x)))

# Count unique InvoiceNo based on their character length
invoice_length_counts = df_play.groupby('InvoiceNo_Length')['InvoiceNo'].nunique()

print("Unique InvoiceNo counts based on character length:")
display(invoice_length_counts)

Unique InvoiceNo counts based on character length:


InvoiceNo_Length
6    18536
7     3654
Name: InvoiceNo, dtype: int64

##### *3.b. Handling Cancelled Transactions based on InvoiceNo*

In [21]:
# Add a "Status" column based on the "InvoiceNo" column
df['Status'] = df['InvoiceNo'].apply(lambda x: 'Cancelled' if str(x).startswith('C') else 'Completed')

df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,Completed
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,Completed
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,Completed
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,Completed
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,Completed


In [22]:
# Filter data for Completed and Cancelled statuses
df_completed = df[df['Status'] == 'Completed']
df_cancelled = df[df['Status'] == 'Cancelled']

print("Numerical Statistics for Status = Completed:")
display(df_completed.select_dtypes(include=['int64', 'float64']).describe().T)

print("\nNumerical Statistics for Status = Cancelled:")
display(df_cancelled.select_dtypes(include=['int64', 'float64']).describe().T)

Numerical Statistics for Status = Completed:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,392732.0,13.153718,181.58842,1.0,2.0,6.0,12.0,80995.0
UnitPrice,392732.0,3.125596,22.240725,0.0,1.25,1.95,3.75,8142.75



Numerical Statistics for Status = Cancelled:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,8872.0,-30.77491,1172.249902,-80995.0,-6.0,-2.0,-1.0,-1.0
UnitPrice,8872.0,18.899512,445.190864,0.01,1.45,2.95,4.95,38970.0


In [23]:
# Row with maximum Quantity
max_quantity_row = df[df['Quantity'] == df['Quantity'].max()]
print("Row with Maximum Quantity:")
display(max_quantity_row)

# Row with minimum Quantity
min_quantity_row = df[df['Quantity'] == df['Quantity'].min()]
print("\nRow with Minimum Quantity:")
display(min_quantity_row)

Row with Maximum Quantity:


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
540421,581483,23843,"PAPER CRAFT , LITTLE BIRDIE",80995,2011-12-09 09:15:00,2.08,16446.0,United Kingdom,Completed



Row with Minimum Quantity:


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
540422,C581484,23843,"PAPER CRAFT , LITTLE BIRDIE",-80995,2011-12-09 09:27:00,2.08,16446.0,United Kingdom,Cancelled


In [24]:
# Calculate the percentage of Cancelled transactions
cancelled_percentage = round((len(df_cancelled) / len(df)) * 100,2)

print(f"Percentage of Cancelled Transactions: {cancelled_percentage:.2f}%")

Percentage of Cancelled Transactions: 2.21%


#### *4. Checking StockCode*

##### *4.a. Checking Number of Characters in StockCode*

In [25]:
# Count the number of numeric characters in each StockCode
df_play = df.copy()
df_play['StockCode_NumericLength'] = df_play['StockCode'].apply(lambda x: sum(c.isdigit() for c in str(x)))

# Count unique StockCode based on their numeric length
stockcode_numeric_length_counts = df_play.groupby('StockCode_NumericLength')['StockCode'].nunique()

print("Unique StockCode counts based on numeric length:")
display(stockcode_numeric_length_counts)

Unique StockCode counts based on numeric length:


StockCode_NumericLength
0       7
1       1
5    3676
Name: StockCode, dtype: int64

In [26]:
# Filter rows where StockCode_NumericLength is not equal to 5
filtered_data = df_play[df_play['StockCode_NumericLength'] != 5]

# Group by StockCode and Description
grouped_data = filtered_data.groupby(['StockCode', 'Description']).agg(
    Unique_InvoiceNo_Cancelled=('InvoiceNo', lambda x: x[filtered_data['Status'] == 'Cancelled'].nunique()),
    Unique_InvoiceNo_Completed=('InvoiceNo', lambda x: x[filtered_data['Status'] == 'Completed'].nunique()),
    Quantity_Cancelled=('Quantity', lambda x: x[filtered_data['Status'] == 'Cancelled'].abs().sum()),
    Quantity_Completed=('Quantity', lambda x: x[filtered_data['Status'] == 'Completed'].sum()),
    Price_Cancelled=('UnitPrice', lambda x: (x[filtered_data['Status'] == 'Cancelled'] * filtered_data.loc[filtered_data['Status'] == 'Cancelled', 'Quantity']).sum()),
    Price_Completed=('UnitPrice', lambda x: (x[filtered_data['Status'] == 'Completed'] * filtered_data.loc[filtered_data['Status'] == 'Completed', 'Quantity']).sum())
).reset_index()

# Calculate Net Quantity (Completed - Cancelled)
grouped_data['Net_Quantity'] = grouped_data['Quantity_Completed'] - grouped_data['Quantity_Cancelled']

# Calculate Net Unit Price (Completed - Cancelled)
grouped_data['Net_Price'] = grouped_data['Price_Completed'] + grouped_data['Price_Cancelled']

# Display the grouped data
print("Rows with StockCode_NumericLength not equal to 5 and their aggregated metrics:")
display(grouped_data)

Rows with StockCode_NumericLength not equal to 5 and their aggregated metrics:


Unnamed: 0,StockCode,Description,Unique_InvoiceNo_Cancelled,Unique_InvoiceNo_Completed,Quantity_Cancelled,Quantity_Completed,Price_Cancelled,Price_Completed,Net_Quantity,Net_Price
0,BANK CHARGES,Bank Charges,0,11,0,12,0.0,165.001,12,165.001
1,C2,CARRIAGE,1,133,1,134,-50.0,6686.0,133,6636.0
2,CRUK,CRUK Commission,16,0,16,0,-7933.43,0.0,-16,-7933.43
3,D,Discount,65,0,1194,0,-5696.22,0.0,-1194,-5696.22
4,DOT,DOTCOM POSTAGE,0,16,0,16,0.0,11906.36,16,11906.36
5,M,Manual,154,258,3995,6939,-112165.39,53419.93,2944,-58745.46
6,PADS,PADS TO MATCH ALL CUSHIONS,0,4,0,4,0.0,0.003,4,0.003
7,POST,POSTAGE,95,1099,118,3120,-11093.72,77803.96,3002,66710.24


**Note:**  

- ***BankCharges:** Fees charged by the bank for processing transactions, such as credit card fees or other financial services.*

- ***Carriage:** Shipping or delivery charges for transporting goods to the customer.*

- ***CRUK Commission:** Commission fee related to Cancer Research UK (CRUK), a charity organization. It might be a fee or donation associated with a transaction.*

- ***Discount:** Reduction in the price of goods or services, often applied as a promotion or incentive.*

- ***DOTCOM Postage:** Postage or shipping charges for online (e-commerce) orders, often labeled as "dotcom" to distinguish from in-store purchases.*

- ***Manual:** Manually entered transaction, possibly for adjustments, corrections, or special cases.*

- ***Pads to match all cushions:** Cushion pads or inserts designed to match a specific set of cushions, possibly sold as complementary items.*

- ***Postage:** Cost of shipping or mailing goods to the customer.*

In [27]:
print(f"Number of rows before removing StockCode_NumericLength <> 5: {df.shape[0]}")

# Remove rows where StockCode_NumericLength is not equal to 5
df = df[df['StockCode'].apply(lambda x: sum(c.isdigit() for c in str(x))) == 5]

print(f"Number of rows after removing StockCode_NumericLength <> 5: {df.shape[0]}")

Number of rows before removing StockCode_NumericLength <> 5: 401604


Number of rows after removing StockCode_NumericLength <> 5: 399689


#### *5. Checking Description*

##### *5.a. Handling Descriptions with lower-case letters*

In [28]:
# Filter rows where Description contains lowercase letters
filtered_data = df[df['Description'].str.contains('[a-z]', na=False)]

# Group by Description and StockCode
grouped_data = filtered_data.groupby(['Description', 'StockCode']).agg(
    Unique_InvoiceNo=('InvoiceNo', 'nunique'),
    Quantity_Completed=('Quantity', lambda x: x[df.loc[x.index, 'Status'] == 'Completed'].sum()),
    Quantity_Cancelled=('Quantity', lambda x: x[df.loc[x.index, 'Status'] == 'Cancelled'].abs().sum())
).reset_index()

# Calculate Net Quantity (Completed - Cancelled)
grouped_data['Net_Quantity'] = grouped_data['Quantity_Completed'] - grouped_data['Quantity_Cancelled']

# Display the grouped data
print("Descriptions with lowercase letters and their metrics:")
display(grouped_data)

Descriptions with lowercase letters and their metrics:


Unnamed: 0,Description,StockCode,Unique_InvoiceNo,Quantity_Completed,Quantity_Cancelled,Net_Quantity
0,3 TRADITIONAl BISCUIT CUTTERS SET,22965,206,1154,26,1128
1,BAG 125g SWIRLY MARBLES,21703,248,9272,86,9186
2,BAG 250g SWIRLY MARBLES,21704,200,5004,5,4999
3,BAG 500g SWIRLY MARBLES,21705,113,1377,20,1357
4,ESSENTIAL BALM 3.5g TIN IN ENVELOPE,18007,17,5856,0,5856
5,FLOWERS HANDBAG blue and orange,16151A,3,49,0,49
6,"FOLK ART GREETING CARD,pack/12",84247K,10,22,11,11
7,FRENCH BLUE METAL DOOR SIGN No,22686,75,859,24,835
8,High Resolution Image,23702,3,4,0,4
9,NUMBER TILE COTTAGE GARDEN No,22878,12,52,1,51


**Note:**  

*Remove "High Resolution Image" and "Next Day Carriage"*

In [29]:
print(f"Number of rows before removing specific descriptions: {df.shape[0]}")

# Remove rows with specific descriptions
descriptions_to_remove = ["High Resolution Image", "Next Day Carriage"]
df = df[~df['Description'].isin(descriptions_to_remove)]

print(f"Number of rows after removing specific descriptions: {df.shape[0]}")

Number of rows before removing specific descriptions: 399689
Number of rows after removing specific descriptions: 399606


**Note:**  

*Convert all descriptions to uppercase for consistency*

In [30]:
# Convert all descriptions to uppercase
df['Description'] = df['Description'].str.upper()

In [31]:
# Filter rows where Description contains lowercase letters
filtered_data = df[df['Description'].str.contains('[a-z]', na=False)]

# Group by Description and StockCode
grouped_data = filtered_data.groupby(['Description', 'StockCode']).agg(
    Unique_InvoiceNo=('InvoiceNo', 'nunique'),
    Quantity_Completed=('Quantity', lambda x: x[df.loc[x.index, 'Status'] == 'Completed'].sum()),
    Quantity_Cancelled=('Quantity', lambda x: x[df.loc[x.index, 'Status'] == 'Cancelled'].abs().sum())
).reset_index()

# Calculate Net Quantity (Completed - Cancelled)
grouped_data['Net_Quantity'] = grouped_data['Quantity_Completed'] - grouped_data['Quantity_Cancelled']

# Display the grouped data
print("Descriptions with lowercase letters and their metrics:")
display(grouped_data)

Descriptions with lowercase letters and their metrics:


Unnamed: 0,Description,StockCode,Unique_InvoiceNo,Quantity_Completed,Quantity_Cancelled,Net_Quantity


##### *5.b. Minor Modifications on Punctuations in Description for Consistency*

In [32]:
# Filter rows where Description contains punctuations
punctuation_descriptions = df[df['Description'].str.contains(r'[^\w\s]', na=False)]

# Get unique descriptions with punctuations
unique_punctuation_descriptions = punctuation_descriptions['Description'].unique()

# Display the unique descriptions
print("Unique Descriptions with Punctuations:")
print(unique_punctuation_descriptions)

Unique Descriptions with Punctuations:
['WHITE HANGING HEART T-LIGHT HOLDER' 'RED WOOLLY HOTTIE WHITE HEART.'
 'GLASS STAR FROSTED T-LIGHT HOLDER' "POPPY'S PLAYHOUSE BEDROOM "
 "POPPY'S PLAYHOUSE KITCHEN" 'SET/2 RED RETROSPOT TEA TOWELS '
 "PAPER CHAIN KIT 50'S CHRISTMAS " 'VINTAGE BILLBOARD LOVE/HATE MUG'
 'WOOD S/3 CABINET ANT WHITE FINISH' 'RED HANGING HEART T-LIGHT HOLDER'
 'SET/6 RED SPOTTY PAPER PLATES' 'TOMATO CHARLIE+LOLA COASTER SET'
 'CHARLIE & LOLA WASTEPAPER BIN FLORA'
 'RED CHARLIE+LOLA PERSONAL DOORSIGN' 'AIRLINE LOUNGE,METAL SIGN'
 "YOU'RE CONFUSING ME METAL SIGN " 'GIN + TONIC DIET METAL SIGN'
 'VINTAGE SNAKES & LADDERS' 'COLOUR GLASS T-LIGHT HOLDER HANGING'
 'BLACK/BLUE POLKADOT UMBRELLA' 'HANGING HEART ZINC T-LIGHT HOLDER'
 'SET/20 RED RETROSPOT PAPER NAPKINS ' 'SET/6 RED SPOTTY PAPER CUPS'
 'FANCY FONT BIRTHDAY CARD, ' 'S/6 SEW ON CROCHET FLOWERS'
 'LADIES & GENTLEMEN METAL SIGN' "I'M ON HOLIDAY METAL SIGN"
 'SET/10 PINK POLKADOT PARTY CANDLES' 'SET OF 6 T-LIGHTS SNO

**Note:**  

*Ensure only one space between words in the Description*

In [33]:
# Check for rows with multiple spaces in the Description
multiple_spaces = df[df['Description'].str.contains(r'\s{2,}', regex=True)]

# Count occurrences of multiple spaces
print("Occurrences of multiple spaces in the Description:")
print(multiple_spaces.shape[0])

Occurrences of multiple spaces in the Description:
16231


In [34]:
multiple_spaces.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
87,536378,85099C,JUMBO BAG BAROQUE BLACK WHITE,10,2010-12-01 09:37:00,1.95,14688.0,United Kingdom,Completed
128,536381,84832,ZINC WILLIE WINKIE CANDLE STICK,1,2010-12-01 09:41:00,0.85,15311.0,United Kingdom,Completed
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom,Cancelled
176,536386,85099C,JUMBO BAG BAROQUE BLACK WHITE,100,2010-12-01 09:57:00,1.65,16029.0,United Kingdom,Completed
187,536388,21411,GINGHAM HEART DOORSTOP RED,3,2010-12-01 09:59:00,4.25,16250.0,United Kingdom,Completed


In [35]:
# Replace multiple spaces with a single space in the Description
df['Description'] = df['Description'].str.replace(r'\s+', ' ', regex=True)

In [36]:
# Check for rows with multiple spaces in the Description
multiple_spaces = df[df['Description'].str.contains(r'\s{2,}', regex=True)]

# Count occurrences of multiple spaces
print("Occurrences of multiple spaces in the Description:")
print(multiple_spaces.shape[0])

Occurrences of multiple spaces in the Description:
0


**Note:**  

*Remove dot at the end of the Description*

In [37]:
# Check for rows where Description ends with a dot
trailing_dots = df[df['Description'].str.endswith('.')]

# Count occurrences of trailing dots
print("Occurrences of trailing dots in the Description:")
print(trailing_dots.shape[0])

Occurrences of trailing dots in the Description:
1466


In [38]:
trailing_dots.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,Completed
62,536373,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 09:02:00,3.39,17850.0,United Kingdom,Completed
79,536375,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 09:32:00,3.39,17850.0,United Kingdom,Completed
292,536396,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 10:51:00,3.39,17850.0,United Kingdom,Completed
413,536404,20727,LUNCH BAG BLACK SKULL.,10,2010-12-01 11:29:00,1.65,16218.0,United Kingdom,Completed


In [39]:
df['Description'] = df['Description'].str.replace(r'\.$', '', regex=True)

In [40]:
# Check for rows where Description ends with a dot
trailing_dots = df[df['Description'].str.endswith('.')]

# Count occurrences of trailing dots
print("Occurrences of trailing dots in the Description:")
print(trailing_dots.shape[0])

Occurrences of trailing dots in the Description:
0


**Note:**  

*Ensure no space before punctuation*

In [41]:
# Check for rows with spaces before punctuation
spaces_before_punctuation = df[df['Description'].str.contains(r'\s[,!?]', regex=True)]

# Count occurrences of spaces before punctuation
print("Occurrences of spaces before punctuation in the Description:")
print(spaces_before_punctuation.shape[0])

Occurrences of spaces before punctuation in the Description:
930


In [42]:
spaces_before_punctuation.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
1980,C536548,22245,"HOOK, 1 HANGER ,MAGIC GARDEN",-2,2010-12-01 14:33:00,0.85,12472.0,Germany,Cancelled
3370,536623,22245,"HOOK, 1 HANGER ,MAGIC GARDEN",24,2010-12-02 10:39:00,0.85,15601.0,United Kingdom,Completed
4211,536754,21899,"KEY FOB , GARAGE DESIGN",4,2010-12-02 14:09:00,0.65,14449.0,United Kingdom,Completed
4215,536754,21902,"KEY FOB , FRONT DOOR",4,2010-12-02 14:09:00,0.65,14449.0,United Kingdom,Completed
4216,536754,21901,"KEY FOB , BACK DOOR",4,2010-12-02 14:09:00,0.65,14449.0,United Kingdom,Completed


In [43]:
# Ensure no space before punctuation and one space after punctuation
df['Description'] = df['Description'].str.replace(r'\s+([,!?])', r'\1', regex=True)  # Remove space before punctuation

In [44]:
# Check for rows with spaces before punctuation
spaces_before_punctuation = df[df['Description'].str.contains(r'\s[,!?]', regex=True)]

# Count occurrences of spaces before punctuation
print("Occurrences of spaces before punctuation in the Description:")
print(spaces_before_punctuation.shape[0])

Occurrences of spaces before punctuation in the Description:
0


**Note:**  

*Ensure one space after punctuation*

In [45]:
# Check for rows with missing space after punctuation
missing_space_after_punctuation = df[df['Description'].str.contains(r'[,!?][^\s]', regex=True)]

# Count occurrences of missing space after punctuation
print("Occurrences of missing space after punctuation in the Description:")
print(missing_space_after_punctuation.shape[0])

Occurrences of missing space after punctuation in the Description:
1357


In [46]:
missing_space_after_punctuation.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
109,536381,82567,"AIRLINE LOUNGE,METAL SIGN",2,2010-12-01 09:41:00,2.1,15311.0,United Kingdom,Completed
1980,C536548,22245,"HOOK, 1 HANGER,MAGIC GARDEN",-2,2010-12-01 14:33:00,0.85,12472.0,Germany,Cancelled
2104,536559,51014C,"FEATHER PEN,COAL BLACK",24,2010-12-01 14:54:00,0.85,17873.0,United Kingdom,Completed
2105,536559,51014L,"FEATHER PEN,LIGHT PINK",12,2010-12-01 14:54:00,0.85,17873.0,United Kingdom,Completed
2107,536559,51014C,"FEATHER PEN,COAL BLACK",12,2010-12-01 14:54:00,0.85,17873.0,United Kingdom,Completed


In [47]:
df['Description'] = df['Description'].str.replace(r'([,!?])\s*', r'\1 ', regex=True)  # Ensure one space after punctuation

In [48]:
# Check for rows with missing space after punctuation
missing_space_after_punctuation = df[df['Description'].str.contains(r'[,!?][^\s]', regex=True)]

# Count occurrences of missing space after punctuation
print("Occurrences of missing space after punctuation in the Description:")
print(missing_space_after_punctuation.shape[0])

Occurrences of missing space after punctuation in the Description:
0


**Note:**  

*Remove Leading and Trailing Whitespaces*

In [49]:
# Check for rows with leading or trailing white spaces
leading_trailing_spaces = df[df['Description'].str.match(r'^\s|\s$', na=False)]

# Count occurrences of leading or trailing white spaces
print("Occurrences of leading or trailing white spaces in the Description:")
print(leading_trailing_spaces.shape[0])

Occurrences of leading or trailing white spaces in the Description:
933


In [50]:
leading_trailing_spaces.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
40,536370,22900,SET 2 TEA TOWELS I LOVE LONDON,24,2010-12-01 08:45:00,2.95,12583.0,France,Completed
521,536409,22900,SET 2 TEA TOWELS I LOVE LONDON,1,2010-12-01 11:45:00,2.95,17908.0,United Kingdom,Completed
611,536412,22900,SET 2 TEA TOWELS I LOVE LONDON,2,2010-12-01 11:49:00,2.95,17920.0,United Kingdom,Completed
641,536415,22900,SET 2 TEA TOWELS I LOVE LONDON,3,2010-12-01 11:57:00,2.95,12838.0,United Kingdom,Completed
927,536500,22900,SET 2 TEA TOWELS I LOVE LONDON,7,2010-12-01 12:35:00,2.95,17377.0,United Kingdom,Completed


In [51]:
# Remove leading and trailing white spaces
df['Description'] = df['Description'].str.strip()

In [52]:
# Check for rows with leading or trailing white spaces
leading_trailing_spaces = df[df['Description'].str.match(r'^\s|\s$', na=False)]

# Count occurrences of leading or trailing white spaces
print("Occurrences of leading or trailing white spaces in the Description:")
print(leading_trailing_spaces.shape[0])

Occurrences of leading or trailing white spaces in the Description:
0


**Note:**  

*Changing "+", "&" to "AND"*

In [53]:
# Filter rows where Description contains "+"
descriptions_with_symbols = df[df['Description'].str.contains(r'[+]', na=False)]

# Get unique descriptions with "+"
unique_descriptions_with_symbols = descriptions_with_symbols['Description'].unique()

# Display the unique descriptions
print("Unique Descriptions with '+':")
print(unique_descriptions_with_symbols)

Unique Descriptions with '+':
['TOMATO CHARLIE+LOLA COASTER SET' 'RED CHARLIE+LOLA PERSONAL DOORSIGN'
 'GIN + TONIC DIET METAL SIGN' 'REX CASH+CARRY JUMBO SHOPPER'
 'CHARLIE + LOLA RED HOT WATER BOTTLE' 'RETROSPOT PARTY BAG + STICKER SET'
 '200 RED + WHITE BENDY STRAWS' 'OFFICE MUG WARMER BLACK+SILVER'
 'BLUE CHARLIE+LOLA PERSONAL DOORSIGN' 'CHARLIE+LOLA"EXTREMELY BUSY" SIGN'
 'PAINTED LIGHTBULB STAR+ MOON' 'CERAMIC CAKE STAND + HANGING CAKES'
 'OFFICE MUG WARMER CHOC+BLUE' 'SMALL HANGING GLASS+ZINC LANTERN'
 'CHARLIE+LOLA PINK HOT WATER BOTTLE' 'CHARLIE+LOLA RED HOT WATER BOTTLE'
 'RIBBON REEL FLORA + FAUNA' 'LADYBIRD + BEE RAFFIA FOOD COVER'
 'CHARLIE+LOLA MY ROOM DOOR SIGN' 'COFFEE MUG CAT + BIRD DESIGN'
 'CHARLIE + LOLA BISCUITS TINS' 'SKULLS PARTY BAG + STICKER SET'
 'CARROT CHARLIE+LOLA COASTER SET' 'BLACK+WHITE NECKLACE W TASSEL'
 'AMBER CHUNKY GLASS+BEAD NECKLACE' 'CERAMIC CAKE BOWL + HANGING CAKES'
 'DINOSAUR PARTY BAG + STICKER SET' 'COFFEE MUG DOG + BALL DESIGN'
 'NECKLACE+B

In [54]:
# Count occurrences of unique descriptions with "+"
print("Count of Unique Descriptions with '+':")
print(len(unique_descriptions_with_symbols))

Count of Unique Descriptions with '+':
52


In [55]:
# Replace " + " with " AND "
df['Description'] = df['Description'].str.replace(r'\s\+\s', ' AND ', regex=True)
# Replace "+" with " AND "
df['Description'] = df['Description'].str.replace(r'\+', ' AND ', regex=True)
# Replace " +" with " AND "
df['Description'] = df['Description'].str.replace(r'\s\+', ' AND ', regex=True)
# Replace "+ " with " AND "
df['Description'] = df['Description'].str.replace(r'\+\s', ' AND ', regex=True)

In [56]:
# Filter rows where Description contains "+"
descriptions_with_symbols = df[df['Description'].str.contains(r'[+]', na=False)]

# Get unique descriptions with "+"
unique_descriptions_with_symbols = descriptions_with_symbols['Description'].unique()

# Count occurrences of unique descriptions with "+"
print("Count of Unique Descriptions with '+':")
print(len(unique_descriptions_with_symbols))

Count of Unique Descriptions with '+':
0


In [57]:
# Filter rows where Description contains "&"
descriptions_with_symbols = df[df['Description'].str.contains(r'[&]', na=False)]

# Get unique descriptions with "&"
unique_descriptions_with_symbols = descriptions_with_symbols['Description'].unique()

# Display the unique descriptions
print("Unique Descriptions with '&':")
print(unique_descriptions_with_symbols)

Unique Descriptions with '&':
['CHARLIE & LOLA WASTEPAPER BIN FLORA' 'VINTAGE SNAKES & LADDERS'
 'LADIES & GENTLEMEN METAL SIGN' 'PINK & WHITE BREAKFAST TRAY'
 'GIN & TONIC DIET GREETING CARD' 'BOOZE & WOMEN GREETING CARD'
 'LARGE HANGING IVORY & RED WOOD BIRD' 'CINAMMON & ORANGE WREATH'
 'EUCALYPTUS & PINECONE WREATH' 'TEATIME PEN CASE & PENS'
 'PIG KEYRING WITH LIGHT & SOUND' 'BLUE & WHITE BREAKFAST TRAY'
 'FOLDING CAMPING SCISSOR W/KNIF & S' 'CHARLIE & LOLA WASTEPAPER BIN BLUE'
 'PEARL & SHELL 42"NECKL. GREEN' 'POLKADOT COFFEE CUP & SAUCER PINK'
 'BOX/12 CHICK & EGG IN BASKET' 'HEN HOUSE W FAMILY IN BARN & NEST'
 'PINK CHERRY BLOSSOM CUP & SAUCER' 'VINTAGE ENAMEL & CRYSTAL EARRINGS'
 'PINK & WHITE ROSEBUD RING' 'PINK ROSEBUD & PEARL NECKLACE'
 'WHITE ROSEBUD & PEARL NECKLACE' 'PEARL & SHELL 42"NECKL. IVORY'
 'VINTAGE ENAMEL & CRYSTAL NECKLACE' 'PLAYING CARDS KEEP CALM & CARRY ON'
 'HORSE & PONY WALL ART' 'WALL ART HORSE & PONY'
 'DROP EARRINGS W FLOWER & LEAF' 'TRADITIONAL NAUGHTS &

In [58]:
# Count occurrences of unique descriptions with "&"
print("Count of Unique Descriptions with '&':")
print(len(unique_descriptions_with_symbols))

Count of Unique Descriptions with '&':
30


In [59]:
# Replace " & " with " AND "
df['Description'] = df['Description'].str.replace(r'\s&\s', ' AND ', regex=True)

In [60]:
# Filter rows where Description contains "&"
descriptions_with_symbols = df[df['Description'].str.contains(r'[&]', na=False)]

# Get unique descriptions with "&"
unique_descriptions_with_symbols = descriptions_with_symbols['Description'].unique()

# Count occurrences of unique descriptions with "&"
print("Count of Unique Descriptions with '&':")
print(len(unique_descriptions_with_symbols))

Count of Unique Descriptions with '&':
0


**Note:**  

*Changing "W" to "WITH"*

In [61]:
# Filter rows where Description contains " W "
descriptions_with_w = df[df['Description'].str.contains(r'\sW\s', na=False)]

# Get unique descriptions with " W "
unique_descriptions_with_w = descriptions_with_w['Description'].unique()

# Display the unique descriptions
print("Unique Descriptions with ' W ':")
print(unique_descriptions_with_w)

Unique Descriptions with ' W ':
['SET 3 WICKER OVAL BASKETS W LIDS' "PINK B'FLY C/COVER W BOBBLES"
 'BLACK AND WHITE NECKLACE W TASSEL' 'BLUE/GREEN SHELL NECKLACE W PENDANT'
 'FRENCH ENAMEL POT W LID' 'NEW ENGLAND MUG W GIFT BOX'
 'BLACK FINE BEAD NECKLACE W TASSEL' 'HAND TOWEL PALE BLUE W FLOWERS'
 'BLUE ORGANDY ROUND LAMPSHADE W BEA'
 'AMETHYST CHUNKY BEAD BRACELET W STR'
 'AMBER CHUNKY BEAD BRACELET W STRAP' 'HEN HOUSE W CHICK STANDING'
 'FINE SILVER NECKLACE W PASTEL FLOWE' 'AMBER FINE BEAD NECKLACE W TASSEL'
 'LAZER CUT NECKLACE W PASTEL BEADS' 'PINK SCOTTIE DOG W FLOWER PATTERN'
 'BLUE SCOTTIE DOG W FLOWER PATTERN' 'BLACK CHUNKY BEAD BRACELET W STRAP'
 'RESIN BRACELET W PASTEL BEADS' 'RESIN NECKLACE W PASTEL BEADS'
 'HEN HOUSE W FAMILY IN BARN AND NEST'
 'BLACK GLASS BRACELET W HEART CHARMS' 'ELEPHANT CLIP W SUCTION CUP'
 'PURPLE AMETHYST NECKLACE W TASSEL' 'BLUE/NAT SHELL NECKLACE W PENDANT'
 'SILVER/NAT SHELL NECKLACE W PENDANT' 'JADE DROP EARRINGS W FILIGREE'
 'SILVER BRACELET

In [62]:
# Count occurrences of unique descriptions with " W "
print("Count of Unique Descriptions with ' W ':")
print(len(unique_descriptions_with_w))

Count of Unique Descriptions with ' W ':
41


In [63]:
# Replace " W " with " WITH "
df['Description'] = df['Description'].str.replace(r'\sW\s', ' WITH ', regex=True)

In [64]:
# Filter rows where Description contains " W "
descriptions_with_w = df[df['Description'].str.contains(r'\sW\s', na=False)]

# Get unique descriptions with " W "
unique_descriptions_with_w = descriptions_with_w['Description'].unique()

# Count occurrences of unique descriptions with " W "
print("Count of Unique Descriptions with ' W ':")
print(len(unique_descriptions_with_w))

Count of Unique Descriptions with ' W ':
0


**Note:**  

*Changing "/" to "OR"*

In [65]:
# Filter rows where Description contains "/" but not followed by a number
descriptions_with_slash = df[df['Description'].str.contains(r'/\D', na=False)]

# Get unique descriptions with "/"
unique_descriptions_with_slash = descriptions_with_slash['Description'].unique()

# Display the unique descriptions
print("Unique Descriptions with '/' not followed by a number:")
print(unique_descriptions_with_slash)

Unique Descriptions with '/' not followed by a number:
['VINTAGE BILLBOARD LOVE/HATE MUG' 'BLACK/BLUE POLKADOT UMBRELLA'
 "PINK B'FLY C/COVER WITH BOBBLES" 'FOLDING UMBRELLA WHITE/RED POLKADOT'
 'FOLDING UMBRELLA RED/WHITE POLKADOT' 'PACK 3 FIRE ENGINE/CAR PATCHES'
 'SMALL HANGING IVORY/RED WOOD BIRD' 'PINK/WHITE CHRISTMAS TREE 60CM'
 'PINK/YELLOW FLOWERS HANDBAG' 'FOOD/DRINK SPONGE STICKERS'
 'BLUE/CREAM STRIPE CUSHION COVER' 'PINK/GREEN FLOWER DESIGN BIG MUG'
 'BLUE/YELLOW FLOWER DESIGN BIG MUG' 'GREEN/BLUE FLOWER DESIGN BIG MUG'
 'YELLOW/PINK FLOWER DESIGN BIG MUG'
 'BLUE/GREEN SHELL NECKLACE WITH PENDANT' 'SILVER/NATURAL SHELL NECKLACE'
 'PINK/WHITE RIBBED MELAMINE JUG' 'PACK 6 HEART/ICE-CREAM PATCHES'
 'RED ROSE AND LACE C/COVER' 'FOLDING CAMPING SCISSOR W/KNIF AND S'
 'PINK/WHITE "KEEP CLEAN" BULLET BIN' 'FLOWER PURPLE CLOCK W/SUCKER'
 'ORIGAMI ROSE INCENSE/CANDLE SET' 'ORIGAMI JASMINE INCENSE/CANDLE SET'
 'ORIGAMI OPIUM INCENSE/CANDLE SET' 'ORIGAMI SANDLEWOOD INCENSE/CAND SET'
 

In [66]:
# Count occurrences of unique descriptions with "/"
print("Count of Unique Descriptions with '/' not followed by a number:")
print(len(unique_descriptions_with_slash))

Count of Unique Descriptions with '/' not followed by a number:
80


In [67]:
# Replace "/" with " OR " where "/" is not followed by a number
df['Description'] = df['Description'].str.replace(r'/(?=\D)', ' OR ', regex=True)

In [68]:
# Filter rows where Description contains "/" but not followed by a number
descriptions_with_slash = df[df['Description'].str.contains(r'/\D', na=False)]

# Get unique descriptions with "/"
unique_descriptions_with_slash = descriptions_with_slash['Description'].unique()

# Count occurrences of unique descriptions with "/"
print("Count of Unique Descriptions with '/' not followed by a number:")
print(len(unique_descriptions_with_slash))

Count of Unique Descriptions with '/' not followed by a number:
0


**Note:**  

*Changing "S/" to "SET "*

In [69]:
# Filter rows where Description contains "/"
descriptions_with_slash = df[df['Description'].str.contains(r'S/', na=False)]

# Get unique descriptions with "/"
unique_descriptions_with_slash = descriptions_with_slash['Description'].unique()

# Display the unique descriptions
print("Unique Descriptions with '/':")
print(unique_descriptions_with_slash)

Unique Descriptions with '/':
['WOOD S/3 CABINET ANT WHITE FINISH' 'S/6 SEW ON CROCHET FLOWERS'
 'S/4 VALENTINE DECOUPAGE HEART BOX' 'S/6 WOODEN SKITTLES IN COTTON BAG'
 'S/15 SILVER GLASS BAUBLES IN BAG' 'S/4 CACTI CANDLES'
 'YULETIDE IMAGES S/6 PAPER BOXES' 'S/12 MINI RABBIT EASTER'
 'S/12 VANILLA BOTANICAL T-LIGHTS' 'S/3 POT POURI CUSHIONS BLUE COLOURS'
 'S/4 BLACK MINI ROSE CANDLE IN BOWL' 'S/4 IVORY MINI ROSE CANDLE IN BOWL'
 'S/4 PINK FLOWER CANDLES IN BOWL' 'S/4 GROOVY CAT MAGNETS'
 'S/2 ZINC HEART DESIGN PLANTERS' 'S/3 PINK SQUARE PLANTERS ROSES'
 'S/4 ICON COASTER, ELVIS LIVES' 'S/2 BEACH HUT TREASURE CHESTS']


In [70]:
# Count occurrences of unique descriptions with "S/"
print("Count of Unique Descriptions with 'S/':")
print(len(unique_descriptions_with_slash))

Count of Unique Descriptions with 'S/':
18


In [71]:
# Replace " S/ " with " SET "
df['Description'] = df['Description'].str.replace(r'S/', 'SET OF ', regex=True)

In [72]:
# Filter rows where Description contains "/"
descriptions_with_slash = df[df['Description'].str.contains(r'S/', na=False)]

# Get unique descriptions with "/"
unique_descriptions_with_slash = descriptions_with_slash['Description'].unique()

# Count occurrences of unique descriptions with "S/"
print("Count of Unique Descriptions with 'S/':")
print(len(unique_descriptions_with_slash))

Count of Unique Descriptions with 'S/':
0


**Note:**  

*Changing "/" to space*

In [73]:
# Filter rows where Description contains "/"
descriptions_with_slash = df[df['Description'].str.contains(r'/', na=False)]

# Get unique descriptions with "/"
unique_descriptions_with_slash = descriptions_with_slash['Description'].unique()

# Display the unique descriptions
print("Unique Descriptions with '/':")
print(unique_descriptions_with_slash)

Unique Descriptions with '/':
['SET/2 RED RETROSPOT TEA TOWELS' 'SET/6 RED SPOTTY PAPER PLATES'
 'SET/20 RED RETROSPOT PAPER NAPKINS' 'SET/6 RED SPOTTY PAPER CUPS'
 'SET/10 PINK POLKADOT PARTY CANDLES'
 'SET/5 RED RETROSPOT LID GLASS BOWLS' 'SET/10 RED POLKADOT PARTY CANDLES'
 'DIAMANTE HAIR GRIP PACK/2 BLACK DIA' 'DIAMANTE HAIR GRIP PACK/2 RUBY'
 'DIAMANTE HAIR GRIP PACK/2 MONTANA' 'SET/10 BLUE POLKADOT PARTY CANDLES'
 'SET/9 CHRISTMAS T-LIGHTS SCENTED' 'ORANGE SCENTED SET/9 T-LIGHTS'
 'SET/10 IVORY POLKADOT PARTY CANDLES' 'SET/12 TAPER CANDLES'
 'SET/4 MODERN VINTAGE COTTON NAPKINS' 'SET/3 CHRISTMAS DECOUPAGE CANDLES'
 'SET/3 VANILLA SCENTED CANDLE IN BOX' 'SET/4 GARDEN ROSE DINNER CANDLE'
 'SET/4 BADGES CUTE CREATURES' 'SET/3 RED GINGHAM ROSE STORAGE BOX'
 'SET/6 BEAD COASTERS GAUZE BAG GOLD' 'SET/20 STRAWBERRY PAPER NAPKINS'
 'SET/3 FLORAL GARDEN TOOLS IN BAG' 'PACK/12 BLUE FOLKART CARDS'
 'FOLK ART GREETING CARD, PACK/12' 'SET/3 OCEAN SCENT CANDLE JEWEL BOX'
 'SET/4 SKULL BADGES' 

In [74]:
# Count occurrences of unique descriptions with "/"
print("Count of Unique Descriptions with '/':")
print(len(unique_descriptions_with_slash))

Count of Unique Descriptions with '/':
67


In [75]:
# Replace "/" with " "
df['Description'] = df['Description'].str.replace(r'/', ' OF ', regex=True)

In [76]:
# Filter rows where Description contains "/"
descriptions_with_slash = df[df['Description'].str.contains(r'/', na=False)]

# Get unique descriptions with "/"
unique_descriptions_with_slash = descriptions_with_slash['Description'].unique()

# Count occurrences of unique descriptions with "/"
print("Count of Unique Descriptions with '/':")
print(len(unique_descriptions_with_slash))

Count of Unique Descriptions with '/':
0


In [77]:
# Filter rows where Description contains punctuations
punctuation_descriptions = df[df['Description'].str.contains(r'[^\w\s]', na=False)]

# Get unique descriptions with punctuations
unique_punctuation_descriptions = punctuation_descriptions['Description'].unique()

# Display the unique descriptions
print("Unique Descriptions with Punctuations:")
print(unique_punctuation_descriptions)

Unique Descriptions with Punctuations:
['WHITE HANGING HEART T-LIGHT HOLDER' 'GLASS STAR FROSTED T-LIGHT HOLDER'
 "POPPY'S PLAYHOUSE BEDROOM" "POPPY'S PLAYHOUSE KITCHEN"
 "PAPER CHAIN KIT 50'S CHRISTMAS" 'RED HANGING HEART T-LIGHT HOLDER'
 'AIRLINE LOUNGE, METAL SIGN' "YOU'RE CONFUSING ME METAL SIGN"
 'COLOUR GLASS T-LIGHT HOLDER HANGING' 'HANGING HEART ZINC T-LIGHT HOLDER'
 'FANCY FONT BIRTHDAY CARD,' "I'M ON HOLIDAY METAL SIGN"
 'SET OF 6 T-LIGHTS SNOWMEN' 'SET OF 6 T-LIGHTS SANTA'
 'VICTORIAN GLASS HANGING T-LIGHT' 'SINGLE HEART ZINC T-LIGHT HOLDER'
 'SILVER HANGING T-LIGHT HOLDER' 'ROTATING SILVER ANGELS T-LIGHT HLDR'
 "CHILDREN'S SPACEBOY MUG" 'BLACK CANDELABRA T-LIGHT HOLDER'
 'SET 12 LAVENDER BOTANICAL T-LIGHTS' "PINK B'FLY C OR COVER WITH BOBBLES"
 'AGED GLASS SILVER T-LIGHT HOLDER' "CHILDREN'S APRON DOLLY GIRL"
 'SET OF 6 T-LIGHTS TOADSTOOLS' 'ROTATING LEAVES T-LIGHT HOLDER'
 'HYACINTH BULB T-LIGHT CANDLES' 'CHRISTMAS TREE T-LIGHT HOLDER'
 'STAR T-LIGHT HOLDER' 'HEART T-LIGHT 

#### *6. Checking UnitPrice*

In [78]:
# Filter rows where UnitPrice is 0
unitprice_zero_rows = df[df['UnitPrice'] == 0]

# Display the filtered rows
unitprice_zero_rows.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
9302,537197,22841,ROUND CAKE TIN VINTAGE GREEN,1,2010-12-05 14:02:00,0.0,12647.0,Germany,Completed
33576,539263,22580,ADVENT CALENDAR GINGHAM SACK,4,2010-12-16 14:36:00,0.0,16560.0,United Kingdom,Completed
40089,539722,22423,REGENCY CAKESTAND 3 TIER,10,2010-12-21 13:45:00,0.0,14911.0,EIRE,Completed
47068,540372,22090,PAPER BUNTING RETROSPOT,24,2011-01-06 16:41:00,0.0,13081.0,United Kingdom,Completed
47070,540372,22553,PLASTERS IN TIN SKULLS,24,2011-01-06 16:41:00,0.0,13081.0,United Kingdom,Completed


In [79]:
print(f"Number of rows before removing UnitPrice = 0: {df.shape[0]}")

# Remove rows where UnitPrice is 0
df = df[df['UnitPrice'] != 0]

# Verify the changes
print(f"Number of rows after removing UnitPrice = 0: {df.shape[0]}")

Number of rows before removing UnitPrice = 0: 399606
Number of rows after removing UnitPrice = 0: 399573


In [80]:
# Filter rows where UnitPrice is 0
unitprice_lessthan_zero_rows = df[df['UnitPrice'] < 0]

# Display the filtered rows
unitprice_lessthan_zero_rows.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status


#### *7. Checking CustomerID*

In [81]:
df_play['CustomerID'] = df_play['CustomerID'].astype(str)

# Count the number of numeric characters in each CustomerID
df_play['CustomerID_NumericLength'] = df_play['CustomerID'].apply(lambda x: sum(c.isdigit() for c in x))

# Count unique CustomerID based on their numeric length
numeric_length_counts = df_play.groupby('CustomerID_NumericLength')['CustomerID'].nunique()

print("Unique CustomerID counts based on numeric length:")
display(numeric_length_counts)

Unique CustomerID counts based on numeric length:


CustomerID_NumericLength
6    4372
Name: CustomerID, dtype: int64

In [82]:
# Count CustomerID with alphabetic characters
customerid_with_alphabets = df_play[df_play['CustomerID'].str.contains('[a-zA-Z]', na=False)]
alphabet_count = customerid_with_alphabets['CustomerID'].nunique()

print(f"Count of CustomerID with alphabetic characters: {alphabet_count}")

Count of CustomerID with alphabetic characters: 0


#### *8. Checking Country*

In [83]:
# Display unique countries and their value counts
country_counts = df['Country'].value_counts()

print("Unique countries and their value counts:")
display(country_counts)

Unique countries and their value counts:


Country
United Kingdom          356008
Germany                   9079
France                    8152
EIRE                      7368
Spain                     2462
Netherlands               2326
Belgium                   1971
Switzerland               1843
Portugal                  1427
Australia                 1253
Norway                    1059
Italy                      783
Channel Islands            752
Finland                    653
Cyprus                     608
Sweden                     436
Austria                    387
Denmark                    375
Japan                      355
Poland                     336
USA                        291
Israel                     247
Unspecified                241
Singapore                  215
Iceland                    182
Canada                     150
Greece                     142
Malta                      123
United Arab Emirates        67
European Community          58
RSA                         57
Lebanon                     45


### **Final Dataset**

In [84]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Status
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,Completed
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,Completed
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,Completed
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,Completed
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,Completed


In [85]:
# Save the dataset as a CSV file
df.to_csv('data/prepared_data.csv', index=False)