In [65]:
import pandas as pd

In [66]:
# Load the raw transaction data; the file is decoded as ISO-8859-1 (passed explicitly below).
main_df = pd.read_csv('Dataset/data.csv', encoding= "ISO-8859-1")
# Preview the first five rows.
main_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [67]:
# Dimensions of the dataframe as a (rows, columns) tuple
main_df.shape

(541909, 8)

In [68]:
# Count of missing values per column
main_df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [69]:
# "Description" and "CustomerID" are the only columns containing null values.
# 1. Description: rows without a description would make product analysis difficult, so they are dropped.
# 2. CustomerID: rows without a customer ID cannot be assigned to any customer segment, so they are dropped.

required_columns = ['Description', 'CustomerID']
main_df.dropna(subset=required_columns, how='any', inplace=True)

In [70]:
# Re-check the missing-value count per column; every column should now report 0.
main_df.isna().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [71]:
# Exact duplicate rows are removed so that repeated records do not push a customer into the wrong segment.
duplicates_before = main_df.duplicated().sum()
print("Number of duplicates: ", duplicates_before)

main_df.drop_duplicates(inplace=True)

# Sanity check: no duplicate rows should remain.
duplicates_after = main_df.duplicated().sum()
print("Verifying number of duplicates after removing them : ", duplicates_after)

Number of duplicates:  5225
Verifying number of duplicates after removing them :  0


In [72]:
# Inspect the data type pandas assigned to each column
main_df.dtypes

InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
dtype: object

In [73]:
# Convert "InvoiceDate" from strings to datetime64.
# The raw values look like "12/1/2010 8:26" (month/day/year hour:minute). The format is given
# explicitly so the month-first reading does not depend on pandas' format inference.
main_df['InvoiceDate'] = pd.to_datetime(main_df['InvoiceDate'], format='%m/%d/%Y %H:%M')

# Verify that the column is now a datetime type
main_df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [74]:
# Count the distinct countries, customers, invoices and stock codes.
# dropna=False makes nunique() count exactly what len(unique()) would.
print('Total number of Countries   : ', main_df['Country'].nunique(dropna=False))
print('Total number of CustomerID  : ', main_df['CustomerID'].nunique(dropna=False))
print('Total number of Purchases   : ', main_df['InvoiceNo'].nunique(dropna=False))
print('Total number of StockID     : ', main_df['StockCode'].nunique(dropna=False))

Total number of Countries   :  37
Total number of CustomerID  :  4372
Total number of Purchases   :  22190
Total number of StockID     :  3684


In [75]:
# Create the column CancledOrder: 1 if InvoiceNo starts with 'C', otherwise 0.
# (Cancellations could alternatively be detected through their negative Quantity.)
#
# The vectorised .str accessor replaces the row-by-row Python loop. Casting to str first
# keeps the check working even if some invoice numbers are not stored as strings.
cancled_order = (
    main_df['InvoiceNo']
    .astype(str)
    .str.startswith('C')
    .astype('int64')
    .tolist()
)

main_df['CancledOrder'] = cancled_order

In [76]:
# The first few cancelled orders: note that their Quantity is negative
main_df[main_df['CancledOrder'] == 1].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CancledOrder
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom,1
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom,1
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom,1
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom,1
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom,1


In [77]:
# Match every cancellation (Quantity not > 0, excluding 'Discount' lines) with an earlier
# purchase of the same StockCode by the same CustomerID, and record the cancelled amount
# on that purchase in df_cleaned['QuantityCanceled'].
df_cleaned = main_df.copy(deep = True)
df_cleaned['QuantityCanceled'] = 0

entry_to_remove = []    # cancellations for which a counterpart purchase was found
doubtfull_entry = []    # cancellations with no earlier purchase at all

# Pre-filter once instead of walking every row of main_df with iterrows():
# only rows that the loop would not skip need to be visited, and only rows with a
# positive Quantity can ever be counterparts.
skipped = (main_df['Quantity'] > 0) | (main_df['Description'] == 'Discount')
cancellations = main_df[~skipped]
purchases = main_df[main_df['Quantity'] > 0]

for index, col in cancellations.iterrows():
    df_test = purchases[(purchases['CustomerID'] == col['CustomerID']) &
                        (purchases['StockCode'] == col['StockCode']) &
                        (purchases['InvoiceDate'] < col['InvoiceDate'])]
    n_counterparts = df_test.shape[0]
    #_________________________________
    # Cancelation WITHOUT counterpart
    if n_counterparts == 0:
        doubtfull_entry.append(index)
    #________________________________
    # Cancelation WITH a counterpart
    elif n_counterparts == 1:
        index_order = df_test.index[0]
        df_cleaned.loc[index_order, 'QuantityCanceled'] = -col['Quantity']
        entry_to_remove.append(index)
    #______________________________________________________________
    # Several counterparts exist: walk them from the highest index label downwards and
    # use the first one whose Quantity covers the cancelled amount. If none does, the
    # cancellation ends up in neither list.
    else:
        for ind, val in df_test.sort_index(axis=0, ascending=False).iterrows():
            if val['Quantity'] < -col['Quantity']: continue
            df_cleaned.loc[ind, 'QuantityCanceled'] = -col['Quantity']
            entry_to_remove.append(index)
            break

# Credits to FABIENDANIEL. Source Kaggle

In [78]:
# Report how many cancellations were matched to a purchase (entry_to_remove)
# and how many had no earlier counterpart (doubtfull_entry).
print("entry_to_remove: {}".format(len(entry_to_remove)))
print("doubtfull_entry: {}".format(len(doubtfull_entry)))

# Credits to FABIENDANIEL. Source Kaggle

entry_to_remove: 7521
doubtfull_entry: 1226


In [79]:
# Remove the cancellation rows collected earlier: first the matched ones, then the unmatched ones.
df_cleaned.drop(index=entry_to_remove, inplace=True)
df_cleaned.drop(index=doubtfull_entry, inplace=True)

# Negative-quantity rows that are still present and are not discounts (StockCode 'D').
# They are only counted and displayed here; this cell does not delete them.
is_negative = df_cleaned['Quantity'] < 0
is_not_discount = df_cleaned['StockCode'] != 'D'
remaining_entries = df_cleaned[is_negative & is_not_discount]
print("nb of entries to delete: {}".format(remaining_entries.shape[0]))
remaining_entries[:5]

# Credits to FABIENDANIEL. Source Kaggle

nb of entries to delete: 48


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CancledOrder,QuantityCanceled
77598,C542742,84535B,FAIRY CAKES NOTEBOOK A6 SIZE,-94,2011-01-31 16:26:00,0.65,15358.0,United Kingdom,1,0
90444,C544038,22784,LANTERN CREAM GAZEBO,-4,2011-02-15 11:32:00,4.95,14659.0,United Kingdom,1,0
111968,C545852,22464,HANGING METAL HEART LANTERN,-5,2011-03-07 13:49:00,1.65,14048.0,United Kingdom,1,0
116064,C546191,47566B,TEA TIME PARTY BUNTING,-35,2011-03-10 10:57:00,0.7,16422.0,United Kingdom,1,0
132642,C547675,22263,FELT EGG COSY LADYBIRD,-49,2011-03-24 14:07:00,0.66,17754.0,United Kingdom,1,0


In [80]:
# Store CustomerID as a 64-bit integer
df_cleaned['CustomerID'] = df_cleaned['CustomerID'].astype('int64')
# Renumber the rows from 0, discarding the old index labels
df_cleaned.reset_index(inplace=True, drop=True)

In [81]:
# Save the cleaned data as an Excel file named Cleaned_data.xlsx (currently disabled)

# df_cleaned.to_excel('Dataset/Cleaned_data.xlsx')

In [84]:
# Preview the first five rows of the cleaned data
df_cleaned.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CancledOrder,QuantityCanceled
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,0,0
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0,0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,0,0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0,0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,0,0


# RFM analysis

In [86]:
# Work on an independent copy so the RFM steps leave df_cleaned unchanged
RFM_df = df_cleaned.copy(deep=True)

In [88]:
# Shift every invoice date forward so the data set looks recent.
# This has nothing to do with the analysis itself; it only makes the data feel more realistic.
from datetime import timedelta, date

# Reference point: one day after the latest invoice date in the data.
# Series.max() is vectorised, unlike the builtin max(), which iterates row by row in Python.
max_date = RFM_df['InvoiceDate'].max() + timedelta(days=1)

# Distance of each invoice from that reference point
difference_of_date = max_date - RFM_df['InvoiceDate']

# Re-anchor the dates on today's date, preserving the gaps between invoices.
# NOTE: the result depends on the day this cell is run, so displayed dates differ between runs.
RFM_df['InvoiceDate'] = pd.to_datetime(date.today()) - difference_of_date

RFM_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CancledOrder,QuantityCanceled
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2021-04-24 19:36:00,2.55,17850,United Kingdom,0,0
1,536365,71053,WHITE METAL LANTERN,6,2021-04-24 19:36:00,3.39,17850,United Kingdom,0,0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2021-04-24 19:36:00,2.75,17850,United Kingdom,0,0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2021-04-24 19:36:00,3.39,17850,United Kingdom,0,0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2021-04-24 19:36:00,3.39,17850,United Kingdom,0,0


In [92]:
# Preparation for recency / R-score: collect the distinct customer IDs

customers = RFM_df['CustomerID'].unique()

# Inspect all transactions of a single customer (ID 15358)
RFM_df[RFM_df['CustomerID'] == 15358]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CancledOrder,QuantityCanceled
12537,537888,84535B,FAIRY CAKES NOTEBOOK A6 SIZE,16,2021-05-02 21:14:00,0.65,15358,United Kingdom,0,0
12538,537888,21259,VICTORIAN SEWING BOX SMALL,2,2021-05-02 21:14:00,5.95,15358,United Kingdom,0,0
12539,537888,85049E,SCANDINAVIAN REDS RIBBONS,12,2021-05-02 21:14:00,1.25,15358,United Kingdom,0,0
12540,537888,21976,PACK OF 60 MUSHROOM CAKE CASES,48,2021-05-02 21:14:00,0.55,15358,United Kingdom,0,0
12541,537888,22398,MAGNETS PACK OF 4 SWALLOWS,24,2021-05-02 21:14:00,1.25,15358,United Kingdom,0,0
...,...,...,...,...,...,...,...,...,...,...
381873,580541,23237,SET OF 4 KNICK KNACK TINS LEAF,6,2022-04-28 20:13:00,4.15,15358,United Kingdom,0,0
381874,580541,21680,WOODLAND STICKERS,12,2022-04-28 20:13:00,0.85,15358,United Kingdom,0,0
381875,580542,23560,SET OF 6 RIBBONS COUNTRY STYLE,6,2022-04-28 20:14:00,2.89,15358,United Kingdom,0,0
381876,580542,23562,SET OF 6 RIBBONS PERFECTLY PRETTY,6,2022-04-28 20:14:00,2.89,15358,United Kingdom,0,0


In [27]:
# Rows carrying the most recent invoice date.
# Series.max() is vectorised and skips missing values, unlike the builtin max().
latest_invoice_date = RFM_df['InvoiceDate'].max()
RFM_df.loc[RFM_df['InvoiceDate'] == latest_invoice_date, :]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,CancledOrder,QuantityCanceled
541894,581587,22631,CIRCUS PARADE LUNCH BOX,12,2011-12-09 12:50:00,1.95,12680,France,0,0
541895,581587,22556,PLASTERS IN TIN CIRCUS PARADE,12,2011-12-09 12:50:00,1.65,12680,France,0,0
541896,581587,22555,PLASTERS IN TIN STRONGMAN,12,2011-12-09 12:50:00,1.65,12680,France,0,0
541897,581587,22728,ALARM CLOCK BAKELIKE PINK,4,2011-12-09 12:50:00,3.75,12680,France,0,0
541898,581587,22727,ALARM CLOCK BAKELIKE RED,4,2011-12-09 12:50:00,3.75,12680,France,0,0
541899,581587,22726,ALARM CLOCK BAKELIKE GREEN,4,2011-12-09 12:50:00,3.75,12680,France,0,0
541900,581587,22730,ALARM CLOCK BAKELIKE IVORY,4,2011-12-09 12:50:00,3.75,12680,France,0,0
541901,581587,22367,CHILDRENS APRON SPACEBOY DESIGN,8,2011-12-09 12:50:00,1.95,12680,France,0,0
541902,581587,22629,SPACEBOY LUNCH BOX,12,2011-12-09 12:50:00,1.95,12680,France,0,0
541903,581587,23256,CHILDRENS CUTLERY SPACEBOY,4,2011-12-09 12:50:00,4.15,12680,France,0,0
