In [99]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns


# 1- Data Exploration

In [100]:
# Read the raw transactions from the CSV file, then work on an independent copy
# (`data`) so the frame as loaded (`df`) is still available unmodified.
df = pd.read_csv("Online-Retail.csv")
data = df.copy()

In [101]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [102]:
data.tail()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.1,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,12/9/2011 12:50,4.95,12680.0,France


In [103]:
data.shape

(541909, 8)

In [104]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [105]:
# Check the missing values
data.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

# 2- Data Cleaning

In [113]:
# Drop rows with missing values in 'Description' and 'CustomerID' columns.
# inplace=True mutates `data` itself; `df` is unaffected because `data` was
# created with df.copy().
data.dropna(subset=['Description', 'CustomerID'], inplace=True)

In [114]:
# Check again the new dataframe
data.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [120]:
# Rename all the columns to snake_case.
# rename() returns a new frame, so rebind `data` to the result. Assigning the
# return value of an inplace=True call (as before) only stores None.
data = data.rename(columns={
    "InvoiceNo": "invoice_num",
    "StockCode": "stock_code",
    "Description": "description",
    "Quantity": "quantity",
    "InvoiceDate": "invoice_date",
    "UnitPrice": "unit_price",
    "CustomerID": "customer_id",
    "Country": "country",
})


In [116]:
# Convert the invoice date column to datetime.
# The columns were already renamed by the preceding cell, so the column is
# 'invoice_date' here; the old 'InvoiceDate' key raises KeyError on a fresh
# top-to-bottom run.
data['invoice_date'] = pd.to_datetime(data['invoice_date'])

In [122]:
data.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [123]:
# the Datatypes from all the columns
data.dtypes

invoice_num             object
stock_code              object
description             object
quantity                 int64
invoice_date    datetime64[ns]
unit_price             float64
customer_id            float64
country                 object
dtype: object

In [124]:
# Convert the description column to Python strings so the .str accessor used
# in the following cell can be applied to it.
data["description"] = data["description"].astype(str)

In [126]:
# Normalise the product descriptions to lower case
data['description'] = data['description'].str.lower()

In [127]:
# Round values to 2 decimal places
data.describe().round(2)

Unnamed: 0,quantity,unit_price,customer_id
count,406829.0,406829.0,406829.0
mean,12.06,3.46,15287.69
std,248.69,69.32,1713.6
min,-80995.0,0.0,12346.0
25%,2.0,1.25,13953.0
50%,5.0,1.95,15152.0
75%,12.0,3.75,16791.0
max,80995.0,38970.0,18287.0


In [128]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   invoice_num   406829 non-null  object        
 1   stock_code    406829 non-null  object        
 2   description   406829 non-null  object        
 3   quantity      406829 non-null  int64         
 4   invoice_date  406829 non-null  datetime64[ns]
 5   unit_price    406829 non-null  float64       
 6   customer_id   406829 non-null  float64       
 7   country       406829 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [130]:
data.shape

(406829, 8)

In [131]:
# Count the rows that are duplicates of an earlier row
data.duplicated().sum()

5225

In [132]:
# Remove the duplicate rows; the result is stored in `data_df`, while `data`
# itself still contains them.
data_df = data.drop_duplicates()

In [133]:
data_df.shape

(401604, 8)

In [137]:
data_df.columns

Index(['invoice_num', 'stock_code', 'description', 'quantity', 'invoice_date',
       'unit_price', 'customer_id', 'country'],
      dtype='object')

In [138]:
# Keep only the rows whose quantity is negative
data_filter = data.loc[data['quantity'].lt(0)]
data_filter

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
141,C536379,D,discount,-1,2010-12-01 09:41:00,27.50,14527.0,United Kingdom
154,C536383,35004C,set of 3 coloured flying ducks,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom
235,C536391,22556,plasters in tin circus parade,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
236,C536391,21984,pack of 12 pink paisley tissues,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
237,C536391,21983,pack of 12 blue paisley tissues,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
...,...,...,...,...,...,...,...,...
540449,C581490,23144,zinc t-light holder stars small,-11,2011-12-09 09:57:00,0.83,14397.0,United Kingdom
541541,C581499,M,manual,-1,2011-12-09 10:28:00,224.69,15498.0,United Kingdom
541715,C581568,21258,victorian sewing box large,-5,2011-12-09 11:57:00,10.95,15311.0,United Kingdom
541716,C581569,84978,hanging heart jar t-light holder,-1,2011-12-09 11:58:00,1.25,17315.0,United Kingdom


In [140]:
data_df['description'].to_frame()

Unnamed: 0,description
0,white hanging heart t-light holder
1,white metal lantern
2,cream cupid hearts coat hanger
3,knitted union flag hot water bottle
4,red woolly hottie white heart.
...,...
541904,pack of 20 spaceboy napkins
541905,children's apron dolly girl
541906,childrens cutlery dolly girl
541907,childrens cutlery circus parade


In [143]:
data_df['description'].nunique()

3896

In [145]:
# Rows whose stock code is 'D'. A vectorized comparison builds the boolean
# mask without a Python-level lambda call per row.
discounts = data_df[data_df['stock_code'] == 'D']
discounts.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
141,C536379,D,discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom
9038,C537164,D,discount,-1,2010-12-05 13:21:00,29.29,14527.0,United Kingdom
14498,C537597,D,discount,-1,2010-12-07 12:34:00,281.0,15498.0,United Kingdom
19392,C537857,D,discount,-1,2010-12-08 16:00:00,267.12,17340.0,United Kingdom
31134,C538897,D,discount,-1,2010-12-15 09:14:00,5.76,16422.0,United Kingdom


In [146]:
discounts.shape

(77, 8)

In [147]:
# Rows whose stock code is 'POST' (vectorized comparison instead of a
# per-row lambda).
# NOTE(review): this filters `data`, which still contains duplicate rows,
# whereas `discounts` is taken from the de-duplicated `data_df` - confirm
# which frame is intended.
post = data[data['stock_code'] == 'POST']
post.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
45,536370,POST,postage,3,2010-12-01 08:45:00,18.0,12583.0,France
386,536403,POST,postage,1,2010-12-01 11:27:00,15.0,12791.0,Netherlands
1123,536527,POST,postage,1,2010-12-01 13:04:00,18.0,12662.0,Germany
5073,536840,POST,postage,1,2010-12-02 18:27:00,18.0,12738.0,Germany
5258,536852,POST,postage,1,2010-12-03 09:51:00,18.0,12686.0,France


In [148]:
post.shape

(1196, 8)

In [149]:
# Rows whose stock code is 'M' (vectorized comparison instead of a per-row
# lambda).
# NOTE(review): this filters `data`, which still contains duplicate rows,
# whereas `discounts` is taken from the de-duplicated `data_df` - confirm
# which frame is intended.
manuel = data[data['stock_code'] == 'M']
manuel.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
2239,536569,M,manual,1,2010-12-01 15:35:00,1.25,16274.0,United Kingdom
2250,536569,M,manual,1,2010-12-01 15:35:00,18.95,16274.0,United Kingdom
6798,536981,M,manual,2,2010-12-03 14:26:00,0.85,14723.0,United Kingdom
7976,537077,M,manual,12,2010-12-05 11:59:00,0.42,17062.0,United Kingdom
8530,537137,M,manual,36,2010-12-05 12:43:00,0.85,16327.0,United Kingdom


In [151]:
manuel.shape

(465, 8)

In [150]:
# Rows whose stock code is 'PADS' (vectorized comparison instead of a
# per-row lambda).
# NOTE(review): this filters `data`, which still contains duplicate rows,
# whereas `discounts` is taken from the de-duplicated `data_df` - confirm
# which frame is intended.
pads = data[data['stock_code'] == 'PADS']
pads.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
157195,550193,PADS,pads to match all cushions,1,2011-04-15 09:27:00,0.001,13952.0,United Kingdom
279045,561226,PADS,pads to match all cushions,1,2011-07-26 10:13:00,0.001,15618.0,United Kingdom
358655,568158,PADS,pads to match all cushions,1,2011-09-25 12:22:00,0.0,16133.0,United Kingdom
359871,568200,PADS,pads to match all cushions,1,2011-09-25 14:58:00,0.001,16198.0,United Kingdom


In [152]:
pads.shape

(4, 8)

In [153]:
# Rows whose stock code is 'C2' (vectorized comparison instead of a per-row
# lambda).
# NOTE(review): this filters `data`, which still contains duplicate rows,
# whereas `discounts` is taken from the de-duplicated `data_df` - confirm
# which frame is intended.
carriage = data[data['stock_code'] == 'C2']
carriage.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
1423,536540,C2,carriage,1,2010-12-01 14:05:00,50.0,14911.0,EIRE
12119,537368,C2,carriage,1,2010-12-06 12:40:00,50.0,14911.0,EIRE
12452,537378,C2,carriage,1,2010-12-06 13:06:00,50.0,14911.0,EIRE
19975,537963,C2,carriage,1,2010-12-09 11:30:00,50.0,13369.0,United Kingdom
20016,538002,C2,carriage,1,2010-12-09 11:48:00,50.0,14932.0,Channel Islands


In [154]:
carriage.shape

(134, 8)

In [155]:
# Rows whose stock code is 'BANK CHARGES' (vectorized comparison instead of
# a per-row lambda).
# NOTE(review): this filters `data`, which still contains duplicate rows,
# whereas `discounts` is taken from the de-duplicated `data_df` - confirm
# which frame is intended.
bank = data[data['stock_code'] == 'BANK CHARGES']
bank.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
4406,536779,BANK CHARGES,bank charges,1,2010-12-02 15:08:00,15.0,15823.0,United Kingdom
62508,541505,BANK CHARGES,bank charges,1,2011-01-18 15:58:00,15.0,15939.0,United Kingdom
152966,549717,BANK CHARGES,bank charges,1,2011-04-11 14:56:00,15.0,14606.0,United Kingdom
175275,551945,BANK CHARGES,bank charges,1,2011-05-05 11:09:00,15.0,16714.0,United Kingdom
327921,565735,BANK CHARGES,bank charges,1,2011-09-06 12:25:00,15.0,16904.0,United Kingdom


In [156]:
bank.shape

(12, 8)

In [158]:
data_df.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [160]:
# All rows for the product described as 'white metal lantern'. Descriptions
# were lower-cased earlier, so the literal is lower case too. A vectorized
# comparison replaces the per-row lambda.
lantern = data_df[data_df['description'] == 'white metal lantern']
lantern

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
50,536373,71053,white metal lantern,6,2010-12-01 09:02:00,3.39,17850.0,United Kingdom
67,536375,71053,white metal lantern,6,2010-12-01 09:32:00,3.39,17850.0,United Kingdom
279,536396,71053,white metal lantern,6,2010-12-01 10:51:00,3.39,17850.0,United Kingdom
417,536406,71053,white metal lantern,8,2010-12-01 11:33:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
467347,576339,71053,white metal lantern,1,2011-11-14 15:27:00,8.29,14096.0,United Kingdom
468547,576513,71053,white metal lantern,8,2011-11-15 12:10:00,3.75,14715.0,United Kingdom
471383,576642,71053,white metal lantern,4,2011-11-16 09:59:00,3.75,12963.0,United Kingdom
471907,576654,71053,white metal lantern,1,2011-11-16 10:55:00,3.75,15584.0,United Kingdom


In [161]:
# Look up every unique product description through the Axesso Amazon search
# API (hosted on RapidAPI) and keep the title and price of the first result.

# Define the base URL for the Axesso Amazon Data API
base_url = "https://axesso-axesso-amazon-data-service-v1.p.rapidapi.com/amz/amazon-search-by-keyword-asin"

# The RapidAPI key is a credential: read it from the environment instead of
# committing it to the notebook.
rapidapi_key = os.environ.get("RAPIDAPI_KEY")
if not rapidapi_key:
    raise RuntimeError("Set the RAPIDAPI_KEY environment variable before running this cell")

# The headers are identical for every request, so build them once.
headers = {
    "X-RapidAPI-Host": "axesso-axesso-amazon-data-service-v1.p.rapidapi.com",
    "X-RapidAPI-Key": rapidapi_key,
}

# Statuses after which every further request would be rejected as well
# (401/403: key not accepted, 429: too many requests).
fatal_statuses = {401, 403, 429}

# Initialize a list to store the extracted data
extracted_data = []

for description in data_df["description"].unique():
    # Query parameters
    querystring = {
        "domainCode": "com",
        "keyword": description,
        "page": "1",
        "excludeSponsored": "false",
        "sortBy": "relevanceblender",
        "withCache": "true",
    }

    # Send a GET request to the API; the timeout keeps a stalled connection
    # from hanging the notebook.
    try:
        response = requests.get(base_url, headers=headers, params=querystring, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for keyword: {description} ({exc})")
        continue

    # Anything other than 200 is a failure; report the status so the cause
    # is visible in the output.
    if response.status_code != 200:
        print(f"Failed to fetch data for keyword: {description} (HTTP {response.status_code})")
        if response.status_code in fatal_statuses:
            print("Stopping: the API rejected the key or the request quota is exhausted.")
            break
        continue

    # Extract and process the JSON response
    try:
        json_data = response.json()
    except ValueError:
        print(f"Failed to fetch data for keyword: {description} (invalid JSON)")
        continue

    # Extract the first product's name and price if available
    items = json_data.get("items")
    if items:
        first_item = items[0]
        extracted_data.append({
            "Description": description,
            "Product_name": first_item.get("productTitle", "N/A"),
            "Price": first_item.get("price", "N/A"),
        })

# Print the extracted data
for item in extracted_data:
    print("Description:", item["Description"])
    print("Product_name:", item["Product_name"])
    print("Price:", item["Price"])
    print("\n")


Failed to fetch data for keyword: white hanging heart t-light holder
Failed to fetch data for keyword: white metal lantern
Failed to fetch data for keyword: cream cupid hearts coat hanger
Failed to fetch data for keyword: knitted union flag hot water bottle
Failed to fetch data for keyword: red woolly hottie white heart.
Failed to fetch data for keyword: set 7 babushka nesting boxes
Failed to fetch data for keyword: glass star frosted t-light holder
Failed to fetch data for keyword: hand warmer union jack
Failed to fetch data for keyword: hand warmer red polka dot
Failed to fetch data for keyword: assorted colour bird ornament
Failed to fetch data for keyword: poppy's playhouse bedroom 
Failed to fetch data for keyword: poppy's playhouse kitchen
Failed to fetch data for keyword: feltcraft princess charlotte doll
Failed to fetch data for keyword: ivory knitted mug cosy 
Failed to fetch data for keyword: box of 6 assorted colour teaspoons
Failed to fetch data for keyword: box of vintage 

#### Failed because the Axesso API has a limit of 48 items per call

In [162]:
# Second attempt: query the Axesso Amazon search API with batches of
# descriptions instead of one request per description.

# Define the base URL for the Axesso Amazon Data API
base_url = "https://axesso-axesso-amazon-data-service-v1.p.rapidapi.com/amz/amazon-search-by-keyword-asin"

# The RapidAPI key is a credential: read it from the environment instead of
# committing it to the notebook.
rapidapi_key = os.environ.get("RAPIDAPI_KEY")
if not rapidapi_key:
    raise RuntimeError("Set the RAPIDAPI_KEY environment variable before running this cell")

# The headers are identical for every request, so build them once.
headers = {
    "X-RapidAPI-Host": "axesso-axesso-amazon-data-service-v1.p.rapidapi.com",
    "X-RapidAPI-Key": rapidapi_key,
}

# Statuses after which every further request would be rejected as well
# (401/403: key not accepted, 429: too many requests).
fatal_statuses = {401, 403, 429}

# Initialize a list to store the extracted data
extracted_data = []

# Unique product descriptions from the de-duplicated frame
descriptions = data_df["description"].unique()

# Define the maximum number of items per API call
max_items_per_call = 47

# Paginate through the descriptions
for i in range(0, len(descriptions), max_items_per_call):
    # Extract a batch of descriptions
    batch_descriptions = descriptions[i:i+max_items_per_call]
    batch_label = f"{i+1}-{i+len(batch_descriptions)}"

    # Query parameters
    # NOTE(review): the whole batch is sent as one comma-joined "keyword"
    # value - confirm against the API documentation that the endpoint
    # treats this as several keywords and not as a single search phrase.
    querystring = {
        "domainCode": "com",
        "keyword": ",".join(batch_descriptions),
        "page": "1",
        "excludeSponsored": "false",
        "sortBy": "relevanceblender",
        "withCache": "true",
    }

    # Send a GET request to the API; the timeout keeps a stalled connection
    # from hanging the notebook.
    try:
        response = requests.get(base_url, headers=headers, params=querystring, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for batch {batch_label} ({exc})")
        continue

    # Anything other than 200 is a failure; report the status so the cause
    # is visible in the output.
    if response.status_code != 200:
        print(f"Failed to fetch data for batch {batch_label} (HTTP {response.status_code})")
        if response.status_code in fatal_statuses:
            print("Stopping: the API rejected the key or the request quota is exhausted.")
            break
        continue

    # Extract and process the JSON response
    try:
        json_data = response.json()
    except ValueError:
        print(f"Failed to fetch data for batch {batch_label} (invalid JSON)")
        continue

    # Extract product data for each returned item. .get() is used for every
    # field so an item without a "keyword" entry no longer raises KeyError.
    for item in json_data.get("items") or []:
        extracted_data.append({
            "Description": item.get("keyword", "N/A"),
            "Product_name": item.get("productTitle", "N/A"),
            "Price": item.get("price", "N/A"),
        })

# Print the extracted data
for item in extracted_data:
    print("Description:", item["Description"])
    print("Product_name:", item["Product_name"])
    print("Price:", item["Price"])
    print("\n")


Failed to fetch data for batch 1-47
Failed to fetch data for batch 48-94
Failed to fetch data for batch 95-141
Failed to fetch data for batch 142-188
Failed to fetch data for batch 189-235
Failed to fetch data for batch 236-282
Failed to fetch data for batch 283-329
Failed to fetch data for batch 330-376
Failed to fetch data for batch 377-423
Failed to fetch data for batch 424-470
Failed to fetch data for batch 471-517
Failed to fetch data for batch 518-564
Failed to fetch data for batch 565-611
Failed to fetch data for batch 612-658
Failed to fetch data for batch 659-705
Failed to fetch data for batch 706-752
Failed to fetch data for batch 753-799
Failed to fetch data for batch 800-846
Failed to fetch data for batch 847-893
Failed to fetch data for batch 894-940
Failed to fetch data for batch 941-987
Failed to fetch data for batch 988-1034
Failed to fetch data for batch 1035-1081
Failed to fetch data for batch 1082-1128
Failed to fetch data for batch 1129-1175
Failed to fetch data for