In [1]:
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests


# 1- Data Exploration

In [2]:
df = pd.read_csv("Online-Retail.csv")
data = df.copy()

In [3]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [4]:
data.tail()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.1,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,12/9/2011 12:50,4.95,12680.0,France


In [5]:
data.shape

(541909, 8)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [7]:
# Check the missing values
data.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

# 2- Data Cleaning

In [8]:
# Drop rows with missing values in 'Description' and 'CustomerID' columns
data.dropna(subset=['Description', 'CustomerID'], inplace=True)

In [9]:
# Check again the new dataframe
data.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [10]:
# rename all the Columns
data_df= data.rename(columns={"InvoiceNo":"invoice_num", 
                   "StockCode":"stock_code", 
                  "Description":"description", 
                  "Quantity":"quantity", 
                  "InvoiceDate":"invoice_date", 
                  "UnitPrice":"unit_price", 
                  "CustomerID":"customer_id", 
                  "Country":"country"}, inplace=True)


In [11]:
# Convert InvoiceDate to datetime
data['invoice_date'] = pd.to_datetime(data['invoice_date'])

In [12]:
data.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [13]:
# the Datatypes from all the columns
data.dtypes

invoice_num             object
stock_code              object
description             object
quantity                 int64
invoice_date    datetime64[ns]
unit_price             float64
customer_id            float64
country                 object
dtype: object

In [14]:
# convert the column description to string
data["description"] = data["description"].astype(str)

In [15]:
# convert the description column into lower case
data['description'] = data.description.str.lower()

In [16]:
# Round values to 2 decimal places
data.describe().round(2)

Unnamed: 0,quantity,unit_price,customer_id
count,406829.0,406829.0,406829.0
mean,12.06,3.46,15287.69
std,248.69,69.32,1713.6
min,-80995.0,0.0,12346.0
25%,2.0,1.25,13953.0
50%,5.0,1.95,15152.0
75%,12.0,3.75,16791.0
max,80995.0,38970.0,18287.0


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   invoice_num   406829 non-null  object        
 1   stock_code    406829 non-null  object        
 2   description   406829 non-null  object        
 3   quantity      406829 non-null  int64         
 4   invoice_date  406829 non-null  datetime64[ns]
 5   unit_price    406829 non-null  float64       
 6   customer_id   406829 non-null  float64       
 7   country       406829 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [18]:
data.shape

(406829, 8)

In [19]:
data.duplicated().sum()

5225

In [20]:
data_df = data.drop_duplicates()

In [21]:
data_df.shape

(401604, 8)

In [22]:
data_df.columns

Index(['invoice_num', 'stock_code', 'description', 'quantity', 'invoice_date',
       'unit_price', 'customer_id', 'country'],
      dtype='object')

In [23]:
# Quantity smaller than 0
data_filter = data[data['quantity'] < 0]
data_filter

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
141,C536379,D,discount,-1,2010-12-01 09:41:00,27.50,14527.0,United Kingdom
154,C536383,35004C,set of 3 coloured flying ducks,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom
235,C536391,22556,plasters in tin circus parade,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
236,C536391,21984,pack of 12 pink paisley tissues,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
237,C536391,21983,pack of 12 blue paisley tissues,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
...,...,...,...,...,...,...,...,...
540449,C581490,23144,zinc t-light holder stars small,-11,2011-12-09 09:57:00,0.83,14397.0,United Kingdom
541541,C581499,M,manual,-1,2011-12-09 10:28:00,224.69,15498.0,United Kingdom
541715,C581568,21258,victorian sewing box large,-5,2011-12-09 11:57:00,10.95,15311.0,United Kingdom
541716,C581569,84978,hanging heart jar t-light holder,-1,2011-12-09 11:58:00,1.25,17315.0,United Kingdom


In [24]:
data_df['description'].to_frame()

Unnamed: 0,description
0,white hanging heart t-light holder
1,white metal lantern
2,cream cupid hearts coat hanger
3,knitted union flag hot water bottle
4,red woolly hottie white heart.
...,...
541904,pack of 20 spaceboy napkins
541905,children's apron dolly girl
541906,childrens cutlery dolly girl
541907,childrens cutlery circus parade


In [25]:
data_df['description'].nunique()

3896

In [26]:
discounts = data_df[data_df['stock_code'].apply(lambda order: order=='D')]
discounts.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
141,C536379,D,discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom
9038,C537164,D,discount,-1,2010-12-05 13:21:00,29.29,14527.0,United Kingdom
14498,C537597,D,discount,-1,2010-12-07 12:34:00,281.0,15498.0,United Kingdom
19392,C537857,D,discount,-1,2010-12-08 16:00:00,267.12,17340.0,United Kingdom
31134,C538897,D,discount,-1,2010-12-15 09:14:00,5.76,16422.0,United Kingdom


In [27]:
discounts.shape

(77, 8)

In [28]:
post = data[data['stock_code'].apply(lambda order: order=='POST')]
post.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
45,536370,POST,postage,3,2010-12-01 08:45:00,18.0,12583.0,France
386,536403,POST,postage,1,2010-12-01 11:27:00,15.0,12791.0,Netherlands
1123,536527,POST,postage,1,2010-12-01 13:04:00,18.0,12662.0,Germany
5073,536840,POST,postage,1,2010-12-02 18:27:00,18.0,12738.0,Germany
5258,536852,POST,postage,1,2010-12-03 09:51:00,18.0,12686.0,France


In [29]:
post.shape

(1196, 8)

In [30]:
manuel = data[data['stock_code'].apply(lambda order: order=='M')]
manuel.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
2239,536569,M,manual,1,2010-12-01 15:35:00,1.25,16274.0,United Kingdom
2250,536569,M,manual,1,2010-12-01 15:35:00,18.95,16274.0,United Kingdom
6798,536981,M,manual,2,2010-12-03 14:26:00,0.85,14723.0,United Kingdom
7976,537077,M,manual,12,2010-12-05 11:59:00,0.42,17062.0,United Kingdom
8530,537137,M,manual,36,2010-12-05 12:43:00,0.85,16327.0,United Kingdom


In [31]:
manuel.shape

(465, 8)

In [32]:
pads = data[data['stock_code'].apply(lambda order: order=='PADS')]
pads.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
157195,550193,PADS,pads to match all cushions,1,2011-04-15 09:27:00,0.001,13952.0,United Kingdom
279045,561226,PADS,pads to match all cushions,1,2011-07-26 10:13:00,0.001,15618.0,United Kingdom
358655,568158,PADS,pads to match all cushions,1,2011-09-25 12:22:00,0.0,16133.0,United Kingdom
359871,568200,PADS,pads to match all cushions,1,2011-09-25 14:58:00,0.001,16198.0,United Kingdom


In [33]:
pads.shape

(4, 8)

In [34]:
carriage = data[data['stock_code'].apply(lambda order: order=='C2')]
carriage.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
1423,536540,C2,carriage,1,2010-12-01 14:05:00,50.0,14911.0,EIRE
12119,537368,C2,carriage,1,2010-12-06 12:40:00,50.0,14911.0,EIRE
12452,537378,C2,carriage,1,2010-12-06 13:06:00,50.0,14911.0,EIRE
19975,537963,C2,carriage,1,2010-12-09 11:30:00,50.0,13369.0,United Kingdom
20016,538002,C2,carriage,1,2010-12-09 11:48:00,50.0,14932.0,Channel Islands


In [35]:
carriage.shape

(134, 8)

In [36]:
bank = data[data['stock_code'].apply(lambda order: order=='BANK CHARGES')]
bank.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
4406,536779,BANK CHARGES,bank charges,1,2010-12-02 15:08:00,15.0,15823.0,United Kingdom
62508,541505,BANK CHARGES,bank charges,1,2011-01-18 15:58:00,15.0,15939.0,United Kingdom
152966,549717,BANK CHARGES,bank charges,1,2011-04-11 14:56:00,15.0,14606.0,United Kingdom
175275,551945,BANK CHARGES,bank charges,1,2011-05-05 11:09:00,15.0,16714.0,United Kingdom
327921,565735,BANK CHARGES,bank charges,1,2011-09-06 12:25:00,15.0,16904.0,United Kingdom


In [37]:
bank.shape

(12, 8)

In [38]:
data_df.head()

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [39]:
lantern = data_df[data_df['description'].apply(lambda order: order=='white metal lantern')]
lantern

Unnamed: 0,invoice_num,stock_code,description,quantity,invoice_date,unit_price,customer_id,country
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
50,536373,71053,white metal lantern,6,2010-12-01 09:02:00,3.39,17850.0,United Kingdom
67,536375,71053,white metal lantern,6,2010-12-01 09:32:00,3.39,17850.0,United Kingdom
279,536396,71053,white metal lantern,6,2010-12-01 10:51:00,3.39,17850.0,United Kingdom
417,536406,71053,white metal lantern,8,2010-12-01 11:33:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
467347,576339,71053,white metal lantern,1,2011-11-14 15:27:00,8.29,14096.0,United Kingdom
468547,576513,71053,white metal lantern,8,2011-11-15 12:10:00,3.75,14715.0,United Kingdom
471383,576642,71053,white metal lantern,4,2011-11-16 09:59:00,3.75,12963.0,United Kingdom
471907,576654,71053,white metal lantern,1,2011-11-16 10:55:00,3.75,15584.0,United Kingdom


In [40]:
"X-RapidAPI-Key": "8a6dcc8cffmshe0cb2df6a87e7afp1c68bdjsnaf5057047f7c",
	"X-RapidAPI-Host": "amazon-data.p.rapidapi.com"

SyntaxError: invalid syntax (Temp/ipykernel_15960/4285139051.py, line 1)

In [80]:
import requests

url = "https://amazon-data.p.rapidapi.com/search.php"

querystring = {"keyword":"set 7 babushka nesting boxes","region":"us","page":"1"}

headers = {
    "X-RapidAPI-Key": "8a6dcc8cffmshe0cb2df6a87e7afp1c68bdjsnaf5057047f7c",
    "X-RapidAPI-Host": "amazon-data.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)

print(response.json())

[]


In [42]:
res = response.json()

In [52]:

res[0]['asin_name']

'Kate Aspen Rustic Farmhouse Decor White Candle Holder Lantern Decorative (7.5 Inch), Wedding Decoration, Centerpiece Table Decorations, Home Decor, Shelf Decor'

In [53]:
res[0]['asin_price']

17.89

In [54]:
# test
import requests
import time
import pandas as pd

url = "https://amazon-data.p.rapidapi.com/search.php"

headers = {
    "X-RapidAPI-Key": "8a6dcc8cffmshe0cb2df6a87e7afp1c68bdjsnaf5057047f7c",
    "X-RapidAPI-Host": "amazon-data.p.rapidapi.com"
}
amazon_price, amazon_product_description = [], []

for index, item in data_df.iterrows():
    
    if index < 3:
        querystring = {"keyword":item['description'],"region":"us","page":"1"}
        response = requests.get(url, headers=headers, params=querystring).json()

        print("Requete No: ", index)
        print("Column description from dataframe: ", item['description'])
        print("Product description from response: ",response[0]['asin_name'])
        print("Price from response: ", response[0]['asin_price'])
        print("======================================")
        print(" ")
        
        amazon_product_description.append(response[0]['asin_name'])
        amazon_price.append(response[0]['asin_price'])
        time.sleep(1)
        
#data_df["Amazon Price"] = list_price
df_test = pd.DataFrame()
df_test = pd.DataFrame({"Amazon Price": amazon_price, "Product Description": amazon_product_description})
print("finish!")

Requete No:  0
Column description from dataframe:  white hanging heart t-light holder
Product description from response:  Mudder Valentine's Day Wooden Heart String Lights LED Fairy Lights Hanging Wood Love Lights Lamp Battery Operated Valentine's Day Decorations for Bedroom Festival Birthday Wedding (White,20 Lights)
Price from response:  10.99
 
Requete No:  1
Column description from dataframe:  white metal lantern
Product description from response:  Kate Aspen Rustic Farmhouse Decor White Candle Holder Lantern Decorative (7.5 Inch), Wedding Decoration, Centerpiece Table Decorations, Home Decor, Shelf Decor
Price from response:  17.89
 
Requete No:  2
Column description from dataframe:  cream cupid hearts coat hanger
Product description from response:  50 Pieces Cute Heart Hangers with 360 Degree Swivel Hook Heavy Duty Clothes Hanger Adult Coat Hangers for Jackets, Pants, Shirts, Suit, Dress Room Closet Decor(Pink,Plastic)
Price from response:  40.94
 
finish!


In [55]:
df_test

Unnamed: 0,Amazon Price,Product Description
0,10.99,Mudder Valentine's Day Wooden Heart String Lig...
1,17.89,Kate Aspen Rustic Farmhouse Decor White Candle...
2,40.94,50 Pieces Cute Heart Hangers with 360 Degree S...


In [56]:
data_df.shape

(401604, 8)

In [62]:
data_df['description'].nunique()

3896

In [None]:
# 10.000 requests / month

In [64]:
unique_data = 

In [69]:
df_amazon_data = pd.DataFrame()
df_amazon_data['description'] = data_df['description'].unique()
df_amazon_data

Unnamed: 0,description
0,white hanging heart t-light holder
1,white metal lantern
2,cream cupid hearts coat hanger
3,knitted union flag hot water bottle
4,red woolly hottie white heart.
...,...
3891,"letter ""w"" bling key ring"
3892,"letter ""z"" bling key ring"
3893,pink crystal skull phone charm
3894,cream hanging heart t-light holder


In [79]:
df_amazon_data.description[5]

'set 7 babushka nesting boxes'

In [None]:
import requests
import time
import pandas as pd

url = "https://amazon-data.p.rapidapi.com/search.php"

headers = {
    "X-RapidAPI-Key": "8a6dcc8cffmshe0cb2df6a87e7afp1c68bdjsnaf5057047f7c",
    "X-RapidAPI-Host": "amazon-data.p.rapidapi.com"
}
amazon_price, amazon_product_description = [], []

for item in df_amazon_data['description']:
    querystring = {"keyword":item,"region":"us","page":"1"}
    result = requests.get(url, headers=headers, params=querystring)
    print(result)
    try:
        response = result.json()
        amazon_product_description.append(response[0]['asin_name'])
        amazon_price.append(response[0]['asin_price'])
        time.sleep(0.15)
    except:
        print("Error:")
        response = None
        amazon_product_description.append(response)
        amazon_price.append(response)
        time.sleep(1)

df_amazon_data.loc[:, "amazon product name"] = amazon_product_description
df_amazon_data.loc[:, "amazon price"] = amazon_price

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
Error:
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
Error:
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
Error:
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response 

In [82]:
len(amazon_product_description)

611

In [87]:
data = {
    "amazon_product_description": amazon_product_description,
    "amazon_price": amazon_price
}

df = pd.DataFrame(data)


In [90]:
df.to_excel("df.xlsx")

In [319]:
# df_prod.to_excel("df_prod.xlsx")

In [320]:
# df_prod.to_parquet("df_prod.parquet")