In [9]:
# Import libs
import pandas as pd

In [21]:
# Path to the CSV file with the transaction data used in the tasks below.
path_to_file = 'data.csv'

### Task 1.
The following transaction data are available for the period from 01.12.2010 to 09.12.2011 (dd.mm.yyyy):

- `InvoiceNo` — transaction number
- `StockCode` — product code
- `Description` — product description
- `Quantity` — the number of units of the product added to the order
- `InvoiceDate` — date of the transaction
- `UnitPrice` — price per unit of goods
- `CustomerID` — customer id
- `Country` — the country where the client resides

In this task, the path to the data file is stored in the `path_to_file` variable.
Import pandas and read the data using the ISO-8859-1 encoding. Store the resulting dataframe in `retail`, and save its column names to the `retail_columns` variable.

In [24]:
# Read the CSV at path_to_file with the ISO-8859-1 encoding required by Task 1.
retail = pd.read_csv(path_to_file, encoding='ISO-8859-1')

In [25]:
retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [26]:
# Keep the column labels (a pandas Index) under retail_columns, then display them.
retail_columns = retail.columns
retail_columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [30]:
retail.dtypes

InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
dtype: object

In [31]:
retail.shape

(541909, 8)

### Task 2.
Check if there are repeated observations in the data, and specify their number as an answer. If there are any, then delete them from retail.

In [38]:
# duplicated() flags every row that repeats an earlier row across all columns;
# summing the boolean flags gives the number of repeated observations.
retail.duplicated().sum()

5268

In [40]:
# Drop exact duplicate rows. The result is rebound to retail instead of
# mutating the existing frame with inplace=True.
retail = retail.drop_duplicates()

In [42]:
retail.shape

(536641, 8)

In [41]:
retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


### Task 3.
The data contains records of both successful transactions and canceled ones. If the user canceled the order, C (cancelled) is placed at the beginning of the transaction number (InvoiceNo).

How many transactions have users canceled in total? Consider each individual type of product as a separate transaction - you don't need to count unique numbers.

In [47]:
# Cancelled rows have an InvoiceNo that begins with "C"; summing the boolean
# mask counts them row by row (no de-duplication of invoice numbers).
retail['InvoiceNo'].str.startswith('C').sum()

9251

### Task 4.
Now filter the data and leave in retail only those orders where Quantity > 0. As an answer, specify the number of remaining lines.

In [48]:
# Keep only rows with a positive Quantity. The filtered frame is rebound to
# retail instead of mutating the existing object with inplace=True.
retail = retail.query('Quantity > 0')

In [50]:
retail.shape[0]

526054

### Task 5.
Count the number of orders for each user (CustomerID) from Germany. Keep only those users who have made more than N transactions (InvoiceNo), where N is the 80th percentile. Store the resulting user IDs in germany_top (only the IDs, not the entire dataframe).

The filtered data is recorded in retail. The order ID is InvoiceNo. For each order, there may be more than 1 row in the data.

In [53]:
# Number of distinct orders (InvoiceNo) per customer (CustomerID) in Germany.
# The chain is wrapped in parentheses instead of backslash continuations, so
# no dangling line continuation is left after the last call.
germany_all = (
    retail[retail.Country == 'Germany']
    .groupby('CustomerID', as_index=False)
    .agg({'InvoiceNo': 'nunique'})
)

# N from the task: the 80th percentile of the per-customer order counts.
quantile = germany_all.InvoiceNo.quantile(0.8)

# IDs of the customers with strictly more than N orders (only the CustomerID
# column, not the whole dataframe).
germany_top = germany_all[germany_all.InvoiceNo > quantile].CustomerID

### Task 6.
Using the user IDs (germany_top) obtained in the previous step, filter the observations and keep only the records for the users we are interested in. Write the resulting dataframe to top_retail_germany.

In [54]:
# Keep only the rows of retail whose CustomerID is one of the germany_top IDs.
top_retail_germany = retail[retail['CustomerID'].isin(germany_top)]

In [56]:
top_retail_germany.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
1109,536527,22809,SET OF 6 T-LIGHTS SANTA,6,12/1/2010 13:04,2.95,12662.0,Germany
1110,536527,84347,ROTATING SILVER ANGELS T-LIGHT HLDR,6,12/1/2010 13:04,2.55,12662.0,Germany
1111,536527,84945,MULTI COLOUR SILVER T-LIGHT HOLDER,12,12/1/2010 13:04,0.85,12662.0,Germany
1112,536527,22242,5 HOOK HANGER MAGIC TOADSTOOL,12,12/1/2010 13:04,1.65,12662.0,Germany
1113,536527,22244,3 HOOK HANGER MAGIC GARDEN,12,12/1/2010 13:04,1.95,12662.0,Germany


### Task 7.
Group top_retail_germany by product code (StockCode). Which of the products were added to the cart most often, except POST?

Note: one order is considered a one-time purchase of any quantity of goods, i.e. without taking into account Quantity.

In [61]:
# Count rows per product code, leaving out the "POST" code, and show the
# single most frequent one. Each row counts once, regardless of Quantity.
(
    top_retail_germany
    .loc[top_retail_germany['StockCode'] != 'POST', 'StockCode']
    .value_counts()
    .head(1)
)

22326    52
Name: StockCode, dtype: int64

### Task 8.
Let's return to the analysis of the full retail dataset. Create a Revenue column with the purchase amount using the Quantity and UnitPrice columns.

In [63]:
# Revenue of each row: number of units multiplied by the price per unit.
retail['Revenue'] = retail['Quantity'].mul(retail['UnitPrice'])

In [64]:
retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,20.34


### Task 9.
For each transaction (InvoiceNo), calculate the total order amount. As the answer, list the top 5 InvoiceNo values by order amount (separated by a comma and a space, in descending order of total Revenue).

In [70]:
# Sum Revenue within each invoice, order the invoices from the largest total
# to the smallest, and list the InvoiceNo values of the first five.
(
    retail
    .groupby('InvoiceNo', as_index=False)
    .agg({'Revenue': 'sum'})
    .sort_values('Revenue', ascending=False)
    .head(5)['InvoiceNo']
    .to_list()
)

['581483', '541431', '574941', '576365', '556444']