In [35]:
import pandas as pd
import numpy as np

stock = pd.DataFrame({
    'item_no': pd.Series([1, 2, 2, 4, 5, 6, 7, 8, 9, 10], dtype='Int64'),
    'cost_class': pd.Series(['1st', '2nd', '3rd', '4th', '4th', '3rd', '2nd', np.nan, '1st', '3rd'], dtype='string'),
    'cost': pd.Series([10.99, np.nan, 2.99, np.nan, 2.99, 2.45, 5.99, 5.99, 3.00, None], dtype='float64'),
    'stock_code': pd.Series(['a', 'a', 'c', 'b', 'a', 'b', np.nan, np.nan, 'a', 'c'], dtype='string'),
    'priority_code': pd.Series([np.nan, None, 'a', 'b', None, 'a', 'e', None, 'a', 'd'], dtype='string'),
    'tax_rate': pd.Series([0, 0, 20, 20, 20, 0, 20, 20, 5, 20])
})

stock

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
0,1,1st,10.99,a,,0
1,2,2nd,,a,,0
2,2,3rd,2.99,c,a,20
3,4,4th,,b,b,20
4,5,4th,2.99,a,,20
5,6,3rd,2.45,b,a,0
6,7,2nd,5.99,,e,20
7,8,,5.99,,,20
8,9,1st,3.0,a,a,5
9,10,3rd,,c,d,20


# 1. Indexing 

In [12]:
# Item No. has double values in there
stock.set_index('item_no', inplace = True)

In [13]:
# Call new table
stock.head()

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
2,3rd,2.99,c,a,20
4,4th,,b,b,20
5,4th,2.99,a,,20


In [7]:
# Check that the new index is unique
stock.index.is_unique

False

In [14]:
# Find the row which is a duplicate
stock.loc[stock.index.duplicated()]

# Tells us that item_no 2 is duplicated

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,3rd,2.99,c,a,20


In [16]:
# Try and access row '2'
stock.loc[2]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,2nd,,a,,0
2,3rd,2.99,c,a,20


In [17]:
# Trying to reset the index doesn't work
stock.index[2 = 3]

# No likey

SyntaxError: invalid syntax (<ipython-input-17-3a6defefb6c4>, line 2)

* Can provide a collection of values that we want as our index
* Use the .rename method to fix it

In [39]:
# Provide new values for an index
stock.index = range(1,11)
stock.index.name = "item_no"
stock.drop('item_no', 1, inplace = True)
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
3,3rd,2.99,c,a,20
4,4th,,b,b,20
5,4th,2.99,a,,20
6,3rd,2.45,b,a,0
7,2nd,5.99,,e,20
8,,5.99,,,20
9,1st,3.0,a,a,5
10,3rd,,c,d,20


### Task: reset the index

In [33]:
# Drop - Do not try to insert index into dataframe columns. This resets 
# the index to the default integer index.

stock.reset_index(drop = False, inplace = True)
stock

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
0,1,1st,10.99,a,,0
1,2,2nd,,a,,0
2,2,3rd,2.99,c,a,20
3,4,4th,,b,b,20
4,5,4th,2.99,a,,20
5,6,3rd,2.45,b,a,0
6,7,2nd,5.99,,e,20
7,8,,5.99,,,20
8,9,1st,3.0,a,a,5
9,10,3rd,,c,d,20


In [34]:
# Change to original
stock.set_index("item_no", inplace = True)
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
2,3rd,2.99,c,a,20
4,4th,,b,b,20
5,4th,2.99,a,,20
6,3rd,2.45,b,a,0
7,2nd,5.99,,e,20
8,,5.99,,,20
9,1st,3.0,a,a,5
10,3rd,,c,d,20


In [40]:
stock.iloc[[0,1,2], :]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
3,3rd,2.99,c,a,20


# Chained Indexing 

In [41]:
stock['cost'][[1,2]]

item_no
1    10.99
2      NaN
Name: cost, dtype: float64

In [42]:
stock.loc[[1,2], 'cost']

item_no
1    10.99
2      NaN
Name: cost, dtype: float64

# Indexing for Alignment

In [44]:
new_series = pd.Series(['a', 'b', 'c', 'd'], index = [2, 3, 4, 6])
new_series

2    a
3    b
4    c
6    d
dtype: object

In [46]:
stock.loc[:, 'new'] = new_series
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,new
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1st,10.99,a,,0,
2,2nd,,a,,0,a
3,3rd,2.99,c,a,20,b
4,4th,,b,b,20,c
5,4th,2.99,a,,20,
6,3rd,2.45,b,a,0,d
7,2nd,5.99,,e,20,
8,,5.99,,,20,
9,1st,3.0,a,a,5,
10,3rd,,c,d,20,


# Indexing: Missing Values

In [47]:
stock.loc[stock.cost.isna()]
# 2,4,10 in the cost column have missing values

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,new
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,2nd,,a,,0,a
4,4th,,b,b,20,c
10,3rd,,c,d,20,


In [48]:
# Find any missing rows
stock.loc[stock.isna().any(axis = 'columns')]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,new
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1st,10.99,a,,0,
2,2nd,,a,,0,a
4,4th,,b,b,20,c
5,4th,2.99,a,,20,
7,2nd,5.99,,e,20,
8,,5.99,,,20,
9,1st,3.0,a,a,5,
10,3rd,,c,d,20,


In [49]:
# Find complete rows
stock.loc[~stock.isna().any(axis = 'columns')]

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,new
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,3rd,2.99,c,a,20,b
6,3rd,2.45,b,a,0,d


In [50]:
stock.notna().all(axis='rows')
# Only tax_rate has no missing values

cost_class       False
cost             False
stock_code       False
priority_code    False
tax_rate          True
new              False
dtype: bool

In [60]:
# Find all complete columns
stock.loc[:, stock.notna().all(axis='rows')]

Unnamed: 0_level_0,tax_rate
item_no,Unnamed: 1_level_1
1,0
2,0
3,20
4,20
5,20
6,0
7,20
8,20
9,5
10,20


In [62]:
# Count the missing values in each column
(
    stock
    .isna()
    .sum(axis = 'rows')
)

cost_class       1
cost             3
stock_code       2
priority_code    4
tax_rate         0
new              6
dtype: int64

In [111]:
stock = pd.DataFrame({
    'item_no': pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='Int64'),
    'cost_class': pd.Series(['1st', '2nd', '3rd', '4th', '4th', '3rd', '2nd', np.nan, '1st', '3rd'], dtype='string'),
    'cost': pd.Series([10.99, np.nan, 2.99, np.nan, 2.99, 2.45, 5.99, 5.99, 3.00, None], dtype='float64'),
    'stock_code': pd.Series(['a', 'a', 'c', 'b', 'a', 'b', np.nan, np.nan, 'a', 'c'], dtype='string'),
    'priority_code': pd.Series([np.nan, None, 'a', 'b', None, 'a', 'e', None, 'a', 'd'], dtype='string'),
    'tax_rate': pd.Series([0, 0, 20, 20, 20, 0, 20, 20, 5, 20])
}).set_index('item_no')

stock.head()

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1st,10.99,a,,0
2,2nd,,a,,0
3,3rd,2.99,c,a,20
4,4th,,b,b,20
5,4th,2.99,a,,20


# Extra Grouping 

In [67]:
stock.groupby('stock_code').mean()

Unnamed: 0_level_0,cost,tax_rate
stock_code,Unnamed: 1_level_1,Unnamed: 2_level_1
a,5.66,6.25
b,2.45,10.0
c,2.99,20.0


In [69]:
# One aggregater at a time
stock.groupby('stock_code').min()

Unnamed: 0_level_0,cost_class,cost,priority_code,tax_rate
stock_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,1st,2.99,a,0
b,3rd,2.45,a,0
c,3rd,2.99,a,20


In [70]:
stock.groupby("stock_code").agg(['count', 'min', 'max'])

Unnamed: 0_level_0,cost_class,cost_class,cost_class,cost,cost,cost,priority_code,priority_code,priority_code,tax_rate,tax_rate,tax_rate
Unnamed: 0_level_1,count,min,max,count,min,max,count,min,max,count,min,max
stock_code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
a,4,1st,4th,3,2.99,10.99,1,a,a,4,0,20
b,2,3rd,4th,1,2.45,2.45,2,a,b,2,0,20
c,2,3rd,3rd,1,2.99,2.99,2,a,d,2,20,20


In [74]:
groupby_stock_code= stock.groupby('stock_code')

multiple_aggs = groupby_stock_code.agg(['count', 'min', 'max'])

multiple_aggs

Unnamed: 0_level_0,cost_class,cost_class,cost_class,cost,cost,cost,priority_code,priority_code,priority_code,tax_rate,tax_rate,tax_rate
Unnamed: 0_level_1,count,min,max,count,min,max,count,min,max,count,min,max
stock_code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
a,4,1st,4th,3,2.99,10.99,1,a,a,4,0,20
b,2,3rd,4th,1,2.45,2.45,2,a,b,2,0,20
c,2,3rd,3rd,1,2.99,2.99,2,a,d,2,20,20


In [75]:
multiple_aggs.columns

MultiIndex([(   'cost_class', 'count'),
            (   'cost_class',   'min'),
            (   'cost_class',   'max'),
            (         'cost', 'count'),
            (         'cost',   'min'),
            (         'cost',   'max'),
            ('priority_code', 'count'),
            ('priority_code',   'min'),
            ('priority_code',   'max'),
            (     'tax_rate', 'count'),
            (     'tax_rate',   'min'),
            (     'tax_rate',   'max')],
           )

In [78]:
multiple_aggs.loc['a', ('cost', ['max', 'min'])]

cost  max    10.99
      min     2.99
Name: a, dtype: object

In [80]:
others_aggs = groupby_stock_code.agg({
    'cost' : ['mean', 'count', 'sum'],
    'cost_class' : ['count'],
    'priority_code' : 'count'
})

others_aggs

Unnamed: 0_level_0,cost,cost,cost,cost_class,priority_code
Unnamed: 0_level_1,mean,count,sum,count,count
stock_code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,5.66,3,16.98,4,1
b,2.45,1,2.45,2,2
c,2.99,1,2.99,2,2


In [83]:
groupby_stock_code.agg(
    total_cost = ('cost', 'sum'),
    no_of_items = ('cost', 'count'),
    mean_tax_rate = ('tax_rate', 'mean')
)

Unnamed: 0_level_0,total_cost,no_of_items,mean_tax_rate
stock_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,16.98,3,6.25
b,2.45,1,10.0
c,2.99,1,20.0


## Taskles

#### Write code for the following:
* #### Group the stock DataFrame by cost_class and stock_code, and then calculate the mean of all numerical columns.
* #### Extract the mean(cost) for items in the 3rd cost_class with stock_code 'c'
* #### Interpret what you see if you also pass dropna=False into .groupby()

In [97]:
groupby_2= stock.groupby(['cost_class', 'stock_code'])

multiple_aggs_2 = groupby_2.agg(['mean'])

multiple_aggs_2

Unnamed: 0_level_0,Unnamed: 1_level_0,cost,tax_rate
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean
cost_class,stock_code,Unnamed: 2_level_2,Unnamed: 3_level_2
1st,a,6.995,2.5
2nd,a,,0.0
3rd,b,2.45,0.0
3rd,c,2.99,20.0
4th,a,2.99,20.0
4th,b,,20.0


In [101]:
multiple_aggs_2.loc[('3rd', 'c'), 'cost']

# Uses a Tuple for the groups - so need to pass it through normal brackets

mean    2.99
Name: (3rd, c), dtype: float64

In [103]:
stock.groupby(['cost_class', 'stock_code'], dropna = False).mean()

# Handy for survey data - as we group people who didn't answer certain questions

Unnamed: 0_level_0,Unnamed: 1_level_0,cost,tax_rate
cost_class,stock_code,Unnamed: 2_level_1,Unnamed: 3_level_1
1st,a,6.995,2.5
2nd,a,,0.0
2nd,,5.99,20.0
3rd,b,2.45,0.0
3rd,c,2.99,20.0
4th,a,2.99,20.0
4th,b,,20.0
,,5.99,20.0


In [112]:
stock = stock.assign(next_year = 2023, checked = True)
# columns don't persist

stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,next_year,checked
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st,10.99,a,,0,2023,True
2,2nd,,a,,0,2023,True
3,3rd,2.99,c,a,20,2023,True
4,4th,,b,b,20,2023,True
5,4th,2.99,a,,20,2023,True
6,3rd,2.45,b,a,0,2023,True
7,2nd,5.99,,e,20,2023,True
8,,5.99,,,20,2023,True
9,1st,3.0,a,a,5,2023,True
10,3rd,,c,d,20,2023,True


In [114]:
adjust_lookup = {
    '1st': 12.5,
    '2nd': 5,
    '3rd': 0,
    '4th': -5,
    pd.NA: np.nan
}
adjust_lookup

{'1st': 12.5, '2nd': 5, '3rd': 0, '4th': -5, <NA>: nan}

In [115]:
adjust_lookup.get('4th', np.nan)

-5

In [116]:
adjust_lookup.get('5th', np.nan)

nan

In [119]:
# Add a new cost adjustment column based on the lookup_table

stock.loc[:, 'cost_adjustment'] = \
[adjust_lookup.get(i, np.nan) for i in stock.cost_class]

stock


Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,next_year,checked,cost_adjustment
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1st,10.99,a,,0,2023,True,12.5
2,2nd,,a,,0,2023,True,5.0
3,3rd,2.99,c,a,20,2023,True,0.0
4,4th,,b,b,20,2023,True,-5.0
5,4th,2.99,a,,20,2023,True,-5.0
6,3rd,2.45,b,a,0,2023,True,0.0
7,2nd,5.99,,e,20,2023,True,5.0
8,,5.99,,,20,2023,True,
9,1st,3.0,a,a,5,2023,True,12.5
10,3rd,,c,d,20,2023,True,0.0


# Adding columns from others columns / using round

In [123]:
# Add a column that computes cont including tax
stock.loc[:, 'cost_inc_tax'] = \
stock.cost + stock.tax_rate * stock.cost / 100

stock.loc[:, 'cost_inc_tax'] = np.round(stock.cost_inc_tax, decimals = 2)
stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,next_year,checked,cost_adjustment,cost_inc_tax
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1st,10.99,a,,0,2023,True,12.5,10.99
2,2nd,,a,,0,2023,True,5.0,
3,3rd,2.99,c,a,20,2023,True,0.0,3.59
4,4th,,b,b,20,2023,True,-5.0,
5,4th,2.99,a,,20,2023,True,-5.0,3.59
6,3rd,2.45,b,a,0,2023,True,0.0,2.45
7,2nd,5.99,,e,20,2023,True,5.0,7.19
8,,5.99,,,20,2023,True,,7.19
9,1st,3.0,a,a,5,2023,True,12.5,3.15
10,3rd,,c,d,20,2023,True,0.0,


# Joining, Melting and Pivoting

## Joining - merge()

* specify data-sets
* the join you want
* the key/column you join on

In [147]:
stock = pd.DataFrame({
    'item_no': pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='Int64'),
    'cost_class': pd.Series(['1st', '2nd', '3rd', '4th', '4th', '3rd', '2nd', np.nan, '1st', '3rd'], dtype='string'),
    'cost': pd.Series([10.99, np.nan, 2.99, np.nan, 2.99, 2.45, 5.99, 5.99, 3.00, None], dtype='float64'),
    'stock_code': pd.Series(['a', 'a', 'c', 'b', 'a', 'b', np.nan, np.nan, 'a', 'c'], dtype='string'),
    'priority_code': pd.Series([np.nan, None, 'a', 'b', None, 'a', 'e', None, 'a', 'd'], dtype='string'),
    'tax_rate': pd.Series([0, 0, 20, 20, 20, 0, 20, 20, 5, 20])
}).set_index('item_no')

feedback = pd.DataFrame({
    'item_no': pd.Series([2, 2, 3, 4, 5, 1, 9, 5, 7, 10, np.nan], dtype='Int64'),
    'date': pd.Series(['2020-04-11', '2020-04-12', '2020-05-13', np.nan, '2020-05-28', '2020-05-29',
                       '2020-06-01', '2020-06-07', '2020-06-300', '2020-06-30', '2020-08-01']),
    'rating': pd.Series([5, 1, 3, 5, 4, 3, 2, 5, 1, 4, 5], dtype='Int64'),
    'message': pd.Series(["Ideal for my lunchbox - Dave Smith", "Broke first time I used it, I want a refund! Get back to me at lenore29@gmail.com or 07700 900796",
                        "My name is Tony 07700900829", "Bought another one for my sister", "Works pretty well, but can't handle carrots", 
                        "The concept is great, the execution- not so great, thin handles - Eleanor & dave", np.nan,
                        "Arrived on time, as expected", "Customer service terrible - hello anyone there?! DaveAllsop@yahoo.co.uk, 07700 900572 or 0131 9496 0886", 
                        "Workks well, seems solid, good value", "Great finish on it, really decent build quality"], dtype='string')
})

sales = pd.DataFrame({
    'item_number': pd.Series([1, 2, 3, 5, 6, 7, 8, 9, 10], dtype='Int64'),
    'target_class': pd.Series(['a', 'b', 'b', 'c', 'c', 'b', 'a', 'a', 'a']),
    'days_in_reduction': pd.Series([0, 7, 14, 14, 0, 0, 7, 14, 30]),
    'days_sales_0_50':   pd.Series([120, 19, 282, 210, 194, 101, 298, 187, 103], dtype='Int64'),
    'days_sales_51_100': pd.Series([141, 341, 22, np.nan, 112, 87, 54, 130, 105], dtype='Int64'),
    'days_sales_101plus':   pd.Series([99, np.nan, 16, 49, 54, 130, np.nan, 23, 152], dtype='Int64')
})

In [136]:
stock.merge(feedback, how = 'inner', on = 'item_no')

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,date,rating,message
0,1,1st,10.99,a,,0,2020-05-29,3,"The concept is great, the execution- not so gr..."
1,2,2nd,,a,,0,2020-04-11,5,Ideal for my lunchbox - Dave Smith
2,2,2nd,,a,,0,2020-04-12,1,"Broke first time I used it, I want a refund! G..."
3,3,3rd,2.99,c,a,20,2020-05-13,3,My name is Tony 07700900829
4,4,4th,,b,b,20,,5,Bought another one for my sister
5,5,4th,2.99,a,,20,2020-05-28,4,"Works pretty well, but can't handle carrots"
6,5,4th,2.99,a,,20,2020-06-07,5,"Arrived on time, as expected"
7,7,2nd,5.99,,e,20,2020-06-300,1,Customer service terrible - hello anyone there...
8,9,1st,3.0,a,a,5,2020-06-01,2,
9,10,3rd,,c,d,20,2020-06-30,4,"Workks well, seems solid, good value"


How can we amend the code for the following:

* Get the details of all items for which feedback has been left, together with feedback rating and date

In [138]:
stock.merge(feedback.loc[:,['item_no','date', 'rating']], how='left', on='item_no')

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,date,rating
0,1,1st,10.99,a,,0,2020-05-29,3
1,2,2nd,,a,,0,2020-04-11,5
2,2,2nd,,a,,0,2020-04-12,1
3,3,3rd,2.99,c,a,20,2020-05-13,3
4,4,4th,,b,b,20,,5
5,5,4th,2.99,a,,20,2020-05-28,4
6,5,4th,2.99,a,,20,2020-06-07,5
7,7,2nd,5.99,,e,20,2020-06-300,1
8,9,1st,3.0,a,a,5,2020-06-01,2
9,10,3rd,,c,d,20,2020-06-30,4


In [150]:
# Taskles

# sales = sales.rename(columns = {'item_number' : 'item_no'})
(
    stock
    .reset_index()
    .merge(sales, how = 'left', left_on = 'item_no', right_on = 'item_number')
    .drop(columns = 'item_number')
)


Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,target_class,days_in_reduction,days_sales_0_50,days_sales_51_100,days_sales_101plus
0,1,1st,10.99,a,,0,a,0.0,120.0,141.0,99.0
1,2,2nd,,a,,0,b,7.0,19.0,341.0,
2,3,3rd,2.99,c,a,20,b,14.0,282.0,22.0,16.0
3,4,4th,,b,b,20,,,,,
4,5,4th,2.99,a,,20,c,14.0,210.0,,49.0
5,6,3rd,2.45,b,a,0,c,0.0,194.0,112.0,54.0
6,7,2nd,5.99,,e,20,b,0.0,101.0,87.0,130.0
7,8,,5.99,,,20,a,7.0,298.0,54.0,
8,9,1st,3.0,a,a,5,a,14.0,187.0,130.0,23.0
9,10,3rd,,c,d,20,a,30.0,103.0,105.0,152.0


# Joining after Grouping

In [152]:
mean_cost_by_stock_code = stock.groupby('stock_code').agg(mean_cost_by_stock_code=('cost', 'mean'))
mean_cost_by_stock_code

Unnamed: 0_level_0,mean_cost_by_stock_code
stock_code,Unnamed: 1_level_1
a,5.66
b,2.45
c,2.99


Add for each item in stock a columns mean_cost_by_stock_code containing the mean cost for all items with the same stock_code

In [154]:
stock.merge(mean_cost_by_stock_code, how = 'left', on = 'stock_code')

Unnamed: 0,cost_class,cost,stock_code,priority_code,tax_rate,mean_cost_by_stock_code
0,1st,10.99,a,,0,5.66
1,2nd,,a,,0,5.66
2,3rd,2.99,c,a,20,2.99
3,4th,,b,b,20,2.45
4,4th,2.99,a,,20,5.66
5,3rd,2.45,b,a,0,2.45
6,2nd,5.99,,e,20,
7,,5.99,,,20,
8,1st,3.0,a,a,5,5.66
9,3rd,,c,d,20,2.99


# . transform()

Let's use it to add a new column sum_cost_by_cost_class which will contain, for each item, the sum of the costs of items in the same cost_class as itself

In [160]:
stock.loc[:, 'sum_cost_by_cost_class'] = \
stock.groupby('cost_class').cost.transform('sum')

stock

Unnamed: 0_level_0,cost_class,cost,stock_code,priority_code,tax_rate,sum_cost_by_cost_class
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1st,10.99,a,,0,13.99
2,2nd,,a,,0,5.99
3,3rd,2.99,c,a,20,5.44
4,4th,,b,b,20,2.99
5,4th,2.99,a,,20,2.99
6,3rd,2.45,b,a,0,5.44
7,2nd,5.99,,e,20,5.99
8,,5.99,,,20,
9,1st,3.0,a,a,5,13.99
10,3rd,,c,d,20,5.44


# 12. Tidy Data

In [162]:
sales.head()

Unnamed: 0,item_number,target_class,days_in_reduction,days_sales_0_50,days_sales_51_100,days_sales_101plus
0,1,a,0,120,141.0,99.0
1,2,b,7,19,341.0,
2,3,b,14,282,22.0,16.0
3,5,c,14,210,,49.0
4,6,c,0,194,112.0,54.0


## Melting: Wide to Long

* id_vars = these are the columns that WONT be melted
* var_name = new column name for your columns
* value_name = new column name for your values

In [179]:
sales_melted = sales.melt(
    id_vars = ['item_number', 'target_class', 'days_in_reduction'],
    var_name = 'days_sales_class',
    value_name = 'no_of_days'
)

sales_melted.head()

Unnamed: 0,item_number,target_class,days_in_reduction,days_sales_class,no_of_days
0,1,a,0,days_sales_0_50,120
1,2,b,7,days_sales_0_50,19
2,3,b,14,days_sales_0_50,282
3,5,c,14,days_sales_0_50,210
4,6,c,0,days_sales_0_50,194


In [172]:
sales_melted.loc[:, 'days_sales_class'] = \
sales_melted.days_sales_class.replace({
    'days_sales_0_50':'0-50',
    'days_sales_51_100' : '51-100',
    'days_sales_101plus' : '101+'
})

sales_melted

Unnamed: 0,item_number,target_class,days_in_reduction,days_sales_class,no_of_days
0,1,a,0,0-50,120.0
1,2,b,7,0-50,19.0
2,3,b,14,0-50,282.0
3,5,c,14,0-50,210.0
4,6,c,0,0-50,194.0
5,7,b,0,0-50,101.0
6,8,a,7,0-50,298.0
7,9,a,14,0-50,187.0
8,10,a,30,0-50,103.0
9,1,a,0,51-100,141.0


## Pivoting: Long to Wide

* index = columns that WONT be pivoted
* columns = new column headers
* values = where the data is coming on

In [180]:
sales_pivoted = sales_melted.pivot(
    index = ['item_number', 'target_class', 'days_in_reduction'],
    columns = 'days_sales_class',
    values = 'no_of_days'
)

sales_pivoted

Unnamed: 0_level_0,Unnamed: 1_level_0,days_sales_class,days_sales_0_50,days_sales_101plus,days_sales_51_100
item_number,target_class,days_in_reduction,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,a,0,120,99.0,141.0
2,b,7,19,,341.0
3,b,14,282,16.0,22.0
5,c,14,210,49.0,
6,c,0,194,54.0,112.0
7,b,0,101,130.0,87.0
8,a,7,298,,54.0
9,a,14,187,23.0,130.0
10,a,30,103,152.0,105.0


KeyError: 'days_sales_class'