In [528]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np
import scipy.stats
import statistics
from scipy.stats import sem
from scipy.stats import t


# Problem: What is the number of shoes that are likely to be sold, based on historical data

* Theoretical Problem: examine the problem of having too much inventory on-hand for this given store. (Sample Data)
* Find the 95% confidence interval using the last 12 months of sales 
* only for men's shoes
* only for the US (since both of these are identical in both genders and locations)

* Examining data for Al Bundy's Shoe Shop

In [529]:
sales_data = pd.read_csv('/Users/joseservin/DataCamp/Projects/Stats_for_DataScience/sales_data.csv')
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14967 entries, 0 to 14966
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   InvoiceNo      14967 non-null  int64  
 1   Date           14967 non-null  object 
 2   Country        14967 non-null  object 
 3   ProductID      14967 non-null  int64  
 4   Shop           14967 non-null  object 
 5   Gender         14967 non-null  object 
 6   Size (US)      14967 non-null  float64
 7   Size (Europe)  14967 non-null  object 
 8   Size (UK)      14967 non-null  float64
 9   UnitPrice      14967 non-null  object 
 10  Discount       14967 non-null  object 
 11  Year           14967 non-null  int64  
 12  Month          14967 non-null  int64  
 13  SalePrice      14967 non-null  object 
dtypes: float64(2), int64(4), object(8)
memory usage: 1.6+ MB


In [530]:
sales_data.head()

Unnamed: 0,InvoiceNo,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,Year,Month,SalePrice
0,52389,1/1/14,United Kingdom,2152,UK2,Male,11.0,44,10.5,$159.00,0%,2014,1,$159.00
1,52390,1/1/14,United States,2230,US15,Male,11.5,44-45,11.0,$199.00,20%,2014,1,$159.20
2,52391,1/1/14,Canada,2160,CAN7,Male,9.5,42-43,9.0,$149.00,20%,2014,1,$119.20
3,52392,1/1/14,United States,2234,US6,Female,9.5,40,7.5,$159.00,0%,2014,1,$159.00
4,52393,1/1/14,United Kingdom,2222,UK4,Female,9.0,39-40,7.0,$159.00,0%,2014,1,$159.00


* To get to know the data, look at sub-groups 

* Segment the data by 
    * shoe size
    * country 
    * gender
    

# Creating mens_data DF

In [531]:
mens_data = sales_data.query('Gender == "Male"')

In [532]:
mens_data.head()

Unnamed: 0,InvoiceNo,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,Year,Month,SalePrice
0,52389,1/1/14,United Kingdom,2152,UK2,Male,11.0,44,10.5,$159.00,0%,2014,1,$159.00
1,52390,1/1/14,United States,2230,US15,Male,11.5,44-45,11.0,$199.00,20%,2014,1,$159.20
2,52391,1/1/14,Canada,2160,CAN7,Male,9.5,42-43,9.0,$149.00,20%,2014,1,$119.20
5,52394,1/1/14,United States,2173,US15,Male,10.5,43-44,10.0,$159.00,0%,2014,1,$159.00
7,52396,1/2/14,Canada,2238,CAN5,Male,10.0,43,9.5,$169.00,0%,2014,1,$169.00


In [533]:
mens_data.pivot_table(columns='Country', index='Size (US)', values= ['Size (UK)'], aggfunc='count', margins=True)

Unnamed: 0_level_0,Size (UK),Size (UK),Size (UK),Size (UK),Size (UK)
Country,Canada,Germany,United Kingdom,United States,All
Size (US),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
6.0,15,30,6,54,105
6.5,15,18,12,45,90
7.0,24,30,21,39,114
7.5,45,48,12,66,171
8.0,51,117,45,141,354
8.5,192,174,87,225,678
9.0,324,348,183,492,1347
9.5,375,549,225,741,1890
10.0,237,411,156,543,1347
10.5,243,453,150,462,1308


# Creating Women's DataFrame

In [534]:
womens_data = sales_data.query('Gender == "Female"')

In [535]:
womens_data.pivot_table(columns='Country', index='Size (US)', values= ['Size (UK)'], aggfunc='count', margins=True)

Unnamed: 0_level_0,Size (UK),Size (UK),Size (UK),Size (UK),Size (UK)
Country,Canada,Germany,United Kingdom,United States,All
Size (US),Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
4.5,6,9,15,21,51
5.0,6,12,9,9,36
5.5,6,9,6,42,63
6.0,21,15,12,33,81
6.5,51,84,24,93,252
7.0,93,156,27,147,423
7.5,153,222,87,318,780
8.0,192,324,168,618,1302
8.5,171,339,129,399,1038
9.0,213,264,93,384,954


# Looking at mens data in the US for year 2016

In [536]:
mens_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8919 entries, 0 to 14964
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   InvoiceNo      8919 non-null   int64  
 1   Date           8919 non-null   object 
 2   Country        8919 non-null   object 
 3   ProductID      8919 non-null   int64  
 4   Shop           8919 non-null   object 
 5   Gender         8919 non-null   object 
 6   Size (US)      8919 non-null   float64
 7   Size (Europe)  8919 non-null   object 
 8   Size (UK)      8919 non-null   float64
 9   UnitPrice      8919 non-null   object 
 10  Discount       8919 non-null   object 
 11  Year           8919 non-null   int64  
 12  Month          8919 non-null   int64  
 13  SalePrice      8919 non-null   object 
dtypes: float64(2), int64(4), object(8)
memory usage: 1.0+ MB


In [537]:
mens_2016 = mens_data.query('Year == 2016 & Country == "United States"')

In [538]:
mens_2016.head()

Unnamed: 0,InvoiceNo,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,Year,Month,SalePrice
7617,59213,1/2/16,United States,2228,US13,Male,9.5,42-43,9.0,$179.00,30%,2016,1,$125.30
7619,59214,1/2/16,United States,2236,US6,Male,8.5,41-42,8.0,$189.00,20%,2016,1,$151.20
7629,59223,1/3/16,United States,2158,US3,Male,8.0,41,7.5,$159.00,0%,2016,1,$159.00
7631,59225,1/3/16,United States,2236,US13,Male,8.0,41,7.5,$129.00,0%,2016,1,$129.00
7632,59226,1/3/16,United States,2207,US14,Male,14.0,47,13.5,$169.00,0%,2016,1,$169.00


In [539]:
mens_2016.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1677 entries, 7617 to 14961
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   InvoiceNo      1677 non-null   int64  
 1   Date           1677 non-null   object 
 2   Country        1677 non-null   object 
 3   ProductID      1677 non-null   int64  
 4   Shop           1677 non-null   object 
 5   Gender         1677 non-null   object 
 6   Size (US)      1677 non-null   float64
 7   Size (Europe)  1677 non-null   object 
 8   Size (UK)      1677 non-null   float64
 9   UnitPrice      1677 non-null   object 
 10  Discount       1677 non-null   object 
 11  Year           1677 non-null   int64  
 12  Month          1677 non-null   int64  
 13  SalePrice      1677 non-null   object 
dtypes: float64(2), int64(4), object(8)
memory usage: 196.5+ KB


In [540]:
mens_2016['Month'] = mens_2016['Month'].astype('object')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mens_2016['Month'] = mens_2016['Month'].astype('object')


In [541]:
mens_2016_pivot = mens_2016.pivot_table(columns='Month', index='Size (US)', aggfunc='size', fill_value=0)
mens_2016_pivot

Month,1,2,3,4,5,6,7,8,9,10,11,12
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
6.0,4,1,3,1,3,3,3,4,3,7,3,0
6.5,3,2,0,1,0,0,1,7,2,1,2,1
7.0,0,0,1,0,6,4,4,2,3,0,0,0
7.5,3,2,3,1,7,0,7,3,4,6,1,1
8.0,7,9,7,3,12,2,9,4,7,5,2,6
8.5,12,12,8,8,15,9,17,17,6,9,10,6
9.0,17,13,13,11,21,22,25,30,26,25,13,10
9.5,19,25,27,24,26,33,25,47,31,44,37,26
10.0,17,26,26,19,16,31,25,24,23,31,15,20
10.5,13,16,22,14,28,19,18,15,19,21,16,10


In [542]:
mens_2016_pivot['mean'] = mens_2016_pivot.mean(axis=1)
mens_2016_pivot

Month,1,2,3,4,5,6,7,8,9,10,11,12,mean
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6.0,4,1,3,1,3,3,3,4,3,7,3,0,2.916667
6.5,3,2,0,1,0,0,1,7,2,1,2,1,1.666667
7.0,0,0,1,0,6,4,4,2,3,0,0,0,1.666667
7.5,3,2,3,1,7,0,7,3,4,6,1,1,3.166667
8.0,7,9,7,3,12,2,9,4,7,5,2,6,6.083333
8.5,12,12,8,8,15,9,17,17,6,9,10,6,10.75
9.0,17,13,13,11,21,22,25,30,26,25,13,10,18.833333
9.5,19,25,27,24,26,33,25,47,31,44,37,26,30.333333
10.0,17,26,26,19,16,31,25,24,23,31,15,20,22.75
10.5,13,16,22,14,28,19,18,15,19,21,16,10,17.583333


In [543]:
mens_2016_pivot.loc['Total'] = mens_2016_pivot.sum()

In [544]:
mens_2016_pivot

Month,1,2,3,4,5,6,7,8,9,10,11,12,mean
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6.0,4.0,1.0,3.0,1.0,3.0,3.0,3.0,4.0,3.0,7.0,3.0,0.0,2.916667
6.5,3.0,2.0,0.0,1.0,0.0,0.0,1.0,7.0,2.0,1.0,2.0,1.0,1.666667
7.0,0.0,0.0,1.0,0.0,6.0,4.0,4.0,2.0,3.0,0.0,0.0,0.0,1.666667
7.5,3.0,2.0,3.0,1.0,7.0,0.0,7.0,3.0,4.0,6.0,1.0,1.0,3.166667
8.0,7.0,9.0,7.0,3.0,12.0,2.0,9.0,4.0,7.0,5.0,2.0,6.0,6.083333
8.5,12.0,12.0,8.0,8.0,15.0,9.0,17.0,17.0,6.0,9.0,10.0,6.0,10.75
9.0,17.0,13.0,13.0,11.0,21.0,22.0,25.0,30.0,26.0,25.0,13.0,10.0,18.833333
9.5,19.0,25.0,27.0,24.0,26.0,33.0,25.0,47.0,31.0,44.0,37.0,26.0,30.333333
10.0,17.0,26.0,26.0,19.0,16.0,31.0,25.0,24.0,23.0,31.0,15.0,20.0,22.75
10.5,13.0,16.0,22.0,14.0,28.0,19.0,18.0,15.0,19.0,21.0,16.0,10.0,17.583333


# Statistical Calcs for Confidence Intervals

* we will calculate one confidence interval for each size

* 2016 Data
    * n = 12
    * T-Statistic is for 11 D.F and a/2 = 0.025 (T 11,0.025) = 2.20

In [545]:
mens_2016_pivot

Month,1,2,3,4,5,6,7,8,9,10,11,12,mean
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6.0,4.0,1.0,3.0,1.0,3.0,3.0,3.0,4.0,3.0,7.0,3.0,0.0,2.916667
6.5,3.0,2.0,0.0,1.0,0.0,0.0,1.0,7.0,2.0,1.0,2.0,1.0,1.666667
7.0,0.0,0.0,1.0,0.0,6.0,4.0,4.0,2.0,3.0,0.0,0.0,0.0,1.666667
7.5,3.0,2.0,3.0,1.0,7.0,0.0,7.0,3.0,4.0,6.0,1.0,1.0,3.166667
8.0,7.0,9.0,7.0,3.0,12.0,2.0,9.0,4.0,7.0,5.0,2.0,6.0,6.083333
8.5,12.0,12.0,8.0,8.0,15.0,9.0,17.0,17.0,6.0,9.0,10.0,6.0,10.75
9.0,17.0,13.0,13.0,11.0,21.0,22.0,25.0,30.0,26.0,25.0,13.0,10.0,18.833333
9.5,19.0,25.0,27.0,24.0,26.0,33.0,25.0,47.0,31.0,44.0,37.0,26.0,30.333333
10.0,17.0,26.0,26.0,19.0,16.0,31.0,25.0,24.0,23.0,31.0,15.0,20.0,22.75
10.5,13.0,16.0,22.0,14.0,28.0,19.0,18.0,15.0,19.0,21.0,16.0,10.0,17.583333


In [546]:
mens_2016_pivot = mens_2016.pivot_table(columns='Month', index='Size (US)', aggfunc='size', fill_value=0)
mens_2016_pivot

Month,1,2,3,4,5,6,7,8,9,10,11,12
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
6.0,4,1,3,1,3,3,3,4,3,7,3,0
6.5,3,2,0,1,0,0,1,7,2,1,2,1
7.0,0,0,1,0,6,4,4,2,3,0,0,0
7.5,3,2,3,1,7,0,7,3,4,6,1,1
8.0,7,9,7,3,12,2,9,4,7,5,2,6
8.5,12,12,8,8,15,9,17,17,6,9,10,6
9.0,17,13,13,11,21,22,25,30,26,25,13,10
9.5,19,25,27,24,26,33,25,47,31,44,37,26
10.0,17,26,26,19,16,31,25,24,23,31,15,20
10.5,13,16,22,14,28,19,18,15,19,21,16,10


In [547]:
df_list = mens_2016_pivot.values.tolist()
df_list

[[4, 1, 3, 1, 3, 3, 3, 4, 3, 7, 3, 0],
 [3, 2, 0, 1, 0, 0, 1, 7, 2, 1, 2, 1],
 [0, 0, 1, 0, 6, 4, 4, 2, 3, 0, 0, 0],
 [3, 2, 3, 1, 7, 0, 7, 3, 4, 6, 1, 1],
 [7, 9, 7, 3, 12, 2, 9, 4, 7, 5, 2, 6],
 [12, 12, 8, 8, 15, 9, 17, 17, 6, 9, 10, 6],
 [17, 13, 13, 11, 21, 22, 25, 30, 26, 25, 13, 10],
 [19, 25, 27, 24, 26, 33, 25, 47, 31, 44, 37, 26],
 [17, 26, 26, 19, 16, 31, 25, 24, 23, 31, 15, 20],
 [13, 16, 22, 14, 28, 19, 18, 15, 19, 21, 16, 10],
 [5, 16, 13, 10, 10, 11, 15, 8, 9, 7, 6, 7],
 [4, 3, 6, 3, 3, 5, 6, 4, 5, 12, 13, 5],
 [3, 0, 0, 4, 4, 4, 3, 12, 4, 9, 2, 1],
 [1, 1, 2, 0, 3, 2, 1, 0, 0, 4, 3, 2],
 [2, 6, 3, 3, 5, 3, 2, 1, 0, 1, 2, 1],
 [0, 0, 0, 1, 1, 0, 4, 0, 0, 0, 0, 2]]

In [548]:
row_list = mens_2016_pivot.values.tolist()
for i in row_list:
    row_std_dev = statistics.stdev(i)
    sqrt_n = sem(i)
    print(sqrt_n)

0.5143152749240509
0.5550502752731578
0.6071959063489053
0.6944949476585357
0.882990213519786
1.122260975428486
1.9688226497738759
2.447427014212884
1.5673032233551345
1.3676917810971916
1.0084867151060815
0.9623597890790968
1.0137937550497034
0.3785605378378481
0.4993682878083938
0.3553345272593508


## Mean_Confidence_Interval function

In [549]:
def mean_confidence_interval(df, confidence=0.95):
    """Compute the confidence interval row wise for DF"""
    row_list = df.values.tolist()
    con_level_list = []
    for i in row_list:
        mean = sum(i) / len(i)
        std_dev = statistics.stdev(i)
        dof = len(i) - 1
        t_crit = np.abs(t.ppf((1-confidence)/2,dof))
        con_level = (mean-std_dev*t_crit/np.sqrt(len(i)), mean+std_dev*t_crit/np.sqrt(len(i)))
        con_level_list.append(con_level)

    con_level_df = pd.DataFrame(con_level_list, columns=['Left Confidence Interval', 'Right Confidence Interval'])
    return con_level_df
con_level_df = mean_confidence_interval(mens_2016_pivot)

print(con_level_df)

    Left Confidence Interval  Right Confidence Interval
0                   1.784666                   4.048667
1                   0.445009                   2.888324
2                   0.330237                   3.003096
3                   1.638094                   4.695240
4                   4.139885                   8.026782
5                   8.279920                  13.220080
6                  14.499984                  23.166683
7                  24.946583                  35.720084
8                  19.300389                  26.199611
9                  14.573064                  20.593603
10                  7.530336                  11.969664
11                  3.631860                   7.868140
12                  1.601988                   6.064678
13                  0.750127                   2.416539
14                  1.317564                   3.515769
15                 -0.115419                   1.448753


In [550]:
con_level_df.set_index(mens_2016_pivot.index, inplace=True)

In [551]:
results = pd.concat([mens_2016_pivot, con_level_df], axis=1)

In [552]:
results

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,Left Confidence Interval,Right Confidence Interval
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6.0,4,1,3,1,3,3,3,4,3,7,3,0,1.784666,4.048667
6.5,3,2,0,1,0,0,1,7,2,1,2,1,0.445009,2.888324
7.0,0,0,1,0,6,4,4,2,3,0,0,0,0.330237,3.003096
7.5,3,2,3,1,7,0,7,3,4,6,1,1,1.638094,4.69524
8.0,7,9,7,3,12,2,9,4,7,5,2,6,4.139885,8.026782
8.5,12,12,8,8,15,9,17,17,6,9,10,6,8.27992,13.22008
9.0,17,13,13,11,21,22,25,30,26,25,13,10,14.499984,23.166683
9.5,19,25,27,24,26,33,25,47,31,44,37,26,24.946583,35.720084
10.0,17,26,26,19,16,31,25,24,23,31,15,20,19.300389,26.199611
10.5,13,16,22,14,28,19,18,15,19,21,16,10,14.573064,20.593603


* The upper bound of the Confidence Interval shows us the maximum number of pairs needed. 

In [553]:
results['Number_of_Pairs'] = np.ceil(results['Right Confidence Interval']).astype(int)

In [554]:
results

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,Left Confidence Interval,Right Confidence Interval,Number_of_Pairs
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6.0,4,1,3,1,3,3,3,4,3,7,3,0,1.784666,4.048667,5
6.5,3,2,0,1,0,0,1,7,2,1,2,1,0.445009,2.888324,3
7.0,0,0,1,0,6,4,4,2,3,0,0,0,0.330237,3.003096,4
7.5,3,2,3,1,7,0,7,3,4,6,1,1,1.638094,4.69524,5
8.0,7,9,7,3,12,2,9,4,7,5,2,6,4.139885,8.026782,9
8.5,12,12,8,8,15,9,17,17,6,9,10,6,8.27992,13.22008,14
9.0,17,13,13,11,21,22,25,30,26,25,13,10,14.499984,23.166683,24
9.5,19,25,27,24,26,33,25,47,31,44,37,26,24.946583,35.720084,36
10.0,17,26,26,19,16,31,25,24,23,31,15,20,19.300389,26.199611,27
10.5,13,16,22,14,28,19,18,15,19,21,16,10,14.573064,20.593603,21


# Problem: by how much one shop outperforms the other in terms of sales?

* using data for Women in the year 2016 for shops GER1 and GER2
* assuming the samples are independent
* we don't know population varience but can assume it is equal

In [555]:
sales_data.head()

Unnamed: 0,InvoiceNo,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,Year,Month,SalePrice
0,52389,1/1/14,United Kingdom,2152,UK2,Male,11.0,44,10.5,$159.00,0%,2014,1,$159.00
1,52390,1/1/14,United States,2230,US15,Male,11.5,44-45,11.0,$199.00,20%,2014,1,$159.20
2,52391,1/1/14,Canada,2160,CAN7,Male,9.5,42-43,9.0,$149.00,20%,2014,1,$119.20
3,52392,1/1/14,United States,2234,US6,Female,9.5,40,7.5,$159.00,0%,2014,1,$159.00
4,52393,1/1/14,United Kingdom,2222,UK4,Female,9.0,39-40,7.0,$159.00,0%,2014,1,$159.00


In [556]:
women_ger1 = sales_data.query('Gender == "Female" & Year == 2016 & Shop == "GER1"')

In [557]:
women_ger2 = sales_data.query('Gender == "Female" & Year == 2016 & Shop == "GER2"')

## Stats for women_ger1

In [558]:
women_ger1.head()

Unnamed: 0,InvoiceNo,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,Year,Month,SalePrice
7603,59199,1/1/16,Germany,2173,GER1,Female,8.5,39,6.5,$149.00,20%,2016,1,$119.20
7606,59202,1/2/16,Germany,2205,GER1,Female,7.5,38,5.5,$159.00,0%,2016,1,$159.00
7612,59208,1/2/16,Germany,2195,GER1,Female,8.5,39,6.5,$179.00,0%,2016,1,$179.00
7620,59215,1/2/16,Germany,2231,GER1,Female,8.0,38-39,6.0,$189.00,30%,2016,1,$132.30
7644,59236,1/4/16,Germany,2229,GER1,Female,6.5,37,4.5,$149.00,50%,2016,1,$74.50


In [559]:
women_ger1= women_ger1.pivot_table(columns='Month', index='Size (US)', aggfunc='size', fill_value=0)
women_ger1

Month,1,2,3,4,5,6,7,8,9,10,11,12
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4.5,0,0,0,0,1,3,0,0,0,0,1,0
5.0,0,0,0,0,0,0,2,0,0,0,0,0
5.5,0,0,0,0,0,0,0,0,0,0,1,0
6.0,0,2,0,0,0,0,0,0,0,0,0,0
6.5,3,3,1,2,1,0,2,0,2,1,3,4
7.0,0,3,3,4,1,0,1,0,2,0,0,1
7.5,1,2,4,1,2,6,4,3,5,8,2,1
8.0,6,10,3,9,1,3,6,8,3,12,3,9
8.5,10,10,10,7,14,4,7,7,4,8,7,9
9.0,1,3,8,6,3,1,4,4,0,2,4,2


## Function returning values needed to calculate pooled variance between two DataFrames

In [560]:
def stats_needed(df):
    """Compute the mean and sample variance of a given row from DF"""
    # initialize three new lists 
    n = []
    mean_list = []
    sample_var = []

    # iterate through rows of DF 
    row_list = df.values.tolist()
    for i in row_list:
        mean = sum(i) / len(i)
        mean_list.append(mean)

        i_var = statistics.variance(i)
        sample_var.append(i_var)

        n.append(len(i))
    
    # Build out DataFrames
    len_df = pd.DataFrame(n, columns=['sample_size'])
    mean_df = pd.DataFrame(mean_list, columns=['sample_mean'])
    var_df = pd.DataFrame(sample_var, columns=['sample_var'])

    # Return one DF with stats for calculation 
    stats_data = pd.concat([len_df,mean_df,var_df], axis=1)

    return stats_data



In [561]:
women_ger1_stats = stats_needed(women_ger1)
print(women_ger1_stats)

    sample_size  sample_mean  sample_var
0            12     0.416667    0.810606
1            12     0.166667    0.333333
2            12     0.083333    0.083333
3            12     0.166667    0.333333
4            12     1.833333    1.606061
5            12     1.250000    2.022727
6            12     3.250000    4.931818
7            12     6.083333   12.265152
8            12     8.083333    7.719697
9            12     3.166667    5.060606
10           12     2.500000    1.545455
11           12     0.833333    0.696970
12           12     1.250000    1.659091
13           12     0.166667    0.151515
14           12     0.166667    0.333333
15           12     0.250000    0.204545


In [562]:
women_ger1_stats.set_index(women_ger1.index, inplace=True)


## Stats for women_ger2

In [563]:
women_ger2= women_ger2.pivot_table(columns='Month', index='Size (US)', aggfunc='size', fill_value=0)
women_ger2

Month,1,2,3,4,5,6,7,8,9,10,11,12
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4.5,0,0,0,0,0,0,0,0,0,1,0,0
5.0,0,0,0,0,0,0,0,2,0,0,0,0
5.5,0,0,0,0,0,0,0,1,0,2,0,1
6.0,0,1,3,1,2,0,0,0,0,0,0,0
6.5,2,0,2,1,1,2,0,1,2,1,3,0
7.0,0,0,0,4,1,3,1,1,1,3,1,4
7.5,2,1,1,3,2,7,9,8,14,8,6,3
8.0,13,6,5,13,5,3,11,6,6,9,8,3
8.5,8,5,10,4,5,5,9,7,3,7,9,8
9.0,5,2,2,9,3,1,1,7,2,1,4,2


In [564]:
women_ger2_stats = stats_needed(women_ger2)
print(women_ger2_stats)

    sample_size  sample_mean  sample_var
0            12     0.083333    0.083333
1            12     0.166667    0.333333
2            12     0.333333    0.424242
3            12     0.583333    0.992424
4            12     1.250000    0.931818
5            12     1.583333    2.265152
6            12     5.333333   16.060606
7            12     7.333333   12.242424
8            12     6.666667    4.969697
9            12     3.250000    6.568182
10           12     1.916667    3.719697
11           12     0.750000    1.113636
12           12     0.500000    0.636364
13           12     0.333333    0.242424
14           12     0.500000    2.090909
15           12     0.416667    0.446970


In [566]:
women_ger2_stats.set_index(women_ger2.index, inplace=True)
sample_stats = women_ger1_stats.merge(women_ger2_stats, on='Size (US)', suffixes=('_ger1','_ger2'))
sample_stats

Unnamed: 0_level_0,sample_size_ger1,sample_mean_ger1,sample_var_ger1,sample_size_ger2,sample_mean_ger2,sample_var_ger2
Size (US),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4.5,12,0.416667,0.810606,12,0.083333,0.083333
5.0,12,0.166667,0.333333,12,0.166667,0.333333
5.5,12,0.083333,0.083333,12,0.333333,0.424242
6.0,12,0.166667,0.333333,12,0.583333,0.992424
6.5,12,1.833333,1.606061,12,1.25,0.931818
7.0,12,1.25,2.022727,12,1.583333,2.265152
7.5,12,3.25,4.931818,12,5.333333,16.060606
8.0,12,6.083333,12.265152,12,7.333333,12.242424
8.5,12,8.083333,7.719697,12,6.666667,4.969697
9.0,12,3.166667,5.060606,12,3.25,6.568182
