# MODULE 3 - Basis of Statistical Reasoning

## 3.1.1 Probability Example

> P(P) = (# of transactions where P was bought) / n, where n is the total number of transactions

In [1]:
import pandas as pd

In [23]:
# Suppose 'transactions' is a pandas DF, where each row is a transaction
# and there is a col named 'products' where it lists all the products in that
# transaction as a comma-separated string (e.g. 'A,B,P')
transactions = pd.read_csv('example_files/transactions.csv')
print(transactions)


def contains_product(products, product):
    """Return True if `product` is one of the comma-separated names in `products`.

    Parameters
    ----------
    products : str
        Comma-separated product names of one transaction, e.g. 'A,B,P'.
        Any non-string value (such as the NaN pandas uses for an empty
        cell) is treated as a transaction without products.
    product : str
        Product name to look for.

    Returns
    -------
    bool
        Whether `product` equals a whole name in the list. This is stricter
        than `product in products`, which is a substring test on the raw
        string and would also match names such as 'PX'.
    """
    if not isinstance(products, str):
        return False
    return product in [name.strip() for name in products.split(',')]


# Count total # of transactions
total_transactions = len(transactions)
print('Total transactions = ',total_transactions)

# Just out of curiosity to learn how to pick out elements of a DF:
# iterrows() yields (index label, row as a Series) pairs
for i,j in transactions.iterrows():
    if contains_product(j['products'], 'P'):
        print('row: ', i, j)


# Count # of transactions where P was bought
# (summing a boolean Series counts its True values)
product_p_transactions = transactions['products'].apply(
    lambda x: contains_product(x, 'P')).sum()
print('Product P transactions = ', product_p_transactions)

# Calculate probability: P(P) = (# transactions containing P) / n
p_p = product_p_transactions / total_transactions
print('P(P) = ', p_p)

  products
0    A,B,P
1      C,D
2        P
3    A,P,D
4        E
5      B,C
6      P,E
7        A
8      D,P
Total transactions =  9
row:  0 products    A,B,P
Name: 0, dtype: object
row:  2 products    P
Name: 2, dtype: object
row:  3 products    A,P,D
Name: 3, dtype: object
row:  6 products    P,E
Name: 6, dtype: object
row:  8 products    D,P
Name: 8, dtype: object
Product P transactions =  5
P(P) =  0.5555555555555556


## 3.1.2.1 Sample Space

In [24]:
import numpy as np

In [33]:
# Load the e-commerce data; the cell relies on its 'product' and
# 'amount_spent' columns
ecommerce_data = pd.read_csv('example_files/ecommerce.csv')
print(ecommerce_data)

products = ecommerce_data['product']
amount_spent = ecommerce_data['amount_spent']

# Discrete sample space: the distinct products that appear in the data
product_sample_space = products.unique()
print(product_sample_space, '\n')

num_possible_outcomes = len(product_sample_space)
print('Number of possible outcomes = ', num_possible_outcomes, '\n')

# Continuous sample space: 1000 evenly spaced points between the smallest
# and the largest amount spent
amount_spent_sample_space = np.linspace(
    start=amount_spent.min(),
    stop=amount_spent.max(),
    num=1000,
)

# Length of the interval covered by the continuous sample space
lowest_amount = amount_spent_sample_space[0]
highest_amount = amount_spent_sample_space[-1]
length_of_sample_space = highest_amount - lowest_amount
print('Length of sample space = ', length_of_sample_space, '\n')

# Event: the product is A
is_product_A = products == 'A'
event_product_A = ecommerce_data.loc[is_product_A]
print('Event product A\n', event_product_A, '\n')

# Event: more than 100 was spent
spent_over_100 = amount_spent > 100
event_spent_over_100 = ecommerce_data.loc[spent_over_100]
print('Event spent over 100\n', event_spent_over_100)




  product  amount_spent
0       A           120
1       B            80
2       C           200
3       A            90
4       D           150
5       B            60
6       E           300
7       A           100
8       C            50
9       D           130
['A' 'B' 'C' 'D' 'E'] 

Number of possible outcomes =  5 

Length of sample space =  250.0 

Event product A
   product  amount_spent
0       A           120
3       A            90
7       A           100 

Event spent over 100
   product  amount_spent
0       A           120
2       C           200
4       D           150
6       E           300
9       D           130


## 3.2.1.3 Independent Events

$ P(A \cap B) = P(A) * P(B) $ if A and B are independent events

In [43]:
# Load the e-commerce data (columns 'product' and 'amount_spent')
transaction_data = pd.read_csv('example_files/ecommerce.csv')
print(transaction_data,'\n')

# Only the product matters for this example
transaction_data = transaction_data.drop(columns='amount_spent')
print('Dropped amount_spent col\n', transaction_data,'\n')

# Use the row index as a transaction id so the two events can be joined on it
transaction_data['id'] = transaction_data.index
print('Added id col\n', transaction_data,'\n')

# Event A: the transaction's product is A
event_A = transaction_data[transaction_data['product'] == 'A']
print('Event A\n', event_A, '\n')

# Event B: the transaction's product is B
event_B = transaction_data[transaction_data['product'] == 'B']
print('Event B\n', event_B, '\n')

p_A = len(event_A)/len(transaction_data)
print('P(A) = ', p_A)

p_B = len(event_B)/len(transaction_data)
print('P(B) = ', p_B)

# Calculate P(A and B) using inner join on id: only transactions that are in
# both events survive the merge. Both events filter the same 'product'
# column for different values, so no row can belong to both.
p_A_and_B = len(pd.merge(event_A, event_B, on='id'))/len(transaction_data)
print('P(A and B) = ', p_A_and_B)

# Independence means P(A and B) == P(A) * P(B). The probabilities are floats,
# so compare with a tolerance instead of exact equality.
if np.isclose(p_A_and_B, p_A * p_B):
    print('P(A) and P(B) are independent')
else:
    print('P(A) and P(B) are not independent')


  product  amount_spent
0       A           120
1       B            80
2       C           200
3       A            90
4       D           150
5       B            60
6       E           300
7       A           100
8       C            50
9       D           130 

Dropped amount_spent col
   product
0       A
1       B
2       C
3       A
4       D
5       B
6       E
7       A
8       C
9       D 

Added id col
   product  id
0       A   0
1       B   1
2       C   2
3       A   3
4       D   4
5       B   5
6       E   6
7       A   7
8       C   8
9       D   9 

Event A
   product  id
0       A   0
3       A   3
7       A   7 

Event B
   product  id
1       B   1
5       B   5 

P(A) =  0.3
P(B) =  0.2
P(A and B) =  0.0
P(A) and P(B) are not independent


## 3.3.2 Construction of a Probability Space


In [55]:
# Load the transactions; the cell relies on the 'itemset' and 'total' columns
transaction_data = pd.read_csv('example_files/transactions_probability_space.csv')
print(transaction_data,'\n')

itemsets = transaction_data['itemset']
n_transactions = len(transaction_data)

# Sample space: every distinct itemset that appears in the data
sample_space = itemsets.unique()
print(sample_space, '\n')

# Event A: transactions whose total is greater than 100
total_over_100 = transaction_data['total'] > 100
event_A = transaction_data.loc[total_over_100]
print('Event A\n', event_A, '\n')

# Event B: transactions whose itemset contains 'Fruit-Vegetable'
has_fruit_vegetable = itemsets.str.contains('Fruit-Vegetable')
event_B = transaction_data.loc[has_fruit_vegetable]
print('Event B\n', event_B, '\n')

# Probability of an event = (# transactions in the event) / (# transactions)
p_A = len(event_A) / n_transactions
print('P(A) = ', p_A)

p_B = len(event_B) / n_transactions
print('P(B) = ', p_B)

           itemset  total
0       Milk-Bread     45
1            Fruit    120
2        Vegetable    110
3            Fruit     80
4        Meat-Eggs    130
5       Bread-Milk     95
6  Fruit-Vegetable    150
7        Milk-Eggs     40
8           Cereal     50
9  Fruit-Vegetable     90 

['Milk-Bread' 'Fruit' 'Vegetable' 'Meat-Eggs' 'Bread-Milk'
 'Fruit-Vegetable' 'Milk-Eggs' 'Cereal'] 

Event A
            itemset  total
1            Fruit    120
2        Vegetable    110
4        Meat-Eggs    130
6  Fruit-Vegetable    150 

Event B
            itemset  total
6  Fruit-Vegetable    150
9  Fruit-Vegetable     90 

P(A) =  0.4
P(B) =  0.2


## 3.4.1 Kolmogorov Axiom


In [56]:
# Sample space of a fair 6-sided die: the faces 1 through 6
sample_space = np.arange(1, 7)
print(sample_space, '\n')

# Every face is equally likely, so each outcome has probability 1/6
probabilities = np.full(sample_space.size, 1 / sample_space.size)
print(probabilities, '\n')

# Axiom 1 - Non-negativity: no outcome has a negative probability
assert (probabilities >= 0).all()

# Axiom 2 - Normalization: the probabilities of all outcomes add up to 1
assert np.isclose(probabilities.sum(), 1)

is_even = sample_space % 2 == 0

# Event A - Obtaining an even number
event_A = sample_space[is_even]
print('Event A\n', event_A, '\n')

# Event B - Obtaining an odd number
event_B = sample_space[~is_even]
print('Event B\n', event_B, '\n')

# Axiom 3 - Countable additivity: A and B are disjoint and together cover
# the whole sample space, so P(A) + P(B) must equal 1.
# Face k is stored at position k - 1 of `probabilities`.
p_even = probabilities[event_A - 1].sum()
p_odd = probabilities[event_B - 1].sum()
assert np.isclose(p_even + p_odd, 1)





[1 2 3 4 5 6] 

[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.16666667] 

Event A
 [2 4 6] 

Event B
 [1 3 5] 



## 3.4.2 Properties and Theorems

### Complement Property
$ P(A^c) = 1 - P(A) $

### Total Probability Theorem
$ P(B) = \sum_{i=1}^n [P(B|A_i) * P(A_i)] $
<br>
$ P(A) = P(A|B)*P(B) + P(A|\sim B)*P(\sim B) $

### Conditional Probability Theorem
$P(A|B) = \frac{P(A \cap B)}{P(B)}$ if P(B) > 0

In [66]:
# Fixed seed so that Restart & Run All reproduces exactly the same numbers
SALES_SEED = 42
sales_rng = np.random.default_rng(SALES_SEED)

# Generate random sales data: 10000 draws, 1 with probability 0.3 and
# 0 with probability 0.7
sales_data = sales_rng.choice([0,1], size=10000, p=[0.7, 0.3])

# Event A as a boolean array: True where the draw is 1, False where it is 0
event_A = (sales_data == 1)
print('Event A\n', event_A, '\n')

# Event B: a second, separately drawn 50/50 boolean array
event_B = sales_rng.choice([0,1], size=10000, p=[0.5,0.5])
event_B = event_B.astype(bool)


# Compute probabilities of events A and B, we use .mean in this case
# because the mean of a boolean array is the fraction of True values
P_A = np.mean(event_A)
print('P(A) = ', P_A)
print('P(~A) = ', np.mean(~event_A))

P_B = np.mean(event_B)
print('P(B) = ', P_B)
print('P(~B) = ', np.mean(~event_B))

# Conditioning on B (or ~B) is only defined when that event actually occurs
assert 0 < P_B < 1, 'P(A|B) and P(A|~B) need 0 < P(B) < 1'

# P(A|B): fraction of True values of A among the positions where B is True
P_A_given_B = np.mean(event_A[event_B])
print('P(A|B) = ', P_A_given_B)

# P(A|~B): fraction of True values of A among the positions where B is False
P_A_given_notB = np.mean(event_A[~event_B])
print('P(A|~B) = ', P_A_given_notB)

# Compute the probability of intersection
# `&` is an element-wise AND: position i is True only when both
# event_A[i] and event_B[i] are True
P_A_and_B = np.mean(event_A & event_B)
print('P(A and B) = ', P_A_and_B)

# Assert complementarity property: P(~X) = 1 - P(X)
assert np.isclose(1 - P_A, np.mean(~event_A))
assert np.isclose(1 - P_B, np.mean(~event_B))
print('Complementarity property respected')

# Assert Total Probability Theorem: P(A) = P(A|B)*P(B) + P(A|~B)*P(~B)
assert np.isclose( P_A, P_A_given_B*P_B + P_A_given_notB*(1-P_B) )
print('Law of Total Probability Successfully asserted!')

# Assert Conditional Probability Theorem: P(A|B) = P(A and B) / P(B)
assert np.isclose(P_A_and_B/P_B , P_A_given_B)
print('Conditional Probability Theorem y Successfully asserted!')


Event A
 [ True False False ...  True  True  True] 

P(A) =  0.2934
P(~A) =  0.7066
P(B) =  0.4982
P(~B) =  0.5018
P(A|B) =  0.2958651144118828
P(A|~B) =  0.2909525707453169
P(A and B) =  0.1474
Complementarity property respected
Law of Total Probability Successfully asserted!
Conditional Probability Theorem y Successfully asserted!


## 3.5.1 Product Rule and Bayes Theorem

### Product Rule
Determines the joint probability of 2 events<br>
$ P(A , B) = P(A \cap B) = P(A)*P(B|A) = P(B)*P(A|B) $

### Bayes Theorem
$ P(A|B) = P(B|A)*\frac{P(A)}{P(B)} $ if P(B) > 0