# MODULE 3 - Basis of Statistical Reasoning

## 3.1.1 Probability Example

> $P(P) = \dfrac{\text{number of transactions where P was bought}}{n}$, where $n$ is the total number of transactions.

In [1]:
import pandas as pd

In [23]:
# 'transactions' is a pandas DF where each row is a transaction and the
# 'products' column lists that transaction's products as a comma-separated
# string (e.g. 'A,B,P').
transactions = pd.read_csv('example_files/transactions.csv')
print(transactions)

# Count total # of transactions
total_transactions = len(transactions)
print('Total transactions = ',total_transactions)


def has_product(products, target):
    """Return True if `target` is one of the comma-separated items in `products`.

    Parameters
    ----------
    products : object
        One cell of the 'products' column, normally a string such as 'A,B,P'.
        Any non-string value (e.g. the float NaN pandas uses for a missing
        cell) is treated as "no products".
    target : str
        Product name to look for. It must equal a whole item, so 'P' does
        not match an item such as 'PEN'.

    Returns
    -------
    bool
    """
    if not isinstance(products, str):
        return False
    # Compare whole items rather than substrings of the joined string.
    return target in [item.strip() for item in products.split(',')]


# Boolean mask: True for every transaction in which P was bought.
# astype(bool) keeps the mask usable for indexing even when the frame is empty.
bought_p = transactions['products'].apply(has_product, args=('P',)).astype(bool)

# Just out of curiosity to learn how to pick out elements of a DF:
# iterrows() yields (index label, row as a Series) pairs.
for idx, row in transactions[bought_p].iterrows():
    print('row: ', idx, row)

# Count # of transactions where P was bought (True counts as 1)
product_p_transactions = bought_p.sum()
print('Product P transactions = ', product_p_transactions)

# Calculate probability; undefined (NaN) when there are no transactions at all
p_p = product_p_transactions / total_transactions if total_transactions else float('nan')
print('P(P) = ', p_p)

  products
0    A,B,P
1      C,D
2        P
3    A,P,D
4        E
5      B,C
6      P,E
7        A
8      D,P
Total transactions =  9
row:  0 products    A,B,P
Name: 0, dtype: object
row:  2 products    P
Name: 2, dtype: object
row:  3 products    A,P,D
Name: 3, dtype: object
row:  6 products    P,E
Name: 6, dtype: object
row:  8 products    D,P
Name: 8, dtype: object
Product P transactions =  5
P(P) =  0.5555555555555556


## 3.1.2.1 Sample Space

In [24]:
import numpy as np

In [31]:
# Load the e-commerce purchases; the columns used below are 'product' and
# 'amount_spent'.
ecommerce_data = pd.read_csv('example_files/ecommerce.csv')
print(ecommerce_data)

# Discrete sample space: the distinct products that occur in the data.
products_column = ecommerce_data['product']
product_sample_space = products_column.unique()
print(product_sample_space)

num_possible_outcomes = len(product_sample_space)
print('Number of possible outcomes = ', num_possible_outcomes)

# Continuous sample space: 1000 evenly spaced points covering the interval
# from the smallest to the largest observed amount spent.
spend_column = ecommerce_data['amount_spent']
lowest_spend = spend_column.min()
highest_spend = spend_column.max()
amount_spent_sample_space = np.linspace(lowest_spend, highest_spend, 1000)

# Length of that interval: last grid point minus first grid point.
first_point = amount_spent_sample_space[0]
last_point = amount_spent_sample_space[-1]
length_of_sample_space = last_point - first_point
print('Length of sample space = ', length_of_sample_space)

# Event "product A was bought": the rows whose product is exactly 'A'.
is_product_a = products_column == 'A'
event_product_A = ecommerce_data.loc[is_product_a]
print('Event product A = ', event_product_A)

# Event "more than 100 was spent": the rows with amount_spent strictly above 100.
spent_over_100 = spend_column > 100
event_spent_over_100 = ecommerce_data.loc[spent_over_100]
print('Event spent over 100 = ', event_spent_over_100)




  product  amount_spent
0       A           120
1       B            80
2       C           200
3       A            90
4       D           150
5       B            60
6       E           300
7       A           100
8       C            50
9       D           130
['A' 'B' 'C' 'D' 'E']
Number of possible outcomes =  5
Length of sample space =  250.0
Event product A =    product  amount_spent
0       A           120
3       A            90
7       A           100
Event spent over 100 =    product  amount_spent
0       A           120
2       C           200
4       D           150
6       E           300
9       D           130
