In [488]:
import pandas as pd
import numpy as np

### CASE STUDY 1: Customer Purchases

> Concept: Basic Probability, Conditional Probability, Independence

> Scenario: You work for an e-commerce company. You have data on customers' gender, whether they saw an ad, and whether they made a purchase.

> **Practice Questions:**

**1. Easy:** Find the probability a customer made a purchase.

**2. Intermediate:** Find $P(\text{Purchased} \mid \text{Saw Ad})$.

**3. Hard:** Check if 'seeing the ad' and 'purchasing' are independent events.

**4. Advanced:** What is the probability that a randomly selected customer is female and made a purchase, given that she saw the ad?  
*Express as* $P(\text{Female} \cap \text{Purchased} \mid \text{Saw Ad})$.

**5. Advanced:** Among customers who did **not** see the ad, what is the probability that they made a purchase?  
*Express as* $P(\text{Purchased} \mid \text{Did Not See Ad})$.

**6. Challenge:** What is the odds ratio of purchasing for customers who saw the ad versus those who did not see the ad?  
*Hint: Odds ratio = (P(Purchased|Saw Ad) / (1 - P(Purchased|Saw Ad))) $\div$ (P(Purchased|Did Not See Ad) / (1 - P(Purchased|Did Not See Ad)))*


In [489]:
# Load the "Customer_Purchases" sheet of the purchases workbook into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# on a machine where that exact file exists.
customer_purchase_data = pd.read_excel(
    "C:/Users/leste/OneDrive/Desktop/Microsoft VS Code/Github/Data-Analytics-Projects-/Data Files/Female_&_Male_Purchases.xlsx",
    sheet_name="Customer_Purchases",
)

In [490]:
# Display the loaded customer purchase table.
customer_purchase_data

Unnamed: 0,Customer_ID,Gender,Saw_Ad,Purchased
0,1,Male,Yes,Yes
1,2,Female,No,No
2,3,Male,Yes,No
3,4,Male,Yes,No
4,5,Male,No,No
...,...,...,...,...
195,196,Female,Yes,No
196,197,Female,No,No
197,198,Female,Yes,Yes
198,199,Male,No,No


> Solution 1

In [491]:
# Tally how many customers fall under each value of the Purchased column.
purchased_column = customer_purchase_data['Purchased']
purchased_column.value_counts()

No     141
Yes     59
Name: Purchased, dtype: int64

In [492]:
# Number of customers whose Purchased flag is "Yes".
made_purchase = customer_purchase_data['Purchased'] == "Yes"
Customer_purchased = customer_purchase_data.loc[made_purchase, 'Purchased'].count()
Customer_purchased

59

In [493]:
# Sample size: number of non-missing customer IDs.
customer_ids = customer_purchase_data['Customer_ID']
Total_customer_count = customer_ids.count()
Total_customer_count

200

In [494]:
# P(Purchased): customers who purchased divided by all customers.
Probability_of_purchase = Customer_purchased / Total_customer_count
Probability_of_purchase

0.295

> Solution 2

In [495]:
# Number of customers who both saw the ad and made a purchase.
saw_ad_and_bought = (customer_purchase_data['Saw_Ad'] == "Yes") & (customer_purchase_data['Purchased'] == "Yes")
Customer_saw_ad_purchase = customer_purchase_data.loc[saw_ad_and_bought, 'Purchased'].count()
Customer_saw_ad_purchase

37

In [496]:
# Number of customers who saw the ad (the conditioning group for Solution 2).
saw_ad = customer_purchase_data['Saw_Ad'] == "Yes"
Customer_saw_ad = customer_purchase_data.loc[saw_ad, 'Saw_Ad'].count()
Customer_saw_ad

114

In [497]:
# P(Purchased | Saw Ad): purchasers among the customers who saw the ad.
# Despite the variable name, this is the conditional probability, not the joint one.
Probability_of_saw_ad_purchase = Customer_saw_ad_purchase / Customer_saw_ad
Probability_of_saw_ad_purchase

0.32456140350877194

> Solution 3 

1. Checking the product of the probabilities

2. Checking the probability of the events happening together

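> If (1) equals (2), i.e. $P(\text{Saw Ad}) \cdot P(\text{Purchased}) = P(\text{Saw Ad} \cap \text{Purchased})$, then the events are independent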

In [498]:
# P(Saw Ad): customers who saw the ad divided by all customers.
Probability_of_saw_ad = Customer_saw_ad / Total_customer_count
Probability_of_saw_ad

0.57

In [499]:
# Redisplay P(Purchased) for comparison with the conditional probability.
Probability_of_purchase

0.295

In [500]:
# Redisplay P(Purchased | Saw Ad) for comparison with P(Purchased).
Probability_of_saw_ad_purchase

0.32456140350877194

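> The two events are not independent.

> $P(\text{Purchased}) = 0.295$ is not equal to $P(\text{Purchased} \mid \text{Saw Ad}) \approx 0.325$; equivalently, $P(\text{Saw Ad}) \cdot P(\text{Purchased}) = 0.57 \times 0.295 \approx 0.168$ differs from $P(\text{Saw Ad} \cap \text{Purchased}) = 37/200 = 0.185$,
which means the ad has likely influenced the purchase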

"""
ODDS RATIO — CONCEPTUAL LOGIC

The odds ratio (OR) measures how strongly an exposure or condition 
is associated with an outcome.

It compares the odds of the outcome occurring in one group 
to the odds of it occurring in another group.

------------------------------------------------------------
General 2x2 setup:

                Outcome = Yes     Outcome = No
Group 1 (Exposed)        a               b
Group 2 (Unexposed)      c               d

------------------------------------------------------------
Step 1: Compute odds for each group

    Odds(Group 1) = a / b
    Odds(Group 2) = c / d

Step 2: Compute the odds ratio

    OR = (a/b) / (c/d)
       = (a * d) / (b * c)

------------------------------------------------------------
Interpretation:

- OR = 1  → No association between exposure and outcome
- OR > 1  → Exposure increases the odds of the outcome
- OR < 1  → Exposure decreases the odds of the outcome

------------------------------------------------------------
Key Logic:

1. The "odds" quantify how likely an event is to happen versus not happen
   within each group.

2. The "odds ratio" compares these odds across two groups to determine 
   the strength and direction of association.

3. One condition (the outcome) is held constant, while comparing 
   how group membership affects the odds of that outcome.
"""


> Solution 4 

In [501]:
# Number of female customers who saw the ad and made a purchase.
female_buyer_saw_ad = (
    (customer_purchase_data['Purchased'] == "Yes")
    & (customer_purchase_data['Gender'] == "Female")
    & (customer_purchase_data['Saw_Ad'] == "Yes")
)
Female_and_purchased_also_ad = customer_purchase_data.loc[female_buyer_saw_ad, 'Purchased'].count()
Female_and_purchased_also_ad

17

In [502]:
# Number of customers who did not see the ad.
did_not_see_ad = customer_purchase_data['Saw_Ad'] == "No"
People_didnt_see_ad = customer_purchase_data.loc[did_not_see_ad, 'Saw_Ad'].count()
People_didnt_see_ad

86

In [503]:
# P(Female and Purchased | Saw Ad): female purchasers who saw the ad divided by
# everyone who saw the ad.
# The denominator is the direct count of ad viewers (Customer_saw_ad) rather than
# "total customers minus non-viewers": the subtraction only equals the viewer
# count when every row's Saw_Ad is exactly "Yes" or "No".
Probability_female_purchase_given_ad = Female_and_purchased_also_ad / Customer_saw_ad
Probability_female_purchase_given_ad

0.14912280701754385

> Solution 5

In [504]:
# Number of customers who purchased without having seen the ad.
bought_without_ad = (customer_purchase_data['Purchased'] == "Yes") & (customer_purchase_data['Saw_Ad'] == "No")
Purchased_and_didnt_see_ad = customer_purchase_data.loc[bought_without_ad, 'Purchased'].count()
Purchased_and_didnt_see_ad

22

In [505]:
# Number of customers who saw the ad.
viewed_ad = customer_purchase_data['Saw_Ad'] == "Yes"
People_saw_ad = customer_purchase_data.loc[viewed_ad, 'Saw_Ad'].count()
People_saw_ad

114

In [506]:
# P(Purchased | Did Not See Ad): purchasers among the customers who did not see the ad.
# The conditioning group is counted directly (People_didnt_see_ad) rather than as
# "total customers minus ad viewers": the subtraction only equals the non-viewer
# count when every row's Saw_Ad is exactly "Yes" or "No".
Probability_purchase_given_no_ad = Purchased_and_didnt_see_ad / People_didnt_see_ad
Probability_purchase_given_no_ad

0.2558139534883721

### 🎾 Play Tennis — Decision-Based Questions

> **Q1**

If the feature of **Today** =  
`(Outlook: Sunny, Temp: Hot, Humidity: Normal, Windy: False)`  
→ Will the player play tennis or not?

---

> **Q2**

If the feature of **Today** =  
`(Outlook: Rainy, Temp: Mild, Humidity: High, Windy: True)`  
→ Will the player play tennis or not?



In [507]:
# Load the play-tennis workbook, supplying the five column names explicitly.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# on a machine where that exact file exists.
table_tennis_data = pd.read_excel(
    "C:/Users/leste/OneDrive/Desktop/Microsoft VS Code/Github/Data-Analytics-Projects-/Data Files/play_tennis_bayes_with_explanation.xlsx",
    names=['Outlook', 'Temp', 'Humidity', 'Windy', 'PlayTennis'],
)

In [508]:
# Display the loaded play-tennis table.
table_tennis_data

Unnamed: 0,Outlook,Temp,Humidity,Windy,PlayTennis
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


> Q1: Naive Bayes estimate of the probability that the player plays tennis, given today's conditions


In [509]:
# Contingency table of Outlook (rows) against PlayTennis (columns), with "All" totals.
outlook = pd.crosstab(
    table_tennis_data['Outlook'],
    table_tennis_data['PlayTennis'],
    margins=True,
)
outlook

PlayTennis,No,Yes,All
Outlook,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Overcast,0,15,15
Rainy,8,10,18
Sunny,10,7,17
All,18,32,50


In [510]:
# P(Outlook = Sunny | PlayTennis = Yes): Sunny share of the "Yes" column.
sunny_yes_count = outlook.at['Sunny', 'Yes']
all_yes_count = outlook.at['All', 'Yes']
sunny_outlook = sunny_yes_count / all_yes_count
sunny_outlook


0.21875

In [511]:
# P(Outlook = Sunny): Sunny share of all rows.
sunny_row_total = outlook.at['Sunny', 'All']
grand_total = outlook.at['All', 'All']
total_sunny = sunny_row_total / grand_total
total_sunny

0.34

In [512]:
# Contingency table of Windy (rows) against PlayTennis (columns), with "All" totals.
windy = pd.crosstab(
    table_tennis_data['Windy'],
    table_tennis_data['PlayTennis'],
    margins=True,
)
windy

PlayTennis,No,Yes,All
Windy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,8,18,26
True,10,14,24
All,18,32,50


In [513]:
# P(Windy = False | PlayTennis = Yes), read positionally from the windy table:
# in the table displayed for `windy`, row 0 is False, row 2 is "All", column 1 is "Yes".
not_windy_yes_count = windy.iat[0, 1]
all_yes_count = windy.iat[2, 1]
windy_false = not_windy_yes_count / all_yes_count
windy_false

0.5625

In [514]:
# P(Windy = False), read positionally from the windy table:
# in the table displayed for `windy`, row 0 is False, row 2 is "All", column 2 is "All".
not_windy_row_total = windy.iat[0, 2]
grand_total = windy.iat[2, 2]
total_windy_false = not_windy_row_total / grand_total
total_windy_false

0.52

In [515]:
# Contingency table of Temp (rows) against PlayTennis (columns), with "All" totals.
temp = pd.crosstab(
    table_tennis_data['Temp'],
    table_tennis_data['PlayTennis'],
    margins=True,
)
temp

PlayTennis,No,Yes,All
Temp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cool,6,9,15
Hot,7,10,17
Mild,5,13,18
All,18,32,50


In [516]:
# P(Temp = Hot | PlayTennis = Yes): Hot share of the "Yes" column.
hot_yes_count = temp.at['Hot', 'Yes']
all_yes_count = temp.at['All', 'Yes']
hot_temp = hot_yes_count / all_yes_count
hot_temp

0.3125

In [517]:
# P(Temp = Hot): Hot share of all rows.
hot_row_total = temp.at['Hot', 'All']
grand_total = temp.at['All', 'All']
total_hot_condition = hot_row_total / grand_total
total_hot_condition

0.34

In [518]:
# Contingency table of Humidity (rows) against PlayTennis (columns), with "All" totals.
humidity = pd.crosstab(
    index=table_tennis_data['Humidity'],
    columns=table_tennis_data['PlayTennis'],
    margins=True,
)
humidity

PlayTennis,No,Yes,All
Humidity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,16,6,22
Normal,2,26,28
All,18,32,50


In [519]:
# P(Humidity = Normal): Normal share of all rows.
normal_row_total = humidity.at['Normal', 'All']
grand_total = humidity.at['All', 'All']
total_normal_humidity = normal_row_total / grand_total
total_normal_humidity

0.56

In [520]:
# P(Humidity = Normal | PlayTennis = Yes): Normal share of the "Yes" column.
normal_yes_count = humidity.at['Normal', 'Yes']
all_yes_count = humidity.at['All', 'Yes']
normal_humidity = normal_yes_count / all_yes_count
normal_humidity

0.8125

In [521]:
# P(PlayTennis = Yes): rows with PlayTennis == "Yes" divided by the non-null
# count of the table's first column.
# `.count()` returns a Series labelled by column name, so the first entry is taken
# with `.iloc[0]`; an integer key in `[...]` on a label-indexed Series relies on
# a positional fallback that pandas has deprecated.
yes_row_count = table_tennis_data[['PlayTennis']].query('PlayTennis == "Yes"').count().iloc[0]
all_row_count = table_tennis_data.count().iloc[0]
table_tennis_yes = yes_row_count / all_row_count
table_tennis_yes

0.64

In [522]:
# Naive Bayes numerator for "Yes": the product of the per-feature conditional
# probabilities given Yes (Sunny, not windy, normal humidity, hot) and the prior P(Yes).
probability_yes_provided_condition = sunny_outlook * windy_false * normal_humidity * table_tennis_yes * hot_temp
probability_yes_provided_condition

0.019995117187499998

In [523]:
# Denominator: product of the marginal probabilities of today's feature values
# (Sunny, Hot, Normal humidity, not windy).
probability_total_condition = total_sunny * total_hot_condition * total_normal_humidity * total_windy_false
probability_total_condition

0.03366272000000001

In [524]:
# Estimated P(Yes | today's features): the "Yes" numerator divided by the product
# of the marginal feature probabilities.
# NOTE(review): dividing by the product of marginals treats the features as
# independent overall; the usual naive Bayes normalizer is score(Yes) + score(No),
# and the "No" score is not computed in this notebook. Confirm this approximation is intended.
probability_yes_provided_condition / probability_total_condition

0.5939840032980102