#  <font color='blue'> Review of probability theory using python </font> 
 
 
Install the tabulate package (for example, on macOS): `pip install tabulate`

We will use pandas to read a CSV file and to store data

Documentation of pandas https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

In [2]:
import pandas as pd
import numpy as np


## Download student-mat.csv from ICON

The csv file is downloaded from Kaggle 
https://www.kaggle.com/uciml/student-alcohol-consumption/data?select=student-mat.csv


In [3]:

# Read the student dataset from a CSV file located next to the notebook,
# then preview its first three rows.
STUDENT_CSV = 'student-mat.csv'
df = pd.read_csv(STUDENT_CSV)
df.head(n=3)


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10


## Create a smaller data frame with only two columns

Grade A: G3 ≥ 80% (the code rescales G3 to a percentage as G3 × 5 and tests `>= 80`)

Absences: high absences, if a student missed 10 or more classes.


In [18]:
# Binary indicator frame derived from df (the full data frame):
#   Grade    -> 1 when G3 * 5 is 80 or more, otherwise 0
#   Absences -> 1 when the student has 10 or more absences, otherwise 0
#   count    -> constant 1 per student, used as the value column when tabulating
is_grade_a = df['G3'] * 5 >= 80
is_high_absence = df['absences'] >= 10

newlist = pd.DataFrame({
    'Grade': np.where(is_grade_a, 1, 0),
    'Absences': np.where(is_high_absence, 1, 0),
    'count': 1,
})
newlist.head(10)

Unnamed: 0,Grade,Absences,count
0,0,0,1
1,0,0,1
2,0,1,1
3,0,0,1
4,0,0,1
5,0,1,1
6,0,0,1
7,0,0,1
8,1,0,1
9,0,0,1


## Compute joint probabilities


In [21]:
# Joint histogram of (Grade, Absences): rows are Grade (0/1), columns are Absences (0/1).
# pd.crosstab counts the rows of newlist directly, so neither the helper 'count'
# column nor a NumPy aggregation callable is needed.
# Reindexing pins the table to the full 2x2 layout even when a category never occurs
# in the data: the probability arrays are indexed by position (e.g. [1, 1]), so a
# dropped row or column would otherwise break or silently shift those lookups.
binary_levels = [0, 1]
joint_counts = pd.crosstab(newlist['Grade'], newlist['Absences']).reindex(
    index=binary_levels,
    columns=binary_levels,
    fill_value=0,
)

print("Joint Histogram")
print("-------------------")
print(joint_counts)
print("-------------------")

# Joint_table keeps its established meaning: a plain ndarray of counts.
Joint_table = joint_counts.to_numpy()
# Normalise by the number of students so the four cells sum to 1.
Joint_Probabilities = Joint_table / len(newlist)

print("Joint Probabilities")
print("-------------------")
print(Joint_Probabilities)
print("-------------------")


Joint Histogram
-------------------
Absences    0   1
Grade            
0         277  78
1          35   5
-------------------
Joint Probabilities
-------------------
[[0.70126582 0.19746835]
 [0.08860759 0.01265823]]
-------------------


## Compute marginal, and conditional probabilities

Marginal probability
$$P(x_i) = \sum_j p(x_i,y_j)$$

Conditional probability P(X|Y=y): the probability of X = x when Y is fixed to the specific value y
$$P(x|y) = \frac{p(x,y)}{p(y)}$$


In [6]:
# Marginal P(A): collapse the Grade axis (rows, axis 0) of the joint distribution,
# leaving one probability per Absences category.
PA = Joint_Probabilities.sum(axis=0)

print("Marginal probability of Absences", "-------------------", sep="\n")
print('P(A)', PA)
print("-------------------\n")


# Conditional P(G|A): divide every column of the joint table by that column's P(A).
# PA is lifted to a single-row 2-D array so it broadcasts down the rows.
PA_as_row = PA[np.newaxis, :]
PGgivenA = Joint_Probabilities / PA_as_row

print("Conditional probability of Grades given Absences P(G|A)", "---------------------------------", sep="\n")
print(PGgivenA)
print("---------------------------------")


Marginal probability of Absences
-------------------
P(A) [0.78987342 0.21012658]
-------------------

Conditional probability of Grades given Absences P(G|A)
---------------------------------
[[0.88782051 0.93975904]
 [0.11217949 0.06024096]]
---------------------------------


## <font color=red> To do: Evaluate the probability of getting a grade A if the student has been absent ten or more times, P(Grade=A|Absences>=10)</font>

1. Compute `P(Grade=A|Absences>=10)`

2. Compute `P(Grade=A|Absences<10)`

3. Add the two. What do you expect to get, i.e. what is `P(Grade=A|Absences>=10) + P(Grade=A|Absences<10)`?

4. What do you get if you add `P(Grade=A|Absences>=10)` and `P(Grade<A|Absences>=10)`?

In [28]:
# YOUR CODE HERE
# PGgivenA holds P(G|A) with rows = Grade (row 1 is grade A) and
# columns = Absences (column 0 is fewer than 10 absences, column 1 is 10 or more).
grade_a_row = PGgivenA[1]
high_absence_col = PGgivenA[:, 1]

# Parts 1 and 2: probability of a grade A under each absence condition.
P_GradeA_given_Absences_ge10 = grade_a_row[1]
print(f"P(Grade=A | Absences>=10): {P_GradeA_given_Absences_ge10}")

P_GradeA_given_Absences_lt10 = grade_a_row[0]
print(f"P(Grade=A | Absences<10): {P_GradeA_given_Absences_lt10}")

print(" ================================================== PART 3 ================================================== ")
# The two terms are conditioned on different events, so their sum is not
# constrained to equal 1.
sum_prob = P_GradeA_given_Absences_ge10 + P_GradeA_given_Absences_lt10
print(f"P(Grade=A | Absences>=10) + P(Grade=A | Absences<10): {sum_prob}")

print(" ================================================== PART 4 ================================================== ")
# Both grade outcomes under the same condition (Absences>=10) exhaust all cases,
# so this sum is 1 up to floating-point rounding.
P_GradeA_and_notA_given_Absences_ge10 = high_absence_col[1] + high_absence_col[0]
print(f"P(Grade=A | Absences>=10) + P(Grade<A | Absences>=10): {P_GradeA_and_notA_given_Absences_ge10}")




P(Grade=A | Absences>=10): 0.060240963855421686
P(Grade=A | Absences<10): 0.11217948717948718
P(Grade=A | Absences>=10) + P(Grade=A | Absences<10): 0.17242045103490888
P(Grade=A | Absences>=10) + P(Grade<A | Absences>=10): 0.9999999999999999


## <font color=red> To do: compute the marginal probability P(G) and the conditional probability P(A|G) </font>


In [29]:
# YOUR CODE HERE
# Marginal P(G): collapse the Absences axis (columns, axis 1) of the joint
# distribution, leaving one probability per Grade category.
P_G = Joint_Probabilities.sum(axis=1)

print("Marginal Probability of Grades", "-------------------", sep="\n")
print('P(G)', P_G)
print("-------------------\n")

# Conditional P(A|G): divide every row of the joint table by that row's P(G).
# P_G is lifted to a single-column 2-D array so it broadcasts across the columns.
P_G_as_column = P_G[:, np.newaxis]
P_A_given_G = Joint_Probabilities / P_G_as_column

print("Conditional Probability of Absences given Grades P(A|G)", "---------------------------------", sep="\n")
print(P_A_given_G)
print("---------------------------------")



Marginal Probability of Grades
-------------------
P(G) [0.89873418 0.10126582]
-------------------

Conditional Probability of Absences given Grades P(A|G)
---------------------------------
[[0.78028169 0.21971831]
 [0.875      0.125     ]]
---------------------------------
