# Estimating distributions (part 1)
The goal of this notebook is to explore a first approach to approximate $p(y|x)$ and $p(x|y)$ on a tabular dataset where $x$ is discrete-valued, $x\in\mathbb{D}^k$, and the target $y$ is boolean, $y\in\{0,1\}$.

## Imports

In [100]:
import numpy as np
import pandas as pd

## Load data set

In [101]:
df = pd.read_csv('sample_data/tennis.csv', delimiter=',', header=0)
df

Unnamed: 0,Day,Outlook,Temp,Humidity,Wind,Tennis
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


In [102]:
df = df.drop('Day', axis=1)
df

Unnamed: 0,Outlook,Temp,Humidity,Wind,Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [34]:
X_names = df.columns.to_list()[:-1]
X_names

['Outlook', 'Temp', 'Humidity', 'Wind']

In [35]:
X = df.iloc[:,0:-1]
X

Unnamed: 0,Outlook,Temp,Humidity,Wind
0,Sunny,Hot,High,Weak
1,Sunny,Hot,High,Strong
2,Overcast,Hot,High,Weak
3,Rain,Mild,High,Weak
4,Rain,Cool,Normal,Weak
5,Rain,Cool,Normal,Strong
6,Overcast,Cool,Normal,Strong
7,Sunny,Mild,High,Weak
8,Sunny,Cool,Normal,Weak
9,Rain,Mild,Normal,Weak


In [36]:
Y_name = df.columns.to_list()[-1]
Y_name

'Tennis'

In [37]:
Y = df.iloc[:,-1]
Y

0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: Tennis, dtype: object

## Build the table of observations
Take $x$ to be the random variable Outlook and count the observations based on the data set.

In [38]:
N = X['Outlook'].size
xvalues = np.unique(X['Outlook'].values).tolist()
yvalues = np.unique(Y.values).tolist()
dimx = len(xvalues)
dimy = len(yvalues)


In [39]:
obs = pd.DataFrame(0, columns=yvalues, index=xvalues)
for i in range(N):
    xi = X['Outlook'][i]
    yi = Y[i]
    obs[yi][xi] += 1
obs

Unnamed: 0,No,Yes
Overcast,0,4
Rain,2,3
Sunny,3,2


## Approximate the joint distribution $p(x,y)$
Take $x$ to be Outlook and approximate the joint distribution based on the table of observations.

In [40]:
m = obs.sum(axis=1)
m

Overcast    4
Rain        5
Sunny       5
dtype: int64

In [41]:
l = obs.sum(axis=0)
l

No     5
Yes    9
dtype: int64

In [42]:
obs['m'] = m
obs.loc['l'] = l
obs

Unnamed: 0,No,Yes,m
Overcast,0.0,4.0,4.0
Rain,2.0,3.0,5.0
Sunny,3.0,2.0,5.0
l,5.0,9.0,


In [43]:
joint_proba = pd.DataFrame(0, columns=yvalues, index=xvalues)
for x in xvalues:
    joint_proba.loc[x] = obs[yvalues].loc[x] / N
joint_proba

Unnamed: 0,No,Yes
Overcast,0.0,0.285714
Rain,0.142857,0.214286
Sunny,0.214286,0.142857


## Approximate $p(y|x)$
Take $x$ to be Outlook and estimate the conditional probability of $y$ given $x$. Then, sample 10 values of $y$ given $x$ equal Sunny.

In [44]:
p_y_x = pd.DataFrame(0, columns=yvalues, index=xvalues)
for x in xvalues:
    p_y_x.loc[x] = obs[yvalues].loc[x] / obs['m'].loc[x]
p_y_x

Unnamed: 0,No,Yes
Overcast,0.0,1.0
Rain,0.4,0.6
Sunny,0.6,0.4


In [45]:
np.random.choice(yvalues, size=10, p=p_y_x.loc['Sunny'])

array(['Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No'],
      dtype='<U3')

## Approximate $p(x|y)$
Take $x$ to be Outlook and approximate the conditional distribution based on the table of observations. Then, sample 10 values of Outlook for $y$ equal Yes.

In [46]:
p_x_y = pd.DataFrame(0, columns=yvalues, index=xvalues)
for y in yvalues:
    p_x_y[y] = obs[y] / obs[y].loc['l']
p_x_y

Unnamed: 0,No,Yes
Overcast,0.0,0.444444
Rain,0.4,0.333333
Sunny,0.6,0.222222


In [28]:
np.random.choice(xvalues, size=10, p=p_x_y['Yes'])

array(['Rain', 'Rain', 'Overcast', 'Overcast', 'Overcast', 'Overcast',
       'Sunny', 'Overcast', 'Overcast', 'Overcast'], dtype='<U8')

### P(y)

In [47]:
yes_count=0
for y in Y:
    if y =='Yes':
        yes_count+= 1



In [48]:
yes_count

9

In [52]:
total_rows = len(Y)
total_rows

14

In [152]:
p_yes= yes_count/total_rows
p_yes

0.6428571428571429

In [157]:
yvalues = ['Tennis']
xvalues = ['Yes', 'No']

In [163]:
p_y = pd.DataFrame(0, columns=xvalues, index=yvalues)
p_y

Unnamed: 0,Yes,No
Tennis,0,0


In [164]:
p_y['Yes'] = p_yes
p_y['No'] = 1 - p_yes
p_y

Unnamed: 0,Yes,No
Tennis,0.642857,0.357143


In [166]:
np.random.choice(xvalues, size=10, p=p_y.loc['Tennis'])

array(['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes'],
      dtype='<U3')

### P(h|y,o)

In [56]:
N_outlook = df['Outlook'].size
N_tennis = df['Tennis'].size

In [62]:
outlook_values = np.unique(X['Outlook'].values).tolist()
tennis_values = np.unique(df['Tennis'].values).tolist()
yvalues = np.unique(X['Humidity'].values).tolist()
outlook_values

['Overcast', 'Rain', 'Sunny']

In [64]:
import itertools

# Use itertools.product to find all combinations
combinations = list(itertools.product(outlook_values, tennis_values))

# Print the result
for combination in combinations:
    print(combination)

('Overcast', 'No')
('Overcast', 'Yes')
('Rain', 'No')
('Rain', 'Yes')
('Sunny', 'No')
('Sunny', 'Yes')


In [65]:
xvalues= []
for combination in combinations:
    xvalues.append(f'{combination[0]}-{combination[1]}')
xvalues

['Overcast-No', 'Overcast-Yes', 'Rain-No', 'Rain-Yes', 'Sunny-No', 'Sunny-Yes']

In [72]:

res = []

for combination in combinations:
    count_high = df[(df['Outlook'] == combination[0]) & (df['Tennis'] == combination[1]) & (df['Humidity'] == 'High')].shape[0]
    count_normal = df[(df['Outlook'] == combination[0]) & (df['Tennis'] == combination[1]) & (df['Humidity'] == 'Normal')].shape[0]
    
    res.append((combination[0], combination[1], count_high, count_normal))

for item in res:
    print(f'Outlook: {item[0]}, Tennis: {item[1]}, High: {item[2]}, Normal: {item[3]}')

Outlook: Overcast, Tennis: No, High: 0, Normal: 0
Outlook: Overcast, Tennis: Yes, High: 2, Normal: 2
Outlook: Rain, Tennis: No, High: 1, Normal: 1
Outlook: Rain, Tennis: Yes, High: 1, Normal: 2
Outlook: Sunny, Tennis: No, High: 3, Normal: 0
Outlook: Sunny, Tennis: Yes, High: 0, Normal: 2


In [73]:
obs = pd.DataFrame(0, columns=yvalues, index=xvalues)
obs

Unnamed: 0,High,Normal
Overcast-No,0,0
Overcast-Yes,0,0
Rain-No,0,0
Rain-Yes,0,0
Sunny-No,0,0
Sunny-Yes,0,0


In [82]:
high =[]
normal =[]
for i in range(len(res)):
    high.append(res[i][2])
    normal.append(res[i][3])
obs['High']=high
obs['Normal']=normal
obs

Unnamed: 0,High,Normal
Overcast-No,0,0
Overcast-Yes,2,2
Rain-No,1,1
Rain-Yes,1,2
Sunny-No,3,0
Sunny-Yes,0,2


In [84]:
m = obs.sum(axis=1)
l = obs.sum(axis=0)

In [87]:
obs['m'] = m
obs.loc['l'] = l
obs

Unnamed: 0,High,Normal,m
Overcast-No,0.0,0.0,0.0
Overcast-Yes,2.0,2.0,4.0
Rain-No,1.0,1.0,2.0
Rain-Yes,1.0,2.0,3.0
Sunny-No,3.0,0.0,3.0
Sunny-Yes,0.0,2.0,2.0
l,7.0,7.0,


In [88]:
p_h_o_y = pd.DataFrame(0, columns=yvalues, index=xvalues)
for x in xvalues:
    p_h_o_y.loc[x] = obs[yvalues].loc[x] / obs['m'].loc[x]
p_h_o_y

Unnamed: 0,High,Normal
Overcast-No,,
Overcast-Yes,0.5,0.5
Rain-No,0.5,0.5
Rain-Yes,0.333333,0.666667
Sunny-No,1.0,0.0
Sunny-Yes,0.0,1.0


In [92]:
np.random.choice(yvalues, size=10, p=p_h_o_y.loc['Sunny-No'])

array(['High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High'], dtype='<U6')

### P(w|o y)

In [136]:
N_outlook = df['Outlook'].size
N_tennis = df['Tennis'].size

In [137]:
outlook_values = np.unique(X['Outlook'].values).tolist()
tennis_values = np.unique(df['Tennis'].values).tolist()
yvalues = np.unique(X['Wind'].values).tolist()
outlook_values

['Overcast', 'Rain', 'Sunny']

In [138]:
import itertools

# Use itertools.product to find all combinations
combinations = list(itertools.product(outlook_values, tennis_values))

# Print the result
for combination in combinations:
    print(combination)

('Overcast', 'No')
('Overcast', 'Yes')
('Rain', 'No')
('Rain', 'Yes')
('Sunny', 'No')
('Sunny', 'Yes')


In [139]:
xvalues= []
for combination in combinations:
    xvalues.append(f'{combination[0]}-{combination[1]}')
xvalues

['Overcast-No', 'Overcast-Yes', 'Rain-No', 'Rain-Yes', 'Sunny-No', 'Sunny-Yes']

In [140]:
res = []

for combination in combinations:
    count_high = df[(df['Outlook'] == combination[0]) & (df['Tennis'] == combination[1]) & (df['Wind'] == 'Weak')].shape[0]
    count_normal = df[(df['Outlook'] == combination[0]) & (df['Tennis'] == combination[1]) & (df['Wind'] == 'Strong')].shape[0]
    
    res.append((combination[0], combination[1], count_high, count_normal))

for item in res:
    print(f'Outlook: {item[0]}, Tennis: {item[1]}, Weak: {item[2]}, Strong: {item[3]}')

Outlook: Overcast, Tennis: No, Weak: 0, Strong: 0
Outlook: Overcast, Tennis: Yes, Weak: 2, Strong: 2
Outlook: Rain, Tennis: No, Weak: 0, Strong: 2
Outlook: Rain, Tennis: Yes, Weak: 3, Strong: 0
Outlook: Sunny, Tennis: No, Weak: 2, Strong: 1
Outlook: Sunny, Tennis: Yes, Weak: 1, Strong: 1


In [141]:
obs = pd.DataFrame(0, columns=yvalues, index=xvalues)
obs

Unnamed: 0,Strong,Weak
Overcast-No,0,0
Overcast-Yes,0,0
Rain-No,0,0
Rain-Yes,0,0
Sunny-No,0,0
Sunny-Yes,0,0


In [142]:
weak =[]
strong =[]
for i in range(len(res)):
    weak.append(res[i][2])
    strong.append(res[i][3])
obs['Weak']=weak
obs['Strong']=strong
obs

Unnamed: 0,Strong,Weak
Overcast-No,0,0
Overcast-Yes,2,2
Rain-No,2,0
Rain-Yes,0,3
Sunny-No,1,2
Sunny-Yes,1,1


In [143]:
m = obs.sum(axis=1)
l = obs.sum(axis=0)

In [144]:
obs['m'] = m
obs.loc['l'] = l
obs

Unnamed: 0,Strong,Weak,m
Overcast-No,0.0,0.0,0.0
Overcast-Yes,2.0,2.0,4.0
Rain-No,2.0,0.0,2.0
Rain-Yes,0.0,3.0,3.0
Sunny-No,1.0,2.0,3.0
Sunny-Yes,1.0,1.0,2.0
l,6.0,8.0,


In [145]:
p_w_o_y = pd.DataFrame(0, columns=yvalues, index=xvalues)
for x in xvalues:
    if obs['m'].loc[x] != 0:
        p_w_o_y.loc[x] = obs[yvalues].loc[x] / obs['m'].loc[x]
    else: 
         p_w_o_y.loc[x] = 0
p_w_o_y

Unnamed: 0,Strong,Weak
Overcast-No,0.0,0.0
Overcast-Yes,0.5,0.5
Rain-No,1.0,0.0
Rain-Yes,0.0,1.0
Sunny-No,0.333333,0.666667
Sunny-Yes,0.5,0.5


In [148]:
np.random.choice(yvalues, size=10, p=p_w_o_y.loc['Overcast-Yes'])

array(['Weak', 'Weak', 'Strong', 'Weak', 'Strong', 'Weak', 'Strong',
       'Weak', 'Weak', 'Strong'], dtype='<U6')

### P(t| y o h w)

In [119]:
N_outlook = df['Outlook'].size
N_tennis = df['Tennis'].size
N_hummidity = df['Humidity'].size
N_wind = df['Wind'].size

In [120]:
outlook_values = np.unique(X['Outlook'].values).tolist()
tennis_values = np.unique(df['Tennis'].values).tolist()
humidity_values = np.unique(X['Humidity'].values).tolist()
wind_values = np.unique(df['Wind'].values).tolist()
yvalues = np.unique(X['Temp'].values).tolist()
outlook_values

['Overcast', 'Rain', 'Sunny']

In [121]:
import itertools

# Use itertools.product to find all combinations
combinations = list(itertools.product(outlook_values, tennis_values, humidity_values, wind_values))

# Print the result
for combination in combinations:
    print(combination)

('Overcast', 'No', 'High', 'Strong')
('Overcast', 'No', 'High', 'Weak')
('Overcast', 'No', 'Normal', 'Strong')
('Overcast', 'No', 'Normal', 'Weak')
('Overcast', 'Yes', 'High', 'Strong')
('Overcast', 'Yes', 'High', 'Weak')
('Overcast', 'Yes', 'Normal', 'Strong')
('Overcast', 'Yes', 'Normal', 'Weak')
('Rain', 'No', 'High', 'Strong')
('Rain', 'No', 'High', 'Weak')
('Rain', 'No', 'Normal', 'Strong')
('Rain', 'No', 'Normal', 'Weak')
('Rain', 'Yes', 'High', 'Strong')
('Rain', 'Yes', 'High', 'Weak')
('Rain', 'Yes', 'Normal', 'Strong')
('Rain', 'Yes', 'Normal', 'Weak')
('Sunny', 'No', 'High', 'Strong')
('Sunny', 'No', 'High', 'Weak')
('Sunny', 'No', 'Normal', 'Strong')
('Sunny', 'No', 'Normal', 'Weak')
('Sunny', 'Yes', 'High', 'Strong')
('Sunny', 'Yes', 'High', 'Weak')
('Sunny', 'Yes', 'Normal', 'Strong')
('Sunny', 'Yes', 'Normal', 'Weak')


In [123]:
len(combinations)

24

In [124]:
xvalues= []
for combination in combinations:
    xvalues.append(f'{combination[0]}-{combination[1]}-{combination[2]}-{combination[3]}')
xvalues

['Overcast-No-High-Strong',
 'Overcast-No-High-Weak',
 'Overcast-No-Normal-Strong',
 'Overcast-No-Normal-Weak',
 'Overcast-Yes-High-Strong',
 'Overcast-Yes-High-Weak',
 'Overcast-Yes-Normal-Strong',
 'Overcast-Yes-Normal-Weak',
 'Rain-No-High-Strong',
 'Rain-No-High-Weak',
 'Rain-No-Normal-Strong',
 'Rain-No-Normal-Weak',
 'Rain-Yes-High-Strong',
 'Rain-Yes-High-Weak',
 'Rain-Yes-Normal-Strong',
 'Rain-Yes-Normal-Weak',
 'Sunny-No-High-Strong',
 'Sunny-No-High-Weak',
 'Sunny-No-Normal-Strong',
 'Sunny-No-Normal-Weak',
 'Sunny-Yes-High-Strong',
 'Sunny-Yes-High-Weak',
 'Sunny-Yes-Normal-Strong',
 'Sunny-Yes-Normal-Weak']

In [126]:
res = []

for combination in combinations:
    count_hot = df[(df['Outlook'] == combination[0]) & (df['Tennis'] == combination[1]) & (df['Humidity'] == combination[2]) & (df['Wind'] == combination[3]) & (df['Temp'] == 'Hot')].shape[0]
    count_cool = df[(df['Outlook'] == combination[0]) & (df['Tennis'] == combination[1]) & (df['Humidity'] == combination[2]) & (df['Wind'] == combination[3]) & (df['Temp'] == 'Cool')].shape[0]
    count_mild = df[(df['Outlook'] == combination[0]) & (df['Tennis'] == combination[1]) & (df['Humidity'] == combination[2]) & (df['Wind'] == combination[3]) & (df['Temp'] == 'Mild')].shape[0]
    
    res.append((combination[0], combination[1] ,combination[2], combination[3], count_hot, count_cool ,count_mild))

for item in res:
    print(f'Outlook: {item[0]}, Tennis: {item[1]}, Humidity: {item[2]}, Wind: {item[3]}, Hot: {item[4]}, Cool: {item[5]}, Mild: {item[6]}')

Outlook: Overcast, Tennis: No, Humidity: High, Wind: Strong, Hot: 0, Cool: 0, Mild: 0
Outlook: Overcast, Tennis: No, Humidity: High, Wind: Weak, Hot: 0, Cool: 0, Mild: 0
Outlook: Overcast, Tennis: No, Humidity: Normal, Wind: Strong, Hot: 0, Cool: 0, Mild: 0
Outlook: Overcast, Tennis: No, Humidity: Normal, Wind: Weak, Hot: 0, Cool: 0, Mild: 0
Outlook: Overcast, Tennis: Yes, Humidity: High, Wind: Strong, Hot: 0, Cool: 0, Mild: 1
Outlook: Overcast, Tennis: Yes, Humidity: High, Wind: Weak, Hot: 1, Cool: 0, Mild: 0
Outlook: Overcast, Tennis: Yes, Humidity: Normal, Wind: Strong, Hot: 0, Cool: 1, Mild: 0
Outlook: Overcast, Tennis: Yes, Humidity: Normal, Wind: Weak, Hot: 1, Cool: 0, Mild: 0
Outlook: Rain, Tennis: No, Humidity: High, Wind: Strong, Hot: 0, Cool: 0, Mild: 1
Outlook: Rain, Tennis: No, Humidity: High, Wind: Weak, Hot: 0, Cool: 0, Mild: 0
Outlook: Rain, Tennis: No, Humidity: Normal, Wind: Strong, Hot: 0, Cool: 1, Mild: 0
Outlook: Rain, Tennis: No, Humidity: Normal, Wind: Weak, Hot: 

In [127]:
obs = pd.DataFrame(0, columns=yvalues, index=xvalues)
obs

Unnamed: 0,Cool,Hot,Mild
Overcast-No-High-Strong,0,0,0
Overcast-No-High-Weak,0,0,0
Overcast-No-Normal-Strong,0,0,0
Overcast-No-Normal-Weak,0,0,0
Overcast-Yes-High-Strong,0,0,0
Overcast-Yes-High-Weak,0,0,0
Overcast-Yes-Normal-Strong,0,0,0
Overcast-Yes-Normal-Weak,0,0,0
Rain-No-High-Strong,0,0,0
Rain-No-High-Weak,0,0,0


In [129]:
hot =[]
cool =[]
mild = []
for i in range(len(res)):
    hot.append(res[i][4])
    cool.append(res[i][5])
    mild.append(res[i][6])
obs['Hot']=hot
obs['Cool']=cool
obs['Mild']=mild
obs

Unnamed: 0,Cool,Hot,Mild
Overcast-No-High-Strong,0,0,0
Overcast-No-High-Weak,0,0,0
Overcast-No-Normal-Strong,0,0,0
Overcast-No-Normal-Weak,0,0,0
Overcast-Yes-High-Strong,0,0,1
Overcast-Yes-High-Weak,0,1,0
Overcast-Yes-Normal-Strong,1,0,0
Overcast-Yes-Normal-Weak,0,1,0
Rain-No-High-Strong,0,0,1
Rain-No-High-Weak,0,0,0


In [130]:
m = obs.sum(axis=1)
l = obs.sum(axis=0)

In [131]:
obs['m'] = m
obs.loc['l'] = l
obs

Unnamed: 0,Cool,Hot,Mild,m
Overcast-No-High-Strong,0.0,0.0,0.0,0.0
Overcast-No-High-Weak,0.0,0.0,0.0,0.0
Overcast-No-Normal-Strong,0.0,0.0,0.0,0.0
Overcast-No-Normal-Weak,0.0,0.0,0.0,0.0
Overcast-Yes-High-Strong,0.0,0.0,1.0,1.0
Overcast-Yes-High-Weak,0.0,1.0,0.0,1.0
Overcast-Yes-Normal-Strong,1.0,0.0,0.0,1.0
Overcast-Yes-Normal-Weak,0.0,1.0,0.0,1.0
Rain-No-High-Strong,0.0,0.0,1.0,1.0
Rain-No-High-Weak,0.0,0.0,0.0,0.0


In [134]:
p_t_o_y_h_w = pd.DataFrame(0, columns=yvalues, index=xvalues)
for x in xvalues:
    if obs['m'].loc[x] != 0:
        p_t_o_y_h_w.loc[x] = obs[yvalues].loc[x] / obs['m'].loc[x]
    else:
        p_t_o_y_h_w.loc[x] = 0
p_t_o_y_h_w

Unnamed: 0,Cool,Hot,Mild
Overcast-No-High-Strong,0.0,0.0,0.0
Overcast-No-High-Weak,0.0,0.0,0.0
Overcast-No-Normal-Strong,0.0,0.0,0.0
Overcast-No-Normal-Weak,0.0,0.0,0.0
Overcast-Yes-High-Strong,0.0,0.0,1.0
Overcast-Yes-High-Weak,0.0,1.0,0.0
Overcast-Yes-Normal-Strong,1.0,0.0,0.0
Overcast-Yes-Normal-Weak,0.0,1.0,0.0
Rain-No-High-Strong,0.0,0.0,1.0
Rain-No-High-Weak,0.0,0.0,0.0


In [133]:
np.random.choice(yvalues, size=10, p=p_t_o_y_h_w.loc['Overcast-Yes-High-Strong'])

array(['Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
       'Mild', 'Mild'], dtype='<U4')