<a href="https://colab.research.google.com/github/Mahnazshamissa/Python/blob/main/prior_and_posteriors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creation of the priors and posteriors table
Built from the data provided by Katia.

In [7]:
import pandas as pd

In [8]:
# 0. imports
import numpy as np
import pickle

# 1. load the data
# The location of the pickle is kept in one named constant so it can be changed in a
# single place when the notebook is run somewhere this absolute path does not exist.
DATA_PATH = '/content/fruit_data.pkl'

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file being read.
# Only point DATA_PATH at a file that comes from a trusted source.
with open(DATA_PATH, 'rb') as flBytes:
    raw = pickle.load(flBytes)
 

In [9]:
# Labels for the columns of the raw data: the fruit name comes first,
# followed by one label per feature.
columnNames = [
    'fruit',
    'long',
    'sweet',
    'yellow',
    'seeds',
    'Brazil',
]

In [10]:
# Build a labelled frame from the raw data: every column is first read as text, then
# all columns after the leading fruit label are converted to int8.
featureNames = columnNames[1:]
dfRaw = pd.DataFrame(raw, columns=columnNames, dtype='str')
dfRaw[featureNames] = dfRaw[featureNames].astype('int8')

# One row per fruit: the per-feature sums, plus the number of records of that fruit.
byFruit = dfRaw.groupby('fruit')
dfPosts = byFruit.sum()
dfPosts['total'] = byFruit['fruit'].count()

# Extra 'combined' row holding the column-wise sum over all fruits.
dfPosts.loc['combined'] = dfPosts.sum()

# Independent copy of the counts, set aside for the normalised table.
dfPostsNorm = dfPosts.copy()

# Show the per-fruit record counts (including the 'combined' row).
dfPosts['total']

fruit
bananas      400.0
mangoes      300.0
oranges      500.0
others       200.0
combined    1400.0
Name: total, dtype: float64

In [11]:
# Normalise the counts. Every value is read from the untouched counts in `dfPosts` and
# written into `dfPostsNorm`, so running this cell again yields the same table instead
# of dividing already-normalised values a second time.
featureCols = dfPosts.columns[:-1]  # every column except the trailing 'total'

# Feature columns: each row is divided by that row's own total
# (axis=0 aligns the 'total' Series with the row index).
dfPostsNorm[featureCols] = dfPosts[featureCols].div(dfPosts['total'], axis=0)

# 'total' column: each row's share of the 'combined' total.
dfPostsNorm['total'] = dfPosts['total'] / dfPosts.loc['combined', 'total']

dfPostsNorm


Unnamed: 0_level_0,long,sweet,yellow,seeds,Brazil,total
fruit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bananas,0.9,0.625,0.975,0.0,0.1625,0.285714
mangoes,0.366667,0.666667,0.4,0.0,1.0,0.214286
oranges,0.006,0.92,0.06,0.94,0.02,0.357143
others,0.225,1.0,0.5,0.95,0.0,0.142857
combined,0.37,0.792857,0.457143,0.471429,0.267857,1.0


# Feature columns of the normalised table: each row's counts divided by that row's total.
# The values are taken from the raw counts in `dfPosts` rather than from `dfPostsNorm`
# itself, because by this point `dfPostsNorm.total` holds shares of the combined total
# (not counts) and the feature columns may already be normalised; dividing in place
# would distort the table.
featureCols = dfPosts.columns[:-1]  # every column except the trailing 'total'
dfPostsNorm[featureCols] = dfPosts[featureCols].div(dfPosts['total'], axis=0)
dfPostsNorm

In [12]:
# 2. basic dimensions of the raw array
#    rawRows: total number of fruits; rawCols: number of feats, including the fruit type
rawRows, rawCols = raw.shape[:2]

# Distinct fruit labels found in the first column, together with how often each occurs.
# Kept as the (labels, counts) tuple that np.unique returns.
fruitTypes = np.unique(raw[:, 0], return_counts=True)

# 3. one dictionary entry per fruit, keyed by the fruit name.
#    Each value is a two-element list: [numeric id, float array of that fruit's feature rows].
#    The numeric id allows the fruit to be stored inside a purely numeric numpy array.
fruitDict = {}
for i, el in enumerate(fruitTypes[0]):
    isThisFruit = raw[:, 0] == el  # boolean mask over the rows belonging to this fruit
    fruitDict[el] = [i, raw[isThisFruit, 1:].astype(float)]
    

In [13]:
# 4. allocate the posteriors matrix (not normalized), initially all zeros
nFeats = rawCols - 1                # feature columns only, the fruit-type column is excluded
postRows = len(fruitTypes[0]) + 1   # one row per fruit, plus one row for the sums
postCols = 1 + 2 * nFeats + 1       # index column + two columns per feature + sum column
posteriors = np.zeros(shape=(postRows, postCols))


In [14]:
# 5. fill, for each fruit, its corresponding row of the posteriors matrix
#    Column layout of a row: [id, feat1 fulfilled, feat1 not fulfilled, ..., total amount]
for rowId, featRows in fruitDict.values():
    amount = featRows.shape[0]            # number of records of this fruit
    additions = np.sum(featRows, axis=0)  # per-feature sum over this fruit's records
    posteriors[rowId, 0] = rowId          # first column: numeric id of the fruit
    posteriors[rowId, -1] = amount        # last column: total amount of this fruit
    # Odd columns (1, 3, ...) hold the "fulfilling the feat" sums and the even columns
    # (2, 4, ...) the complementary "not fulfilling" counts; the strided slices write all
    # features at once and stop before the trailing total column.
    posteriors[rowId, 1:-1:2] = additions
    posteriors[rowId, 2:-1:2] = amount - additions

# 6. final row: column-wise sum of the fruit rows above it.
#    The summary row itself is excluded from the sum, so running this cell again does
#    not add the previous totals on top of themselves.
posteriors[-1] = np.sum(posteriors[:-1], axis=0)
#    The id column is a label, not a count: instead of the (meaningless) sum of the fruit
#    ids, the summary row gets its own row index as id, like every fruit row.
posteriors[-1, 0] = posteriors.shape[0] - 1

In [15]:
# 7. posteriorsNorm: the posteriors with every row divided by that row's total amount
## The last column (the amount of each fruit) is copied and stretched to the full shape
## of posteriors, so that every entry of a row holds that row's amount; the element-wise
## division then normalizes each row by its own total.
arrayTotals = np.broadcast_to(posteriors[:, -1:].copy(), posteriors.shape)
posteriorsNorm = np.true_divide(posteriors, arrayTotals)


In [16]:
# Total amount over all fruits: last row, last column of the un-normalized matrix.
grandTotal = posteriors[-1, -1]

# 8. the row-wise division also rescaled the id column, so put the original ids back
posteriorsNorm[:, 0] = posteriors[:, 0]

# 9. after the division the last column carries no information; replace it with each
#    row's amount expressed as a fraction of the total amount of fruits
posteriorsNorm[:, -1] = posteriors[:, -1] / grandTotal