In [None]:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import scipy.stats as stats
from sklearn.model_selection import train_test_split


In [None]:
# Load the pumpkin-seed table from the Excel workbook (path is relative to the working directory).
df = pd.read_excel('Pumpkin_Seeds_Dataset.xlsx')
# head must be *called*: a bare `df.head` only displays the bound method's repr,
# not the first rows of the frame.
df.head()

In [None]:
# Predictor columns fed to the model; copied so later edits cannot touch `df`.
feature_cols = ['Area', 'Convex_Area', 'Eccentricity', 'Compactness', 'Aspect_Ration']
x = df[feature_cols].copy()

# Column names, kept for labelling the regression coefficients later.
labels = x.columns.to_list()

# Encode the class names as consecutive integers, in the sorted order
# returned by np.unique.
y = df['Class']
class_names = np.unique(y)
y_map = dict(zip(class_names, range(len(class_names))))
y_labeled = y.map(y_map).to_numpy()

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_labeled,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardize the data
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Rebuild the coefficient labels from the training columns instead of inserting
# into the existing list: an in-place insert adds another "Intercept" every time
# this cell is re-run, which would no longer match the design-matrix width.
labels = ["Intercept", *x_train.columns.to_list()]

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the test rows enters the transform.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Prepend a column of ones so the first coefficient acts as the intercept.
x_train_scaled_int = np.hstack((np.ones((x_train_scaled.shape[0], 1)), x_train_scaled))
x_test_scaled_int = np.hstack((np.ones((x_test_scaled.shape[0], 1)), x_test_scaled))

In [None]:
# One named coordinate per regression coefficient (intercept + predictors).
coords = dict(coeffs=labels)
model = pm.Model(coords=coords)

with model:
    # Data containers for the design matrix and the binary targets.
    # NOTE(review): pm.MutableData appears to be deprecated in recent PyMC
    # releases in favour of pm.Data — confirm against the installed version.
    X = pm.MutableData("X", x_train_scaled_int)
    y = pm.MutableData("y", y_train)

    # Standard-normal prior on every coefficient, indexed by the "coeffs" dimension.
    b = pm.Normal("b", mu=0, sigma=1, dims="coeffs")

    # Linear predictor: design matrix times coefficient vector.
    mu = pm.math.dot(X, b)

    # Inverse-probit link turns the linear predictor into a probability;
    # wrapping it in Deterministic records it under the name "p".
    p = pm.Deterministic("p", pm.invprobit(mu))

    # Bernoulli likelihood over the observed class labels.
    obs = pm.Bernoulli("obs", p=p, observed=y)

# Render the model graph as the cell output.
pm.model_to_graphviz(model)

In [None]:
with model:
    # Draw posterior samples with the sampler's default settings. The seed is
    # fixed so a fresh Restart & Run All reproduces the same draws; 42 matches
    # the random_state used for the train/test split.
    idata = pm.sample(random_seed=42)