In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class NaiveException(Exception):
  """Error raised by the Naive Bayes helpers when their input data is not usable."""

In [None]:
def calc_class_proba(input_df, target_name):
    """
    Estimate the prior probability of every class found in a target column.

    Parameters:
        input_df (pd.DataFrame): Dataset that holds the target column.
        target_name (str): Name of the column with the class labels.

    Returns:
        dict: Maps each distinct label of the target column to its relative
              frequency, i.e. its number of occurrences divided by the number
              of rows of ``input_df``.

    Raises:
        ValueError: If ``target_name`` is not one of the DataFrame's columns.

    Note:
    - Labels are counted with ``Series.value_counts``; with its defaults the
      most frequent label comes first and missing labels are not counted,
      while the divisor is always the full row count.
    """
    available_columns = input_df.columns.tolist()
    if target_name in available_columns:
        n_rows = len(input_df)
        label_counts = input_df[target_name].value_counts()
        return (label_counts / n_rows).to_dict()

    raise ValueError(f"Target column '{target_name}' not found in input DataFrame columns.")

In [None]:
def calc_n_nprime(column, cval, target_val, df):
  """
  Count the samples needed for a Naive Bayes conditional-probability estimate.

  Parameters:
      column (str): The name of the feature column to inspect.
      cval: The feature value to count within 'column'.
      target_val: The class label used to filter the 'Target' column.
      df (pd.DataFrame): The input DataFrame; it must contain a 'Target' column.

  Returns:
      tuple: A tuple containing two values: n_prime and n.
          - n_prime (int): The count of samples where 'column' matches 'cval' and 'Target' matches 'target_val'.
          - n (int): The count of samples where 'Target' matches 'target_val'.

  Raises:
      NaiveException: If 'column' or 'Target' is not a column of the DataFrame,
          if 'cval' never occurs in 'column', or if 'target_val' never occurs
          in the 'Target' column.
  """
  if column not in df.columns:
    raise NaiveException(f'Input column {column} not found in dataframe.')
  if cval not in df[column].unique():
    raise NaiveException(f'Input column value {cval!r} not found in input column {column}.')
  # Report a missing label column with the module's own exception instead of
  # letting the lookup below fail with a bare KeyError.
  if 'Target' not in df.columns:
    raise NaiveException('Target column not found in dataframe.')
  if target_val not in df['Target'].unique():
    raise NaiveException(f'Input target value {target_val!r} not found in Target column.')

  # Select the rows of the requested class once; both counts derive from it.
  class_rows = df[df['Target'] == target_val]
  n_prime = (class_rows[column] == cval).sum()
  n = class_rows.shape[0]

  return n_prime, n


In [None]:
def calc_function1(input_m, df, input_vector):
  """
  Calculate the likelihood P(x / class) of an input vector for every target class.

  For each class and each feature value the m-estimate
  (n' + m * p) / (n + m) is computed, where n is the number of rows of the
  class, n' the number of those rows whose feature equals the given value and
  p = 1 / (number of distinct values of the feature). The per-feature
  estimates are multiplied together.

  Parameters:
      input_m (int): Equivalent sample size m of the m-estimate. A value of 0
          does NOT disable smoothing: it is replaced by the number of classes.
      df (pd.DataFrame): The input DataFrame; the class labels are read from the 'Target' column.
      input_vector (list): Feature values, ordered like the DataFrame's non-'Target' columns.
          A vector shorter than the number of feature columns is evaluated on the leading features only.

  Returns:
      dict: A dictionary where keys are in the form 'P(x / target_val)' and values are the
          corresponding likelihoods, in the key order returned by calc_class_proba.

  Raises:
      ValueError: If the DataFrame has no 'Target' column (raised by calc_class_proba).
      NaiveException: If input_vector holds more values than there are feature columns.

  Note:
    - Any number of classes is supported; one entry is produced per distinct label.
  """
  dic_target_probs = calc_class_proba(df, 'Target')

  # Every column except the label is a feature, wherever 'Target' sits in the frame.
  feature_columns = [col for col in df.columns if col != 'Target']
  if len(input_vector) > len(feature_columns):
    raise NaiveException(
      f'Input vector has {len(input_vector)} values but the dataframe only has '
      f'{len(feature_columns)} feature columns.')

  # Uniform prior estimate p of each feature: one over its number of distinct values.
  p_v = [1 / len(df[col].unique()) for col in feature_columns]

  # m does not depend on the class or the feature, so resolve it once.
  m = input_m if input_m != 0 else len(dic_target_probs)

  dic_probs = {}
  for target_val in dic_target_probs:
    # Rows of the current class; reused for every feature.
    class_rows = df[df['Target'] == target_val]
    n = class_rows.shape[0]

    likelihood = 1
    for column, p, value in zip(feature_columns, p_v, input_vector):
      n_prime = (class_rows[column] == value).sum()
      likelihood = likelihood * ((n_prime + m * p) / (n + m))

    dic_probs[f'P(x / {target_val})'] = likelihood

  return dic_probs


In [None]:
def Naive_Bayes(input_m, df, input_vector):
  """
  Calculate the posterior probability P(class / x) of every target class for an input vector.

  The likelihoods returned by calc_function1 are multiplied by the class
  priors returned by calc_class_proba and normalised so that they sum to one.

  Parameters:
      input_m (int): Equivalent sample size m forwarded to calc_function1
          (there, a value of 0 is replaced by the number of classes).
      df (pd.DataFrame): The input DataFrame; the class labels must be in a column named 'Target'.
      input_vector (list): Feature values of the sample to classify, ordered like the DataFrame's columns.

  Returns:
      dict: A dictionary where keys are in the form 'P(target_val / x)' and values are the corresponding posterior probabilities.

  Raises:
      NaiveException: If the DataFrame has no 'Target' column.

  Note:
    - Any number of classes is supported; one entry is produced per distinct label.
  """
  if 'Target' not in df.columns:
    raise NaiveException('Target column not found. Rename the target feature to Target.')

  likelihoods = calc_function1(input_m, df, input_vector)
  target_probs = calc_class_proba(df, 'Target')

  # Pair prior and likelihood by class label rather than by dictionary
  # position, so the result does not depend on both dicts sharing an order.
  labels = list(target_probs)
  priors = [target_probs[label] for label in labels]
  class_likelihoods = [likelihoods[f'P(x / {label})'] for label in labels]

  # Evidence P(x): sum over the classes of prior * likelihood.
  denominator = np.dot(priors, class_likelihoods)

  final_probs = {}
  for label, prior, likelihood in zip(labels, priors, class_likelihoods):
    final_probs[f'P({label} / x)'] = (prior * likelihood) / denominator

  return final_probs