# Confidence Interval

There are multiple situation where we need to know Confidence interval. In order to address this issue in following table present 6 differents scenarios and approaches/formula to be considered. 

Coupled with this table is the code which will allow us to determine easily confince interval using Student t-table as well as Z-table.


|ID| # Population | Population Variance     | Samples     | Statistic |  Variance            | ID Formula |
|:---|:-------------|:------------------------|:------------|:----------|:-------------------------|------------:|
| 1 | One          | Known                   | -           |  z        | $\sigma^2$               | \begin{equation}
\bar{X} \pm Z_{\alpha/2} \times \frac{\sigma}{\sqrt{n}}
\end{equation}     |
| 2 | One          | Unknown                 | -           |  t        | $s^2$                    |\begin{equation}
\bar{X} \pm t_{n-1,\alpha/2} \times \frac{s}{\sqrt{n}}
\end{equation}|
| 3 | Two          | -                       | Dependent   |  t        | $s^2_{difference}$       |\begin{equation}
\bar{d} \pm t_{n-1,\alpha/2} \times \frac{s_d}{\sqrt{n}}
\end{equation}|
| 4 | Two          | Known                   | Independent |  z        | $\sigma^2_x, \sigma^2_y$ |\begin{equation}
(\bar{x} - \bar{y}) \pm Z_{\alpha/2} \sqrt{\frac{\sigma^2_x}{n_x} + \frac{\sigma^2_y}{n_y}}
\end{equation}|
| 5 | Two          | Unkown, assumed equal | Indepent    |  t        | $s^2_{p}$ (eq_s2p)        |\begin{equation}
(\bar{x} - \bar{y}) \pm t_{n_x+n_y - 2, \alpha/2} \sqrt{\frac{s^2_p}{n_x} + \frac{s^2_p}{n_y}}
\end{equation}|
| 6 | Two          | Unkown, assumed different | Independent |  t        | $s^2_x, s^2_y$           |\begin{equation}
(\bar{x} - \bar{y}) \pm t_{\upsilon, \alpha/2} \sqrt{\frac{s^2_x}{n_x} + \frac{s^2_y}{n_y}}
\end{equation}|






\begin{equation}
    s_p^2 = \frac{(n_x-1) \times s_x^2 + (n_y - 1) \times s_y^2}{n_x + n_y -2}
    \tag{eq_s2p}
    \label{variance_s2p}
\end{equation} 

## Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

## Creating Class, Confidence_interval

In [2]:
class Confidence_interval:
    def __init__(self, data, data2=None, z_half=None, tn_1_half=None, tnx_ny_2_half=None, tv_half=None, std=None, std2=None):
        self.data = data
        self.data2=data2
        self.z_half = z_half
        self.tn_1_half = tn_1_half
        self.tnx_ny_2_half = tnx_ny_2_half
        self.tv_half = tv_half
        self.std = std
        self.std2 = std2
        
        # Calculate attributes
            # sample 1
        self.var_sample1 = np.std(data)
        self.std_sample1 = np.std(data)
        self.mean_sample1 = np.mean(data)
        self.n1 = len(data)
            # sample 2
        try:
            if len(self.data2) != None:
                self.var_sample2 = np.std(data2)
                self.std_sample2 = np.std(data2)
                self.mean_sample2 = np.mean(data2)
                self.n2 = len(data2)
        except:
            pass
        
        
    def one_var_known(self):
        """ 
        1: One data, Known POPULATION (original data) variance
        """
        low = self.mean_sample1 - self.z_half * self.std / np.sqrt(self.n1)
        high = self.mean_sample1 + self.z_half * self.std / np.sqrt(self.n1)
        return low, high
    
    def one_var_unknown(self):
        """ 
        2: One data, Unknown POPULATION (original data) variance
        """
        low = self.mean_sample1 - self.tn_1_half * self.std_sample1 / np.sqrt(self.n1)
        high = self.mean_sample1 + self.tn_1_half * self.var_sample1 / np.sqrt(self.n1)
        return low, high
    
    def two_dependent(self):#(data1, data2, tn_1_half, std_sample_d, ):
        """ 
        3:  Two data, no info POPULATION (original data) variance, DEPENDENT
        """
        if len(self.data) == len(self.data2):
            mean = np.mean(self.data - self.data2)
            std_sample_d = np.std(self.data - self.data2)
            low = mean - self.tn_1_half * std_sample_d / np.sqrt(self.n1)
            high = mean + self.tn_1_half * std_sample_d / np.sqrt(self.n1)
            return  low, high

        else:
            print('\n....\nERROR: diffent data size\n...check inputs data')
    

    def two_known_ind(self):
        """ 
        4: Two data, Known POPULATION (original data) variance, Independent from each other
        """
        low = (self.mean_sample1 - self.mean_sample2) - self.z_half * np.sqrt(self.std**2/self.n1 + self.std2**2/self.n2)
        high = (self.mean_sample1 - self.mean_sample2) + self.z_half * np.sqrt(self.std**2/self.n1 + self.std2**2/self.n2)
        return low, high
    
    def variance_p(self):
        s2_p_numerator = (self.n1-1)* self.var_sample1 + (self.n2-1)*self.var_sample2
        s2_p_denominator = self.n1 + self.n2 -2
        return s2_p_numerator / s2_p_denominator
    
    def two_var_assumed_equal(self):#(data1, data2, tnx_ny_2_half):
        """ 
        5: Two data, Unknown their POPULATION (original data) variance but Assumed being EQUAL, Independent each other
        """
        var_p = self.variance_p()
        low = (self.mean_sample1 - self.mean_sample2) - self.tnx_ny_2_half * np.sqrt(var_p**2/self.n1 + var_p**2/self.n2)
        high = (self.mean_sample1 - self.mean_sample2) + self.tnx_ny_2_half * np.sqrt(var_p**2/self.n1 + var_p**2/self.n2)
        return low, high
    
    def two_var_assumed_not_equal(self):
        low = (self.mean_sample1 - self.mean_sample2) - self.tv_half * np.sqrt(self.std_sample1**2/self.n1 + self.std_sample2**2/self.n2)
        high = (self.mean_sample1 - self.mean_sample2) + self.tv_half * np.sqrt(self.std_sample1**2/self.n1 + self.std_sample2**2/self.n2)
        return low, high

## Use case

In [3]:
# Seed for reproducibity
np.random.seed(505)

In [4]:
# Generating data (data and data2)
data  = np.array([ np.mean(np.random.choice(np.linspace(100, 250, 150), 10)) for i in range(200)])
data2 = np.array([ np.mean(np.random.choice(np.linspace(120, 270, 150), 10)) for i in range(20)])

In [5]:
# 6 use case of the class Confidence_interval 

print('One Data with Known population variance ')
print('--> aux = Confidence_interval(data, z_half=2, std=30) \n--> aux.one_var_known()')
aux = Confidence_interval(data, z_half=2, std=30)
print(aux.one_var_known(), '\n\n')


print('One Data with Unknown population variance ')
print('--> aux = Confidence_interval(data, tn_1_half=2, std=30) \n--> aux.one_var_unknown()')
aux = Confidence_interval(data, tn_1_half=2, std=30)
print(aux.one_var_unknown(), '\n\n')


print('Two Data Dependent')
print('--> aux =  Confidence_interval(data, data2=data2, tn_1_half=2) \n--> aux.two_dependent()')
aux =  Confidence_interval(data, data2=data2, tn_1_half=2)
print(aux.two_dependent(), '\n\n')



print('Two Independent Data Known Variance')
print('--> aux =  Confidence_interval(data, data2=data2, z_half=2, std=4, std2=4) \n--> aux.two_known_ind()')
aux =  Confidence_interval(data, data2=data2, z_half=2, std=4, std2=4)
print(aux.two_known_ind(), '\n\n')


print('Two Independent Data with variance assumed to be EQUAL')
print('--> aux =  Confidence_interval(data, data2=data2, z_half=2, std=4, std2=4) \n--> aux.two_known_ind()')
aux =  Confidence_interval(data, data2=data2, tnx_ny_2_half=3)
print(aux.two_var_assumed_equal(), '\n\n')


print('Two Independent Data with variance assumed to be NOT EQUAL')
print('--> aux =  Confidence_interval(data, data2=data2, tv_half=3) \n--> aux.two_var_assumed_not_equal()')
aux =  Confidence_interval(data, data2=data2, tv_half=3)
print(aux.two_var_assumed_not_equal())

One Data with Known population variance 
--> aux = Confidence_interval(data, z_half=2, std=30) 
--> aux.one_var_known()
(170.39443984979349, 178.87972122403207) 


One Data with Unknown population variance 
--> aux = Confidence_interval(data, tn_1_half=2, std=30) 
--> aux.one_var_unknown()
(172.8831951554691, 176.39096591835644) 


Two Data Dependent
--> aux =  Confidence_interval(data, data2=data2, tn_1_half=2) 
--> aux.two_dependent()

....
ERROR: diffent data size
...check inputs data
None 


Two Independent Data Known Variance
--> aux =  Confidence_interval(data, data2=data2, z_half=2, std=4, std2=4) 
--> aux.two_known_ind()
(-26.930360934801847, -23.1780283269431) 


Two Independent Data with variance assumed to be EQUAL
--> aux =  Confidence_interval(data, data2=data2, z_half=2, std=4, std2=4) 
--> aux.two_known_ind()
(-33.91539057346099, -16.192998688283957) 


Two Independent Data with variance assumed to be NOT EQUAL
--> aux =  Confidence_interval(data, data2=data2, tv_half=3)