In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import scipy.integrate as integrate
import scipy.special as special

In [110]:
class Data:
    def __init__(self, inputData):
        self.dataSeries = inputData
        self.n = inputData.size
        
        
    def variational_series(self):
        variational_series = self.dataSeries.value_counts().rename_axis('x_i').reset_index(name='n_i')
        variational_series = variational_series.sort_values(['x_i'])
        variational_series['w_i'] = variational_series['n_i'].apply(lambda n_i: n_i/self.n)
        return variational_series
    
    
    def discrete_sample_mean(self):
        xData = self.variational_series()['x_i'].tolist()
        nData = self.variational_series()['n_i'].tolist()

        res = 0
        for i in range(len(xData)):
            res += xData[i] * nData[i]
        res /= self.n

        return res


    def discrete_D(self):
            xData = self.variational_series()['x_i'].tolist()
            nData = self.variational_series()['n_i'].tolist()
            x_s = self.discrete_sample_mean()

            res = 0
            for i in range(len(xData)):
                res += (xData[i] - x_s) * (xData[i] - x_s) * nData[i]
            res /= self.n

            return res
    
    
    def discrete_standard_error(self):
        return np.sqrt(self.discrete_D())
    
    
    def big_F(self, x):
        res = integrate.quad(lambda z: np.power(np.e, -z * z / 2), 0, x)[0]
        res *= 1 / np.sqrt(2 * np.pi)
        return res
    
    
    def interval(self, x):  #1.96
        interval = []
        temp = self.discrete_sample_mean() - self.discrete_standard_error() * x / np.sqrt(self.n)
        interval.append(temp)
        temp = self.discrete_sample_mean() + self.discrete_standard_error() * x / np.sqrt(self.n)
        interval.append(temp)
        
        return interval
    
    
    def interval_series(self):
        xData = self.variational_series()['x_i'].tolist()
        nData = self.variational_series()['n_i'].tolist()
        
        k = 1 + 3.322 * np.log10(self.n)
        h = round((xData[-1] - xData[0]) / k, 3)
        value = xData[0] - h / 2
        
        intervalStart = []
        intervalEnd = []
        n = []
        
        while True:
            intervalStart.append(round(value, 4))
            value += h
            intervalEnd.append(round(value, 4))
            
            if value >= xData[-1]:
                break
        intervalStart[0] = xData[0]
        intervalEnd[-1] = xData[-1]
        
        n_i = 0
        for i in range(len(intervalStart)):
            for j in range(len(xData)):
                if (xData[j] >= intervalStart[i]) and (xData[j] <= intervalEnd[i]):
                    n_i += nData[j]
            n.append(n_i)
            n_i = 0
        
        df = pd.DataFrame(zip(intervalStart, intervalEnd, n), columns=['start', 'end', 'n_i'])
        df['w_i'] = df['n_i'].apply(lambda n_i: n_i/self.n)
        df['w_i/h'] = df['w_i'].apply(lambda n_i: round(n_i/h, 3))
            
        return df
    
    
    def half_series(self):
        xStartData = self.interval_series()['start'].tolist()
        xEndData = self.interval_series()['end'].tolist()
        nData = self.interval_series()['n_i'].tolist()
        x = []
        for i in range(len(xStartData)):
            x.append((xStartData[i] + xEndData[i]) / 2)
        df = pd.DataFrame(zip(x, nData), columns=['x_i', 'n_i'])
        return df
    
    
    
    def S(self):
        S2 = self.half_series()['x_i'].apply(lambda x: (x - self.discrete_sample_mean())**2)
        S2 = S2 * self.half_series()['n_i']
        S2 = S2.sum() * 1 / (self.n - 1)
        S = np.sqrt(S2)
        
        return S
                                             
    def a(self):
        return self.discrete_sample_mean() - np.sqrt(3) * self.S()
    
    def b(self):
        return self.discrete_sample_mean() + np.sqrt(3) * self.S()
    
    
    def p_list(self):
        n = self.interval_series()['n_i'].tolist()
        xStartData = self.interval_series()['start'].tolist()
        xEndData = self.interval_series()['end'].tolist()
        
        p = []
        p.append(0.082344)
        for i in range(1, len(n) - 1):
            p.append((xEndData[i] - xStartData[i])/(self.b() - self.a()))
        p.append(0.073066)
        
        return pd.Series(p)
    
    
    def xi_list(self):
        p = self.p_list().tolist()
        n = self.interval_series()['n_i'].tolist()
        xi = []
        for i in range(len(n)):
            xi.append((n[i] - self.n * p[i])**2 / (self.n * p[i]))

        return pd.Series(xi)
    
    def xi2(self):
        return sum(self.xi_list())
    

In [111]:
inputData = pd.read_csv(r'D:/kpi/Statistica/CalcAndGraphic/Work 2/data.csv')
Data1 = Data(inputData)

In [100]:
Data1.variational_series()

Unnamed: 0,x_i,n_i,w_i
6,0.56,2,0.066667
7,0.57,2,0.066667
8,0.58,2,0.066667
2,0.59,3,0.1
0,0.6,4,0.133333
1,0.61,4,0.133333
3,0.62,3,0.1
9,0.63,2,0.066667
4,0.64,3,0.1
10,0.65,2,0.066667


In [101]:
Data1.discrete_sample_mean()

0.6116666666666666

In [102]:
Data1.discrete_D()

0.0008672222222222229

In [103]:
Data1.discrete_standard_error()

0.02944863701807306

In [104]:
Data1.interval(1.96)

[0.6011286072487141, 0.6222047260846191]

In [105]:
Data1.interval_series()

Unnamed: 0,start,end,n_i,w_i,w_i/h
0,0.56,0.5685,2,0.066667,3.922
1,0.5685,0.5855,4,0.133333,7.843
2,0.5855,0.6025,7,0.233333,13.725
3,0.6025,0.6195,4,0.133333,7.843
4,0.6195,0.6365,5,0.166667,9.804
5,0.6365,0.6535,5,0.166667,9.804
6,0.6535,0.66,3,0.1,5.882


In [106]:
Data1.half_series()

Unnamed: 0,x_i,n_i
0,0.56425,2
1,0.577,4
2,0.594,7
3,0.611,4
4,0.628,5
5,0.645,5
6,0.65675,3


In [107]:
Data1.a()

0.5613463350316101

In [108]:
Data1.b()

0.661986998301723

In [109]:
Data1.p_list()

0    0.082344
1    0.168918
2    0.168918
3    0.168918
4    0.168918
5    0.168918
6    0.073066
dtype: float64

In [112]:
Data1.xi_list()

0    0.089543
1    0.224888
2    0.736931
3    0.224888
4    0.000900
5    0.000900
6    0.297857
dtype: float64

In [113]:
Data1.xi2()

1.5759081843008405