In [99]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import scipy.integrate as integrate
import scipy.special as special

In [102]:
class Data:
    """Sample statistics for one set of observations.

    Builds the variational (frequency) series and an interval (grouped)
    series of the sample, computes means / variances from both, and
    evaluates a Pearson-style statistic ``sum((n_i - n*p_i)^2 / (n*p_i))``
    where ``p_i`` come from the normal Laplace function (see ``big_F``).

    Attributes:
        dataSeries: the object passed to the constructor, stored as-is.
        n: ``inputData.size``; used everywhere as the sample size.
    """

    def __init__(self, inputData):
        """Store the sample.

        Args:
            inputData: pandas object providing ``.size`` and
                ``.value_counts()`` (e.g. a ``pd.Series`` of observations).
        """
        self.dataSeries = inputData
        self.n = inputData.size

    # ------------------------------------------------------------------ #
    # Private helpers shared by the discrete and the interval statistics. #
    # ------------------------------------------------------------------ #
    def _weighted_mean(self, values, counts):
        """Return ``sum(values[i] * counts[i]) / self.n``."""
        return sum(x * c for x, c in zip(values, counts)) / self.n

    def _weighted_variance(self, values, counts, center):
        """Return ``sum((values[i] - center)^2 * counts[i]) / self.n``."""
        return sum((x - center) * (x - center) * c
                   for x, c in zip(values, counts)) / self.n

    # ------------------------------------------------------------------ #
    # Discrete (ungrouped) series.                                        #
    # ------------------------------------------------------------------ #
    def variational_series(self):
        """Return the frequency table of the sample.

        Returns:
            DataFrame sorted by ``x_i`` with columns ``x_i`` (distinct value),
            ``n_i`` (count) and ``w_i`` (``n_i / self.n``). The index is the
            one produced by ``value_counts`` and is not reset after sorting.
        """
        series = (
            self.dataSeries.value_counts()
            .rename_axis('x_i')
            .reset_index(name='n_i')
            .sort_values(['x_i'])
        )
        series['w_i'] = series['n_i'] / self.n
        return series

    def discrete_sample_mean(self):
        """Return the sample mean computed from the variational series."""
        series = self.variational_series()
        return self._weighted_mean(series['x_i'].tolist(), series['n_i'].tolist())

    def discrete_D(self):
        """Return the (biased, divisor ``n``) variance of the variational series."""
        series = self.variational_series()
        values = series['x_i'].tolist()
        counts = series['n_i'].tolist()
        mean = self._weighted_mean(values, counts)
        return self._weighted_variance(values, counts, mean)

    def discrete_standard_error(self):
        """Return ``sqrt(discrete_D())``."""
        return np.sqrt(self.discrete_D())

    def big_F(self, x):
        """Return the Laplace function ``1/sqrt(2*pi) * integral_0^x exp(-z^2/2) dz``.

        Evaluated in closed form as ``0.5 * erf(x / sqrt(2))``, which is odd in
        ``x`` and stays accurate for large ``|x|`` where adaptive quadrature over
        a long interval can miss the peak near zero.
        """
        return 0.5 * special.erf(x / np.sqrt(2))

    def interval(self, x):
        """Return ``[mean - s*x/sqrt(n), mean + s*x/sqrt(n)]``.

        Args:
            x: multiplier applied to ``discrete_standard_error() / sqrt(n)``
                (the original code was annotated with the value 3.29).

        Returns:
            Two-element list ``[lower, upper]``.
        """
        mean = self.discrete_sample_mean()
        half_width = self.discrete_standard_error() * x / np.sqrt(self.n)
        return [mean - half_width, mean + half_width]

    # ------------------------------------------------------------------ #
    # Interval (grouped) series.                                          #
    # ------------------------------------------------------------------ #
    def interval_series(self):
        """Group the sample into equal-width intervals.

        The number of classes follows Sturges' rule ``k = 1 + 3.322*log10(n)``;
        the width ``h`` is ``(max - min) / k`` rounded to 3 decimals and the
        first interval starts at ``min - h/2``. After the boundaries are built
        the first start is clamped to ``min`` and the last end to ``max``.

        Intervals are half-open ``[start, end)`` except the last one, which is
        closed, so an observation lying exactly on an interior boundary is
        counted once and ``sum(n_i) == n``.

        Returns:
            DataFrame with columns ``start``, ``end``, ``n_i``, ``w_i``
            (``n_i / n``) and ``w_i/h`` (rounded to 3 decimals).

        Raises:
            ValueError: if the rounded width is not positive although the data
                range is, because the boundary loop could never reach ``max``.
        """
        series = self.variational_series()
        xData = series['x_i'].tolist()
        nData = series['n_i'].tolist()
        x_min, x_max = xData[0], xData[-1]

        k = 1 + 3.322 * np.log10(self.n)
        h = round((x_max - x_min) / k, 3)
        if h <= 0 and x_max > x_min:
            raise ValueError(
                'interval width rounds to 0 for this data range; '
                'cannot build the interval series'
            )

        starts, ends = [], []
        value = x_min - h / 2
        while True:
            starts.append(round(value, 4))
            value += h
            ends.append(round(value, 4))
            if value >= x_max:
                break
        starts[0] = x_min
        ends[-1] = x_max

        last = len(starts) - 1
        counts = [
            sum(c for x, c in zip(xData, nData)
                if lo <= x and (x < hi or i == last))
            for i, (lo, hi) in enumerate(zip(starts, ends))
        ]

        df = pd.DataFrame({'start': starts, 'end': ends, 'n_i': counts})
        df['w_i'] = df['n_i'] / self.n
        df['w_i/h'] = (df['w_i'] / h).round(3)
        return df

    def half_series(self):
        """Return interval midpoints ``x_i`` with their counts ``n_i``."""
        intervals = self.interval_series()
        midpoints = (intervals['start'] + intervals['end']) / 2
        return pd.DataFrame({'x_i': midpoints, 'n_i': intervals['n_i']})

    def S(self):
        """Return ``sqrt(sum((mid_i - mean)^2 * n_i) / (n - 1))``.

        ``mid_i`` are the interval midpoints, while ``mean`` is the *discrete*
        sample mean (kept as in the original implementation).
        """
        halves = self.half_series()
        mean = self.discrete_sample_mean()  # computed once, not per element
        weighted = (halves['x_i'] - mean) ** 2 * halves['n_i']
        return np.sqrt(weighted.sum() / (self.n - 1))

    def a(self):
        """Return ``discrete_sample_mean() - sqrt(3) * S()``.

        NOTE(review): presumably the moment estimate of the lower bound of a
        uniform distribution - confirm.
        """
        return self.discrete_sample_mean() - np.sqrt(3) * self.S()

    def b(self):
        """Return ``discrete_sample_mean() + sqrt(3) * S()``.

        NOTE(review): presumably the moment estimate of the upper bound of a
        uniform distribution - confirm.
        """
        return self.discrete_sample_mean() + np.sqrt(3) * self.S()

    def interval_sample_mean(self):
        """Return the sample mean computed from interval midpoints."""
        halves = self.half_series()
        return self._weighted_mean(halves['x_i'].tolist(), halves['n_i'].tolist())

    def interval_D(self):
        """Return the (biased, divisor ``n``) variance of the interval midpoints."""
        halves = self.half_series()
        values = halves['x_i'].tolist()
        counts = halves['n_i'].tolist()
        mean = self._weighted_mean(values, counts)
        return self._weighted_variance(values, counts, mean)

    def interval_standard_error(self):
        """Return ``sqrt(interval_D())``."""
        return np.sqrt(self.interval_D())

    # ------------------------------------------------------------------ #
    # Goodness-of-fit pieces.                                             #
    # ------------------------------------------------------------------ #
    def z_tab(self):
        """Return standardized interval bounds.

        Returns:
            DataFrame with column ``z1_i`` holding the standardized interval
            *end* and ``z2_i`` holding the standardized interval *start*
            (``(bound - interval_sample_mean()) / interval_standard_error()``).
            This column convention is what ``p_list`` relies on.
        """
        intervals = self.interval_series()
        mean = self.interval_sample_mean()
        std = self.interval_standard_error()
        upper_z = (intervals['end'] - mean) / std
        lower_z = (intervals['start'] - mean) / std
        return pd.DataFrame({'z1_i': upper_z, 'z2_i': lower_z})

    def p_list(self):
        """Return the per-interval probabilities as a Series.

        The first interval is extended to minus infinity
        (``big_F(end) + 0.5``), the last one to plus infinity
        (``0.5 - big_F(start)``); interior intervals use
        ``big_F(end) - big_F(start)``. Works for any number of intervals;
        a single interval covers the whole line and gets probability 1.
        """
        z = self.z_tab()
        upper = z['z1_i'].tolist()
        lower = z['z2_i'].tolist()

        if len(upper) == 1:
            return pd.Series([1.0])

        p = [self.big_F(upper[0]) + 0.5]
        p.extend(self.big_F(hi) - self.big_F(lo)
                 for hi, lo in zip(upper[1:-1], lower[1:-1]))
        p.append(0.5 - self.big_F(lower[-1]))
        return pd.Series(p)

    def xi_list(self):
        """Return the terms ``(n_i - n*p_i)^2 / (n*p_i)`` as a Series."""
        observed = self.interval_series()['n_i'].to_numpy()
        expected = self.n * self.p_list().to_numpy()
        return pd.Series((observed - expected) ** 2 / expected)

    def xi2(self):
        """Return the sum of ``xi_list()``."""
        return self.xi_list().sum()

In [103]:
# Load the observations and wrap them in the Data helper.
# NOTE(review): the path is absolute and machine-specific (a Windows user
# directory), so this cell fails on any other machine without editing it.
# NOTE(review): read_csv returns a DataFrame and Data uses `.size` as the
# sample size; that equals the number of observations only if the file has
# a single column - confirm.
inputData = pd.read_csv(r'C:\Users\charl\Downloads\Statistica\CalcAndGraphic\Work 2\data.csv')
FuelWaste = Data(inputData)

In [104]:
# Frequency table: distinct values x_i, counts n_i, relative frequencies w_i = n_i / n.
FuelWaste.variational_series()

Unnamed: 0,x_i,n_i,w_i
13,6500,1,0.033333
15,6530,1,0.033333
2,6540,2,0.066667
21,6620,1,0.033333
20,6650,1,0.033333
19,6660,1,0.033333
18,6670,1,0.033333
17,6690,1,0.033333
4,6700,2,0.066667
16,6710,1,0.033333


In [113]:
# Sample mean from the variational series: sum(x_i * n_i) / n.
FuelWaste.discrete_sample_mean()

6761.666666666667

In [114]:
# Variance from the variational series: sum((x_i - mean)^2 * n_i) / n (divisor n).
FuelWaste.discrete_D()

19760.55555555555

In [115]:
# Square root of discrete_D().
FuelWaste.discrete_standard_error()

140.57224319030962

In [117]:
# Interval mean -/+ 3.29 * s / sqrt(n), returned as [lower, upper].
# NOTE(review): 3.29 is presumably the normal quantile for a two-sided
# 0.999 confidence level - confirm and name the constant.
FuelWaste.interval(3.29)

[6677.229267885304, 6846.10406544803]

In [151]:
# Grouped series: interval bounds, counts n_i, relative frequencies w_i and w_i / h.
FuelWaste.interval_series()

Unnamed: 0,start,end,n_i,w_i,w_i/h
0,6500.0,6542.3225,4,0.133333,0.002
1,6542.3225,6626.9675,1,0.033333,0.0
2,6626.9675,6711.6125,7,0.233333,0.003
3,6711.6125,6796.2575,6,0.2,0.002
4,6796.2575,6880.9025,4,0.133333,0.002
5,6880.9025,6965.5475,5,0.166667,0.002
6,6965.5475,7000.0,3,0.1,0.001


In [152]:
# Interval midpoints (start + end) / 2 paired with the interval counts.
FuelWaste.half_series()

Unnamed: 0,x_i,n_i
0,6521.16125,4
1,6584.645,1
2,6669.29,7
3,6753.935,6
4,6838.58,4
5,6923.225,5
6,6982.77375,3


In [153]:
# discrete_sample_mean() - sqrt(3) * S()
# NOTE(review): presumably the estimated lower bound of a uniform distribution - confirm.
FuelWaste.a()

6507.670865104819

In [154]:
# discrete_sample_mean() + sqrt(3) * S()
# NOTE(review): presumably the estimated upper bound of a uniform distribution - confirm.
FuelWaste.b()

7015.662468228515

In [148]:
# Per-interval probabilities obtained from big_F at the standardized interval bounds.
FuelWaste.p_list()

0    0.065635
1    0.112632
2    0.190596
3    0.230716
4    0.199793
5    0.123766
6    0.076861
dtype: float64

In [143]:
# Per-interval terms (n_i - n * p_i)^2 / (n * p_i).
FuelWaste.xi_list()

0    2.094830
1    1.674907
2    0.287488
3    0.122681
4    0.663228
5    0.446106
6    0.208975
dtype: float64

In [144]:
# Sum of the xi_list() terms.
FuelWaste.xi2()

5.4982145321672045