# **Student’s t-Test**

In [5]:
from math import sqrt
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from scipy.stats import sem
from scipy.stats import t
import pandas as pd

In [6]:
def independent_ttest(data1, data2, alpha):
	# calculate means
	mean1, mean2 = mean(data1), mean(data2)
	# calculate standard errors
	se1, se2 = sem(data1), sem(data2)
	# standard error on the difference between the samples
	sed = sqrt(se1**2.0 + se2**2.0)
	# calculate the t statistic
	t_stat = (mean1 - mean2) / sed
	# degrees of freedom
	df = len(data1) + len(data2) - 2
	# calculate the critical value
	cv = t.ppf(1.0 - alpha, df)
	# calculate the p-value
	p = (1.0 - t.cdf(abs(t_stat), df)) * 2.0
	# return everything
	return t_stat, df, cv, p

In [16]:
def csv_to_dict(filename):
  data = pd.read_csv(filename+'.csv')

  et = data.loc[(data['Algoritmo']=='ET')]['F1'].values.tolist()
  lr = data.loc[(data['Algoritmo']=='LR')]['F1'].values.tolist()
  mlp = data.loc[(data['Algoritmo']=='MLP')]['F1'].values.tolist()
  mnb = data.loc[(data['Algoritmo']=='MNB')]['F1'].values.tolist()
  pa = data.loc[(data['Algoritmo']=='PA')]['F1'].values.tolist()
  sgd = data.loc[(data['Algoritmo']=='SGD')]['F1'].values.tolist()
  svm = data.loc[(data['Algoritmo']=='SVM')]['F1'].values.tolist()
                
  return {'ET':et, 'LR':lr, 'MLP':mlp, 'MNB':mnb, 'PA':pa, 'SGD':sgd, 'SVM':svm}

In [25]:
origin_dict = csv_to_dict('general_cv_mean(origin)')

def generate_hypothesis(values):
  # seed the random number generator
  seed(1)

  algs = ['ET', 'LR', 'MLP', 'MNB', 'PA', 'SGD', 'SVM']

  for alg in algs:
    data1 = values[alg]
    data2 = origin_dict[alg]

    print(alg)
    # calculate the t test
    alpha = 0.05
    t_stat, df, cv, p = independent_ttest(data1, data2, alpha)
    print('t=%.3f, df=%d, cv=%.3f, p=%.3f' % (t_stat, df, cv, p))
    # interpret via critical value
    if abs(t_stat) <= cv:
      print('Accept null hypothesis that the means are equal.')
    else:
      print('Reject the null hypothesis that the means are equal.')
    # interpret via p-value
    if p > alpha:
      print('Accept null hypothesis that the means are equal.')
    else:
      print('Reject the null hypothesis that the means are equal.')

### **Tomek Hypothesis**

In [32]:
tomek_dict = csv_to_dict('general_cv_mean(tomek)')
generate_hypothesis(tomek_dict)

ET
t=-1.572, df=12, cv=1.782, p=0.142
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
LR
t=-2.855, df=12, cv=1.782, p=0.014
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.
MLP
t=-2.070, df=12, cv=1.782, p=0.061
Reject the null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
MNB
t=-0.784, df=12, cv=1.782, p=0.448
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
PA
t=-2.225, df=12, cv=1.782, p=0.046
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.
SGD
t=-1.856, df=12, cv=1.782, p=0.088
Reject the null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
SVM
t=-1.402, df=12, cv=1.782, p=0.186
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.


### **ADASYN Hypothesis**

In [28]:
adasyn_dict = csv_to_dict('general_cv_mean(adasyn)')
generate_hypothesis(adasyn_dict)

ET
t=6.759, df=12, cv=1.782, p=0.000
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.
LR
t=0.253, df=12, cv=1.782, p=0.804
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
MLP
t=1.414, df=12, cv=1.782, p=0.183
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
MNB
t=-1.373, df=12, cv=1.782, p=0.195
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
PA
t=0.425, df=12, cv=1.782, p=0.678
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
SGD
t=1.905, df=12, cv=1.782, p=0.081
Reject the null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
SVM
t=-2.473, df=12, cv=1.782, p=0.029
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.


### **SMOTE Hypothesis**

In [29]:
smote_dict = csv_to_dict('general_cv_mean(smote)')
generate_hypothesis(smote_dict)

ET
t=4.146, df=12, cv=1.782, p=0.001
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.
LR
t=0.015, df=12, cv=1.782, p=0.989
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
MLP
t=1.262, df=12, cv=1.782, p=0.231
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
MNB
t=-1.433, df=12, cv=1.782, p=0.177
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
PA
t=0.196, df=12, cv=1.782, p=0.848
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
SGD
t=1.348, df=12, cv=1.782, p=0.203
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
SVM
t=-3.005, df=12, cv=1.782, p=0.011
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.


### **BD-SMOTE Hypothesis**

In [30]:
bdsmote_dict = csv_to_dict('general_cv_mean(bdsmote)')
generate_hypothesis(bdsmote_dict)

ET
t=4.522, df=12, cv=1.782, p=0.001
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.
LR
t=0.253, df=12, cv=1.782, p=0.805
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
MLP
t=3.320, df=12, cv=1.782, p=0.006
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.
MNB
t=-1.370, df=12, cv=1.782, p=0.196
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
PA
t=0.983, df=12, cv=1.782, p=0.345
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
SGD
t=1.294, df=12, cv=1.782, p=0.220
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
SVM
t=-2.057, df=12, cv=1.782, p=0.062
Reject the null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.


### **SMOTE-TL Hypothesis**

In [31]:
smotetomek_dict = csv_to_dict('general_cv_mean(smotetomek)')
generate_hypothesis(smotetomek_dict)

ET
t=4.063, df=12, cv=1.782, p=0.002
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.
LR
t=-0.229, df=12, cv=1.782, p=0.823
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
MLP
t=3.344, df=12, cv=1.782, p=0.006
Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.
MNB
t=-1.426, df=12, cv=1.782, p=0.179
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
PA
t=0.804, df=12, cv=1.782, p=0.437
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
SGD
t=2.027, df=12, cv=1.782, p=0.066
Reject the null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.
SVM
t=-1.422, df=12, cv=1.782, p=0.180
Accept null hypothesis that the means are equal.
Accept null hypothesis that the means are equal.


## **Referências**

https://machinelearningmastery.com/how-to-code-the-students-t-test-from-scratch-in-python/

https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f