# **Student’s t-Test**

In [1]:
from math import sqrt
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from scipy.stats import sem
from scipy.stats import t
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')
path_drive = '/content/drive/MyDrive/PIBIC/results/'

Mounted at /content/drive


In [3]:
def independent_ttest(data1, data2, alpha):
	# calculate means
	mean1, mean2 = mean(data1), mean(data2)
	# calculate standard errors
	se1, se2 = sem(data1), sem(data2)
	# standard error on the difference between the samples
	sed = sqrt(se1**2.0 + se2**2.0)
	# calculate the t statistic
	t_stat = (mean1 - mean2) / sed
	# degrees of freedom
	df = len(data1) + len(data2) - 2
	# calculate the critical value
	cv = t.ppf(1.0 - alpha, df)
	# calculate the p-value
	p = (1.0 - t.cdf(abs(t_stat), df)) * 2.0
	# return everything
	return t_stat, df, cv, p

In [4]:
def csv_to_dict(filename, metric):
  data = pd.read_csv(filename+'.csv')

  adamax = data.loc[(data['Optimizer']=='adamax')][metric].values.tolist()
  rmsprop = data.loc[(data['Optimizer']=='rmsprop')][metric].values.tolist()
  adam = data.loc[(data['Optimizer']=='adam')][metric].values.tolist()
                
  return {'adamax':adamax, 'rmsprop':rmsprop, 'adam':adam}

In [15]:
origin_dict = csv_to_dict(path_drive+'general_cv_mean(origin)', 'F1')

def generate_hypothesis(values):
  # seed the random number generator
  seed(1)

  algs = ['adamax', 'rmsprop', 'adam']

  for alg in algs:
    data1 = values[alg]
    data2 = origin_dict[alg]

    print(alg)
    # calculate the t test
    alpha = 0.05
    t_stat, df, cv, p = independent_ttest(data1, data2, alpha)
    print('t=%.3f, df=%d, cv=%.3f, p=%.3f' % (t_stat, df, cv, p))
    # interpret via critical value
    # if abs(t_stat) <= cv:
    #   print('Accept null hypothesis that the means are equal.')
    # else:
    #   print('Reject the null hypothesis that the means are equal.')
    # interpret via p-value
    if p > alpha:
      print('Accept null hypothesis that the means are equal.')
    else:
      print('Reject the null hypothesis that the means are equal.')

### **Tomek Hypothesis**

In [16]:
tomek_dict = csv_to_dict(path_drive+'general_cv_mean(tomek)', 'F1')
generate_hypothesis(tomek_dict)

adamax
t=11.459, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
rmsprop
t=11.054, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
adam
t=10.012, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.


### **ADASYN Hypothesis**

In [7]:
adasyn_dict = csv_to_dict(path_drive+'general_cv_mean(adasyn)', 'F1')
generate_hypothesis(adasyn_dict)

adamax
t=11.829, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
rmsprop
t=8.475, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
adam
t=12.767, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.


### **SMOTE Hypothesis**

In [8]:
smote_dict = csv_to_dict(path_drive+'general_cv_mean(smote)', 'F1')
generate_hypothesis(smote_dict)

adamax
t=9.469, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
rmsprop
t=9.760, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
adam
t=9.990, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.


### **BD-SMOTE Hypothesis**

In [9]:
bdsmote_dict = csv_to_dict(path_drive+'general_cv_mean(bdsmote)', 'F1')
generate_hypothesis(bdsmote_dict)

adamax
t=4.328, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
rmsprop
t=8.349, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
adam
t=5.335, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.


### **SMOTE-TL Hypothesis**

In [10]:
smotetomek_dict = csv_to_dict(path_drive+'general_cv_mean(smotetomek)', 'F1')
generate_hypothesis(smotetomek_dict)

adamax
t=8.132, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
rmsprop
t=6.322, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.
adam
t=8.517, df=18, cv=1.734, p=0.000
Reject the null hypothesis that the means are equal.


## **Referências**

https://machinelearningmastery.com/how-to-code-the-students-t-test-from-scratch-in-python/

https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f