In [1]:
# Import needed library
import numpy as np
import pandas as pd
import scipy.stats as st

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Function to retrieve critical value from z-table
def get_z_table(alpha, tail):
  if tail == 1:
    return st.norm.ppf(1 - alpha)
  elif tail == 2:
    return st.norm.ppf(1 - (alpha / 2))

# Function to retrieve critical value from t-table
def get_t_table(alpha, tail, dof):
  if tail == 2:
    alpha /= 2  
  return st.t.isf([alpha], [dof])[0]

In [8]:
df = pd.read_csv("https://raw.githubusercontent.com/LouisFernando1204/InformaticsStudentsDataset/main/informaticsstudents_dataset")
# df.head()

# TESTING
df_Male = df.loc[df['Gender'] == 'Male']
df_NumberofCompetitions_Male = df_Male['Number of Competitions']
df_Female = df.loc[df['Gender'] == 'Female']
df_NumberofCompetitions_Female = df_Female['Number of Competitions']
alpha_levelSignificance = 0.05
tailType = 2
Xbar_Male = np.mean(df_NumberofCompetitions_Male)
Xbar_Female = np.mean(df_NumberofCompetitions_Female)

print(Xbar_Male, Xbar_Female)


1.9710144927536233 2.084507042253521


Among the vibrant community of Informatics students at ITS, there exists a notable cohort consistently 
achieving a GPA above 3.5. This observation suggests that their academic acumen surpasses the conventional 
threshold, reflecting a culture of excellence within the department.

In [9]:
# Step 1: State null and alternate hypothesis
# H0: Miu <= 3.5
# H1: Miu > 3.5


# Step 2: Select level of significance
alpha_levelSignificance= 0.05
tailType = 1
df_GPA = df['GPA']
Xbar_sampleMean = df_GPA.mean()
Miu_populationMean = 3.5

std_dev_sample = np.std(df_GPA, ddof = 1)
# The standard is directly divided by N, but because we want to calculate the std for the sample, 
# we have to divide by n-1

n_sampleSize = df_GPA.count()
degree_of_freedom = n_sampleSize - 1


# Step 3: Identify the test statistics
# Using the T-Test because the standard deviation for the population is unknown


# Step 4: Formulate a decision rule by first determining the critical values of T.
# Reject H0 if t_computed > t_tableResult(alpha, n-1)
# If the t_computed result is more than t_tableResult then reject H0 (H1 / Alternative Hypothesis is proven), 
# but if t_computed is less than t_tableResult then don't reject H0 (H1 / Alternative Hypothesis is not proven)


# Step 5: Take a sample, arrive at decision
t_computed = (Xbar_sampleMean-Miu_populationMean) / (std_dev_sample / np.sqrt(n_sampleSize))
t_tableResult = get_t_table(alpha_levelSignificance, tailType, degree_of_freedom)
print("t_computed = {:.2f}".format(t_computed))
print("t_tableResult = {:.2f}".format(t_tableResult))


# Step 6: Interpret the result
if t_computed > t_tableResult:
  print("H0 rejected. The average GPA of all Informatics Students at ITS is more than 3.5 GPA.")
else:
  print("Fail to Reject H0. The average GPA of all Informatics Students at ITS is not more than 3.5 GPA.")

t_computed = -1.20
t_tableResult = 1.66
Fail to Reject H0. The average GPA of all Informatics Students at ITS is not more than 3.5 GPA.


Amidst the academic endeavors at Institut Teknologi Sepuluh Nopember (ITS), an unspoken consensus emerges regarding the duration of independent study among Informatics students. It hints that the average hours dedicated to independent study surpasses the conventional expectation of three hours.

In [10]:
# Step 1: State null and alternate hypothesis
# H0: Miu <= 3
# H1: Miu > 3


# Step 2: Select level of significance
alpha_levelSignificance= 0.05
tailType = 1
df_HoursofIndependentStudy = df['Hours of Independent Study (per day)']
Xbar_sampleMean = df_HoursofIndependentStudy.mean()
Miu_populationMean = 3

std_dev_sample = np.std(df_HoursofIndependentStudy, ddof = 1)
# The standard is directly divided by N, but because we want to calculate the std for the sample, 
# we have to divide by n-1

n_sampleSize = df_HoursofIndependentStudy.count()
degree_of_freedom = n_sampleSize - 1


# Step 3: Identify the test statistics
# Using the T-Test because the standard deviation for the population is unknown


# Step 4: Formulate a decision rule by first determining the critical values of T.
# Reject H0 if t_computed > t_tableResult(alpha, n-1)
# If the t_computed result is more than t_tableResult then reject H0 (H1 / Alternative Hypothesis is proven), 
# but if t_computed is less than t_tableResult then don't reject H0 (H1 / Alternative Hypothesis is not proven)


# Step 5: Take a sample, arrive at decision
t_computed = (Xbar_sampleMean-Miu_populationMean) / (std_dev_sample / np.sqrt(n_sampleSize))
t_tableResult = get_t_table(alpha_levelSignificance, tailType, degree_of_freedom)
print("t_computed = {:.2f}".format(t_computed))
print("t_tableResult = {:.2f}".format(t_tableResult))


# Step 6: Interpret the result
if t_computed > t_tableResult:
  print("H0 rejected. The average Hours of Independent Study (per day) of all Informatics Students is more than 3 hours.")
else:
  print("Fail to Reject H0. The average Hours of Independent Study (per day) of all Informatics Students is not more than 3 hours.")

t_computed = 3.50
t_tableResult = 1.66
H0 rejected. The average Hours of Independent Study (per day) of all Informatics Students is more than 3 hours.


Within the academic fabric of Institut Teknologi Sepuluh Nopember (ITS), the cohort from the class of 2019 stands as a testament to excellence in Informatics. Their collective academic performance indicates that the average grade of the class of 2019 surpasses that of the class of 2022.

In [11]:
# Step 1: State the null and alternate hypothesis
# H0: Miu_2019 <= Miu_2022
# H1: Miu_2019 > Miu_2022


# Step 2: Select the level of significance
df_2019 = df.loc[df['Class Year'] == 2019]
df_AverageGrade_2019 = df_2019['Average Grade in Informatics']
df_2022 = df.loc[df['Class Year'] == 2022]
df_AverageGrade_2022 = df_2022['Average Grade in Informatics']
alpha_levelSignificance = 0.05
tailType = 1
Xbar_2019 = np.mean(df_AverageGrade_2019)
Xbar_2022 = np.mean(df_AverageGrade_2022)

# For sample standard deviation in numpy we need to supply delta degree of freedom (ddof) with 1
std_dev_sample_2019 = np.std(df_AverageGrade_2019, ddof=1)
std_dev_sample_2022 = np.std(df_AverageGrade_2022, ddof=1)
# The standard is directly divided by N, but because we want to calculate the std for the sample, 
# we have to divide by n-1

n_2019 = df_AverageGrade_2019.count()
n_2022 = df_AverageGrade_2022.count()
degreeOfFreedom = n_2019 + n_2022 - 2

std_dev_pooled_sample = ((n_2019 - 1) * np.power(std_dev_sample_2019, 2) + 
                        (n_2022 - 1) * np.power(std_dev_sample_2022, 2)) / degreeOfFreedom
print("Standard deviation of the pooled sample = {:.4f}".format(std_dev_pooled_sample))


# Step 3: Determine the test statistics 
# Independent Samples
# Because the population standard deviations are not known but are assumed to be equal, 
# we use the pooled t-test


# Step 4: Formulate the decision rule
# Reject H0 if t_computed > t_tableResult(alpha, n-1)
# If the t_computed result is more than t_tableResult then reject H0 (H1 / Alternative Hypothesis is proven), 
# but if t_computed is less than t_tableResult then don't reject H0 (H1 / Alternative Hypothesis is not proven)


# Step 5: Make decision regarding H0
t_computed = (Xbar_2019 - Xbar_2022) / np.sqrt(std_dev_pooled_sample * (1/n_2019 + 1/n_2022))
t_tableResult = get_t_table(alpha_levelSignificance, tailType, degreeOfFreedom)
print("t_computed = {:.3f}".format(t_computed))
print("t_tableResult = {:.3f}".format(t_tableResult))


# Step 6: Interpret the result
if t_computed > t_tableResult:
  print("H0 rejected. The average of Average Grade of class year 2019 is more than class year of 2022.")
else:
  print("Fail to Reject H0. The average of Average Grade of class year 2019 is not more than class year of 2022.")

Standard deviation of the pooled sample = 16.0916
t_computed = -0.983
t_tableResult = 1.668
Fail to Reject H0. The average of Average Grade of class year 2019 is not more than class year of 2022.


In the diverse landscape of Informatics at Institut Teknologi Sepuluh Nopember (ITS), male and female students exhibit varied engagement in competitions. This suggests that the average number of competitions participated in by male students differs significantly from that of female students.

In [14]:
# Step 1: State the null and alternate hypothesis
# H0: Miu_Male = Miu_Female
# H1: Miu_Male != Miu_Female


# Step 2: Select the level of significance
df_Male = df.loc[df['Gender'] == 'Male']
df_NumberofCompetitions_Male = df_Male['Number of Competitions']
df_Female = df.loc[df['Gender'] == 'Female']
df_NumberofCompetitions_Female = df_Female['Number of Competitions']
alpha_levelSignificance = 0.05
tailType = 2
Xbar_Male = np.mean(df_NumberofCompetitions_Male)
Xbar_Female = np.mean(df_NumberofCompetitions_Female)

# For sample standard deviation in numpy we need to supply delta degree of freedom (ddof) with 1
std_dev_sample_Male = np.std(df_NumberofCompetitions_Male, ddof=1)
std_dev_sample_Female = np.std(df_NumberofCompetitions_Female, ddof=1)
# The standard is directly divided by N, but because we want to calculate the std for the sample, 
# we have to divide by n-1

n_Male = df_NumberofCompetitions_Male.count()
n_Female = df_NumberofCompetitions_Female.count()
degreeOfFreedom = n_Male + n_Female - 2

std_dev_pooled_sample = ((n_Male - 1) * np.power(std_dev_sample_Male, 2) + 
                        (n_Female - 1) * np.power(std_dev_sample_Female, 2)) / degreeOfFreedom
print("Standard deviation of the pooled sample = {:.4f}".format(std_dev_pooled_sample))


# Step 3: Determine the test statistics 
# Independent Samples
# Because the population standard deviations are not known but are assumed to be equal, 
# we use the pooled t-test


# Step 4: Formulate the decision rule
# Reject H0 if t_computed > t_tableResult atau t_computed < -t_tableResult (H1 / Alternative Hypothesis is proven)
# Alternatively we could use the absolute value of |t_computed| > t_tableResult (alpha/2, n-1)


# Step 5: Make decision regarding H0
t_computed = (Xbar_Male - Xbar_Female) / np.sqrt(std_dev_pooled_sample * (1/n_Male + 1/n_Female))
t_tableResult = get_t_table(alpha_levelSignificance, tailType, degreeOfFreedom)
print("t_computed = {:.3f}".format(t_computed))
print("t_tableResult = {:.3f}".format(t_tableResult))


# Step 6: Interpret the result
if (t_computed > t_tableResult) | (t_computed < -t_tableResult):
  print("H0 rejected. There is a difference average Number of Competitions between male and female students.")
else:
  print("Fail to Reject H0. There is no difference average Number of Competitions between male and female students.")

Standard deviation of the pooled sample = 0.5466
t_computed = -0.908
t_tableResult = 1.977
Fail to Reject H0. There is no difference average Number of Competitions between male and female students.
