<a href="https://colab.research.google.com/github/Kiron-Ang/DSC/blob/main/basic_statistical_tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Check the current version of Python installed on the system
!python -V

# Import the pandas library for data manipulation and analysis
import pandas as pd

# Address of the schedule-of-classes CSV hosted on GitHub
# (the original notebook describes this URL as pointing to the raw CSV file)
SCHEDULE_CSV_URL = "https://github.com/Kiron-Ang/DSC/blob/main/schedule_of_classes.csv?raw=true"

# Fetch the CSV and parse it into a pandas DataFrame
df = pd.read_csv(SCHEDULE_CSV_URL)

# Leave the frame as the cell's last expression so the notebook renders it
df

Python 3.10.12


Unnamed: 0,prefix,course,section,max_enroll,seats_avail,waitlist
0,ACC,2301,01,50,0,6.0
1,ACC,2301,02,50,0,2.0
2,ACC,2301,03,50,0,1.0
3,ACC,2301,W1,40,1,5.0
4,ACC,2303,01,200,46,0.0
...,...,...,...,...,...,...
4609,THEA,4V9R,01,19,15,0.0
4610,THEA,4VC5,01,15,14,0.0
4611,THEA,4VC5,02,15,14,0.0
4612,UNSC,3001,U1,65,61,0.0


In [7]:
import pandas as pd
import numpy as np
from scipy import stats

# Steps 1-2: build the filtered frame as a single chain.
# Each .loc stage receives the output of the stage before it, so the
# filters are applied in the same order as a series of reassignments.
df_filtered = (
    df
    # drop rows whose prefix contains 'HON'
    .loc[lambda frame: ~frame['prefix'].str.contains('HON')]
    # drop rows whose course code contains any letter
    .loc[lambda frame: ~frame['course'].str.contains('[A-Za-z]', regex=True)]
    # keep rows whose max_enroll is present and non-zero
    .loc[lambda frame: frame['max_enroll'].notna() & (frame['max_enroll'] != 0)]
    # finally discard rows with an NA in any column
    .dropna()
)

# Show a heading followed by the first rows of the filtered frame for inspection
print("Filtered DataFrame (First few rows):", df_filtered.head(), sep="\n")

# Step 3: Define function for testing each column
# Smallest group size accepted before running the Shapiro-Wilk normality test.
_MIN_SHAPIRO_SAMPLE = 3


def _require_min_sample(values, label, column):
    """Raise ValueError when a group has too few rows for the Shapiro-Wilk test."""
    if len(values) < _MIN_SHAPIRO_SAMPLE:
        raise ValueError(
            f"Group '{label}' has {len(values)} observation(s) in column "
            f"'{column}'; at least {_MIN_SHAPIRO_SAMPLE} are required for the "
            f"Shapiro-Wilk normality test."
        )


def _run_two_sample_test(sample_a, sample_b, normality_a, normality_b, alpha):
    """Run Mann-Whitney U if either normality p-value is below alpha, else a t-test.

    Returns a (test_name, statistic, p_value) tuple.
    """
    if normality_a.pvalue < alpha or normality_b.pvalue < alpha:
        # At least one group rejects normality: use the non-parametric test
        outcome = stats.mannwhitneyu(sample_a, sample_b)
        test_name = "Mann-Whitney U test"
    else:
        # Neither group rejects normality: use the independent t-test
        outcome = stats.ttest_ind(sample_a, sample_b)
        test_name = "T-test"
    return test_name, outcome.statistic, outcome.pvalue


def perform_stat_tests(df_filtered, column, target='DSC', reference='CSI', alpha=0.05):
    """Compare one numeric column between course prefixes and report the results.

    Two comparisons are run and printed:
      1. rows with prefix ``target`` vs rows with prefix ``reference``;
      2. rows with prefix ``target`` vs every row with a different prefix.

    For each comparison a Shapiro-Wilk test is run on both groups. If either
    p-value is below ``alpha`` the groups are compared with the Mann-Whitney U
    test; otherwise an independent t-test is used.

    Parameters
    ----------
    df_filtered : pandas.DataFrame
        Frame with a 'prefix' column and the column named by ``column``.
    column : str
        Name of the column whose values are compared.
    target : str, default 'DSC'
        Prefix of the group of interest.
    reference : str, default 'CSI'
        Prefix of the group used in the first comparison.
    alpha : float, default 0.05
        Threshold applied to the Shapiro-Wilk p-values.

    Returns
    -------
    dict
        Keyed by comparison label ("<target> vs <reference>" and
        "<target> vs All Other Rows"). Each value is a dict with keys
        'test', 'statistic', 'p_value' and 'normality_p_values' (a tuple of
        the two Shapiro-Wilk p-values, target group first).

    Raises
    ------
    ValueError
        If any of the three groups has fewer than three rows.
    """
    # Pull the column of interest out for each group
    target_values = df_filtered.loc[df_filtered['prefix'] == target, column]
    reference_values = df_filtered.loc[df_filtered['prefix'] == reference, column]
    other_values = df_filtered.loc[df_filtered['prefix'] != target, column]

    # Fail early with a message naming the group. Without this guard a tiny
    # group either triggers an opaque SciPy error or (NOTE(review): believed to
    # be the behaviour of newer SciPy releases - confirm) yields a NaN p-value,
    # and `nan < alpha` is False, which would silently select the t-test.
    for label, values in (
        (target, target_values),
        (reference, reference_values),
        ("All Other Rows", other_values),
    ):
        _require_min_sample(values, label, column)

    print(f"\nPerforming statistical tests for column: {column}")

    # Shapiro-Wilk test for normality for the target and reference groups
    normal_target = stats.shapiro(target_values)
    normal_reference = stats.shapiro(reference_values)

    print(f"Shapiro-Wilk test results: ")
    print(f"  {target} normality p-value: {normal_target.pvalue}")
    print(f"  {reference} normality p-value: {normal_reference.pvalue}")

    results = {}

    pair_label = f"{target} vs {reference}"
    test_name, statistic, p_value = _run_two_sample_test(
        target_values, reference_values, normal_target, normal_reference, alpha
    )
    print(f"Test for {pair_label}: {test_name} statistic: {statistic}, p-value: {p_value}")
    results[pair_label] = {
        'test': test_name,
        'statistic': float(statistic),
        'p_value': float(p_value),
        'normality_p_values': (float(normal_target.pvalue), float(normal_reference.pvalue)),
    }

    # Step 4: target group vs all other rows
    normal_other = stats.shapiro(other_values)

    rest_label = f"{target} vs All Other Rows"
    print(f"Shapiro-Wilk test results for {rest_label}:")
    print(f"  {target} normality p-value: {normal_target.pvalue}")
    print(f"  Other groups normality p-value: {normal_other.pvalue}")

    test_name, statistic, p_value = _run_two_sample_test(
        target_values, other_values, normal_target, normal_other, alpha
    )
    print(f"Test for {rest_label}: {test_name} statistic: {statistic}, p-value: {p_value}")
    results[rest_label] = {
        'test': test_name,
        'statistic': float(statistic),
        'p_value': float(p_value),
        'normality_p_values': (float(normal_target.pvalue), float(normal_other.pvalue)),
    }

    return results

# Step 5: Run the comparison once for each enrollment-related column
columns_to_test = ('max_enroll', 'seats_avail', 'waitlist')
for column in columns_to_test:
    perform_stat_tests(df_filtered, column)


Filtered DataFrame (First few rows):
  prefix course section  max_enroll  seats_avail  waitlist
0    ACC   2301      01          50            0       6.0
1    ACC   2301      02          50            0       2.0
2    ACC   2301      03          50            0       1.0
3    ACC   2301      W1          40            1       5.0
4    ACC   2303      01         200           46       0.0

Performing statistical tests for column: max_enroll
Shapiro-Wilk test results: 
  DSC normality p-value: 0.12046825535522193
  CSI normality p-value: 0.00012949859674982889
Test for DSC vs CSI: Mann-Whitney U test statistic: 179.0, p-value: 0.15851067952634035
Shapiro-Wilk test results for DSC vs All Other Rows:
  DSC normality p-value: 0.12046825535522193
  Other groups normality p-value: 9.274481339271174e-71
Test for DSC vs All Other Rows: Mann-Whitney U test statistic: 15723.5, p-value: 0.18513482443640317

Performing statistical tests for column: seats_avail
Shapiro-Wilk test results: 
  DSC norm