# independent 2 sample T-test

In [None]:
import pandas as pd
from scipy import stats

# Load the long-format dataset: the group label of each row is read from the
# 'sample' column and the measurement from the 'values' column.
data = pd.read_csv('your_data.csv')

# Select the measurements of each group with a boolean mask.
is_sample1 = data['sample'] == 'Sample 1'
is_sample2 = data['sample'] == 'Sample 2'
sample1 = data.loc[is_sample1, 'values']
sample2 = data.loc[is_sample2, 'values']

# Independent two-sample t-test with SciPy's default settings.
t_statistic, p_value = stats.ttest_ind(sample1, sample2)

# Report the test statistic and the p-value.
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

# Decide at the 0.05 significance level. The comparison is written so that a
# NaN p-value still falls into the "fail to reject" branch.
if not p_value < 0.05:
    print("Fail to reject the null hypothesis: There is no significant difference between the groups.")
else:
    print("Reject the null hypothesis: There is a significant difference between the groups.")


# paired 2 sample T-test

In [None]:
import pandas as pd
from scipy import stats

# Load the wide-format dataset: each row is one pair, with the two
# measurements read from the 'before' and 'after' columns.
data = pd.read_csv('your_data.csv')
before, after = data['before'], data['after']

# Paired (related-samples) t-test on the two columns.
t_statistic, p_value = stats.ttest_rel(before, after)

# Report the test statistic and the p-value.
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

# Pick the conclusion at the 0.05 significance level, then print it.
verdict = (
    "Reject the null hypothesis: There is a significant difference between the paired samples."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: There is no significant difference between the paired samples."
)
print(verdict)


# one-way ANOVA

In [2]:
#example using wine.csv

import pandas as pd
from scipy import stats

# Read the wine dataset into a pandas DataFrame
data = pd.read_csv('wine.csv')

# 'Wine' is used as the group label of each row and 'Alcohol' as the
# measured value (replace both with your own column names).
groups = data['Wine']
values = data['Alcohol']

# Collect the values of every group, keyed by group label. pandas does the
# grouping; sort=False keeps the groups in order of first appearance.
# NOTE: rows whose group label is missing (NaN) are dropped by groupby.
group_dict = {
    group_id: group_values.tolist()
    for group_id, group_values in values.groupby(groups, sort=False)
}

# Perform one-way ANOVA across all groups
f_statistic, p_value = stats.f_oneway(*group_dict.values())

# Print the results
print("F-Statistic:", f_statistic)
print("P-Value:", p_value)

# Check if the p-value is less than the significance level (e.g., 0.05)
if p_value < 0.05:
    print("Reject the null hypothesis: There is a significant difference among the groups.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference among the groups.")


F-Statistic: 135.07762424279912
P-Value: 3.319503795619655e-36
Reject the null hypothesis: There is a significant difference among the groups.


# two way ANOVA

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load the dataset; the formula below refers to its 'score', 'group' and
# 'gender' columns.
data = pd.read_csv('your_data.csv')

# Fit an OLS model with both factors as categorical main effects plus their
# interaction term.
formula = 'score ~ C(group) + C(gender) + C(group):C(gender)'
fitted = ols(formula, data=data)
model = fitted.fit()

# Build the Type II ANOVA table from the fitted model and display it.
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)


# MANOVA (the cell below has no covariate; add covariates to the model to make it a MANCOVA)

In [None]:
# This is a lot like two-way ANOVA, but taken a step further: several dependent variables are analysed at once.

import pandas as pd
import numpy as np
from statsmodels.multivariate.manova import MANOVA

# Read data into a pandas DataFrame
data = pd.read_csv('your_data.csv')

# Independent variable: the teaching method (categorical, e.g. A, B, C)
teaching_method = data['Teaching Method']

# Dependent variables
dependent_vars = data[['Math', 'Science', 'English']]

# Perform MANOVA.
# The categorical predictor must be dummy-coded and the model needs an
# intercept; passing the raw label column as `exog` does neither. The formula
# interface handles both: C(...) marks the column as categorical and Q("...")
# quotes a column name that contains a space.
# To turn this into a MANCOVA, append continuous covariates to the right-hand
# side of the formula, e.g. `+ age`.
manova = MANOVA.from_formula(
    'Math + Science + English ~ C(Q("Teaching Method"))',
    data=data,
)

# Print the multivariate test results (Wilks' lambda, Pillai's trace, ...)
print(manova.mv_test())


# ANCOVA

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

# anova_lm needs a model built through the formula interface.
from statsmodels.formula.api import ols

# Read CSV file into a pandas DataFrame
data = pd.read_csv('your_data.csv')

# ANCOVA model:
#   dependent variable : 'posttest'
#   factor             : 'group' (categorical, wrapped in C(...))
#   covariates         : 'pretest' and 'age' (continuous)
# The earlier version regressed posttest on pretest only, so neither the
# group factor nor the age covariate was part of the model, and it passed
# keyword arguments to the HC3 covariance, which takes none.
model = ols('posttest ~ C(group) + pretest + age', data=data).fit()

# Perform the ANCOVA: Type II tests of each term. robust='hc3' keeps the
# heteroscedasticity-robust (HC3) covariance the original code asked for;
# drop it for the classical ANCOVA table.
anova_table = sm.stats.anova_lm(model, typ=2, robust='hc3')

# Print the ANCOVA table
print(anova_table)


## Uses `Counter` to check a binary target for class imbalance (flagged as imbalanced when the majority/minority ratio is > 2)

In [None]:
import pandas as pd
from collections import Counter

# Load the dataset to inspect.
file_path = 'creditcard.csv'  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# Column that holds the class label.
target_column_name = 'Class'  # Replace with your actual target column name

# Count how many rows carry each class label and show the counts.
class_distribution = Counter(df[target_column_name])
print("Class Distribution:")
print(class_distribution)

# The ratio is only computed when exactly two classes are present.
if len(class_distribution) != 2:
    print("The dataset should have exactly two classes for binary classification.")
else:
    # Rarest and most frequent labels, looked up by their counts.
    minority_class = min(class_distribution, key=class_distribution.get)
    majority_class = max(class_distribution, key=class_distribution.get)
    minority_class_count = class_distribution[minority_class]
    majority_class_count = class_distribution[majority_class]

    # Majority-to-minority ratio; 1.0 means perfectly balanced.
    imbalance_ratio = majority_class_count / minority_class_count
    print(f"Imbalance Ratio: {imbalance_ratio:.2f}")

    # You can adjust the threshold of 2 based on your problem.
    print(
        "The dataset is imbalanced."
        if imbalance_ratio > 2
        else "The dataset is not significantly imbalanced."
    )

# Optionally, you can visualize the class distribution or take further actions to handle imbalance.


## Synthetic minority oversampling technique SMOTE then RFC

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the CSV file into a DataFrame
file_path = 'your_dataset.csv'  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# Name of the column that holds the class label
target_column_name = 'target'  # Replace with your actual target column name

# Separate features (X) and the target (y)
X = df.drop(columns=[target_column_name])
y = df[target_column_name]

# Split BEFORE resampling. Oversampling the full dataset first leaks
# information: synthetic points interpolated from rows that end up in the test
# set would be used for training, and the test set itself would contain
# synthetic rows. stratify=y keeps the original class proportions in both
# splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Apply SMOTE to the training split only; the test split stays untouched so
# the evaluation reflects the real class distribution.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Create and train a Random Forest Classifier on the balanced training data
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_resampled, y_resampled)

# Make predictions on the (original, non-resampled) testing data
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier. On an imbalanced test set, accuracy alone can look
# good while the minority class is missed, so read the per-class
# precision/recall in the report as well.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_rep)


### Anderson-Darling

In [None]:
import pandas as pd
from scipy.stats import anderson

# Read in the CSV file (replace 'ADNI1_3T_ROI.csv' with your own file path)
data = pd.read_csv('ADNI1_3T_ROI.csv')

# The test only makes sense for numeric data, so text columns are left out
# instead of crashing the loop.
numeric_data = data.select_dtypes(include='number')

# Loop through the numeric columns and perform the Anderson-Darling test
for column in numeric_data.columns:
    # Missing values would turn the statistic into NaN, so drop them first.
    sample_data = numeric_data[column].dropna()
    if len(sample_data) < 2:
        print(f'Skipping "{column}": fewer than two non-missing values')
        print()
        continue

    # Tests against the normal distribution by default. Pass dist=... to test
    # another one: 'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l',
    # 'gumbel_r', 'extreme1' or 'weibull_min'.
    result = anderson(sample_data)

    # Extract the test statistic and critical values
    test_statistic = result.statistic
    critical_values = result.critical_values

    # Interpret the results
    print(f'Anderson-Darling Test for "{column}"')
    print(f'Test Statistic: {test_statistic}')
    print('Critical Values:')
    # Take the significance levels from the result rather than hard-coding
    # them: their number and values depend on the chosen distribution.
    for significance_level, critical_value in zip(result.significance_level, critical_values):
        is_significant = test_statistic > critical_value
        print(f'Significance Level {significance_level:g}%: Test {"significant" if is_significant else "not significant"}')
    print()