In [4]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder

## Loading the dataset

In [5]:
# Read the drug-safety records from the CSV file into a DataFrame.
df = pd.read_csv("data/drug_safety.csv")

In [6]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,age,sex,trx,week,wbc,rbc,adverse_effects,num_effects
0,62,male,Drug,0,7.3,5.1,No,0
1,62,male,Drug,1,,,No,0
2,62,male,Drug,12,5.6,5.0,No,0
3,62,male,Drug,16,,,No,0
4,62,male,Drug,2,6.6,5.1,No,0


## Remove NaN values

In [7]:
# Count the missing values in each column.
df.isnull().sum()

age                   0
sex                   0
trx                   0
week                  0
wbc                6975
rbc                6976
adverse_effects       0
num_effects           0
dtype: int64

In [8]:
# Drop every row that has a missing value (only wbc and rbc contain NaN in
# this dataset). Rebinding df instead of using inplace=True keeps the step
# chainable and avoids mutating the frame object behind the reader's back.
df = df.dropna()

In [9]:
# Number of rows left in df after the rows with missing values were dropped.
df.shape[0]

9127

In [10]:
# Re-count missing values per column to confirm that none remain.
df.isnull().sum()

age                0
sex                0
trx                0
week               0
wbc                0
rbc                0
adverse_effects    0
num_effects        0
dtype: int64

## Basic statistics for numeric columns

In [11]:
# Summary statistics for the numeric columns (default quartiles spelled out).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,week,wbc,rbc,num_effects
count,9127.0,9127.0,9127.0,9127.0,9127.0
mean,64.00767,4.985428,7.340331,4.672784,0.1008
std,8.847711,4.375397,1.996645,0.45852,0.322178
min,39.0,0.0,1.8,2.1,0.0
25%,58.0,2.0,6.0,4.4,0.0
50%,65.0,4.0,7.1,4.7,0.0
75%,71.0,8.0,8.4,5.0,0.0
max,84.0,20.0,26.5,7.6,3.0


## Group the dataset by trx and summarize key statistics

In [12]:
# White-blood-cell summary per treatment arm, rounded to two decimals.
df.groupby("trx")["wbc"].describe().round(decimals=2)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
trx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Drug,6011.0,7.33,2.01,1.8,6.0,7.0,8.4,26.5
Placebo,3116.0,7.36,1.97,3.0,6.0,7.15,8.4,23.8


In [13]:
# Red-blood-cell summary per treatment arm, rounded to two decimals.
df.groupby("trx")["rbc"].describe().round(decimals=2)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
trx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Drug,6011.0,4.68,0.45,2.7,4.4,4.7,5.0,7.5
Placebo,3116.0,4.66,0.47,2.1,4.4,4.7,5.0,7.6


In [14]:
# Summary of the number of reported effects per treatment arm, rounded to two decimals.
df.groupby("trx")["num_effects"].describe().round(decimals=2)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
trx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Drug,6011.0,0.1,0.33,0.0,0.0,0.0,0.0,3.0
Placebo,3116.0,0.1,0.32,0.0,0.0,0.0,0.0,3.0


## Convert adverse_effects column to numeric column

In [15]:
label_encoder = LabelEncoder()
# LabelEncoder assigns integer codes in sorted class order, so for a
# Yes/No column the mapping is:
#   No  -> 0
#   Yes -> 1
# NOTE(review): assumes the column holds only 'No' and 'Yes' — confirm with
# df['adverse_effects'].unique() before relying on the 0/1 interpretation.
# The string column is overwritten with the integer codes.
df['adverse_effects'] = label_encoder.fit_transform(df['adverse_effects'])

In [16]:
# Preview the first five rows to check the encoded adverse_effects column.
df.head(n=5)

Unnamed: 0,age,sex,trx,week,wbc,rbc,adverse_effects,num_effects
0,62,male,Drug,0,7.3,5.1,0,0
2,62,male,Drug,12,5.6,5.0,0,0
4,62,male,Drug,2,6.6,5.1,0,0
6,62,male,Drug,4,6.9,5.2,1,1
7,62,male,Drug,8,7.1,5.0,1,1


## Checking mean of wbc, rbc, num_effects and adverse_effects for each group

In [17]:
# Per-treatment mean of each measurement, rounded to three decimals.
grouped_stats = (
    df.groupby("trx")[["wbc", "rbc", "num_effects", "adverse_effects"]]
    .agg(["mean"])
    .round(3)
)
grouped_stats


Unnamed: 0_level_0,wbc,rbc,num_effects,adverse_effects
Unnamed: 0_level_1,mean,mean,mean,mean
trx,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Drug,7.33,4.679,0.102,0.096
Placebo,7.359,4.661,0.098,0.092


## Hypothesis test

In [19]:
def hypothesis_test(df, column="rbc", alphas=(0.05, 0.1), equal_var=True):
    """Compare one numeric column between the Drug and Placebo groups.

    Splits ``df[column]`` by the ``"trx"`` label, prints each group's mean and
    sample standard deviation, runs a two-sided independent two-sample t-test
    with ``scipy.stats.ttest_ind``, and prints the reject / fail-to-reject
    decision for every significance level in ``alphas``.

    With the default arguments the printed output is identical to the
    original hard-coded RBC test at the 0.05 and 0.1 levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"trx"`` column holding the labels ``"Drug"`` and
        ``"Placebo"`` and the numeric column named by ``column``.
    column : str, default "rbc"
        Name of the numeric column to compare between the two groups.
    alphas : iterable of float, default (0.05, 0.1)
        Significance levels at which the decision is reported, in order.
    equal_var : bool, default True
        Forwarded to ``scipy.stats.ttest_ind``. ``True`` runs the pooled
        (Student) t-test; ``False`` runs Welch's t-test, which does not
        assume equal population variances.

    Returns
    -------
    None
        All results are printed; nothing is returned.

    Raises
    ------
    ValueError
        If either the Drug or the Placebo group has no rows. The t-test
        would otherwise yield NaN and be reported as "fail to reject".
    """
    drug_group = df.loc[df["trx"] == "Drug", column]
    placebo_group = df.loc[df["trx"] == "Placebo", column]

    # A NaN p-value compares False against every alpha, so an empty group
    # would be silently reported as "no significant difference".
    if drug_group.empty or placebo_group.empty:
        raise ValueError(
            "Both the 'Drug' and 'Placebo' groups need at least one row "
            f"to compare column {column!r}."
        )

    print(f"Drug Group - Mean: {drug_group.mean():.2f}, Std Dev: {drug_group.std():.2f}")
    print(f"Placebo Group - Mean: {placebo_group.mean():.2f}, Std Dev: {placebo_group.std():.2f}")

    t_stat, p_value = stats.ttest_ind(
        drug_group,
        placebo_group,
        alternative='two-sided',
        equal_var=equal_var,
    )

    print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

    # One decision per significance level instead of a copy-pasted block each.
    for alpha in alphas:
        print(f"significant level : {alpha}")
        if p_value < alpha:
            print("We reject the null hypothesis: There is a significant difference between two groups.")
        else:
            print("We fail to reject the null hypothesis: There is no significant difference between two groups.")


# Run the RBC comparison (Drug vs. Placebo) on the cleaned data.
hypothesis_test(df)

Drug Group - Mean: 4.68, Std Dev: 0.45
Placebo Group - Mean: 4.66, Std Dev: 0.47
T-statistic: 1.8367, P-value: 0.0663
significant level : 0.05
We fail to reject the null hypothesis: There is no significant difference between two groups.
significant level : 0.1
We reject the null hypothesis: There is a significant difference between two groups.


## The `alternative` and `equal_var` arguments

### The alternative Argument
The alternative argument specifies the alternative hypothesis. It can take three values:

`two-sided` : Tests if the means of the two groups are significantly different from each other. In other words, it checks if the means of the two groups are different, but it doesn't specify which group is higher or lower. This is the most common approach when we don't have a specific direction in mind.

`less` : Tests if the mean of the first group is significantly less than the mean of the second group.

`greater` : Tests if the mean of the first group is significantly greater than the mean of the second group.

We chose `two-sided` because we want to test for any significant difference between the Drug and Placebo groups, regardless of direction.

### The equal_var Argument
The equal_var argument specifies whether to assume that the two populations have equal variances. It can take two values:

`True`: Assumes that the variances of the two groups are equal.

`False`: Does not assume that the variances of the two groups are equal.

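We used `True` because the two groups have nearly identical sample standard deviations (0.45 for Drug vs. 0.47 for Placebo), so assuming equal variances is reasonable. Because the group sizes differ (6011 vs. 3116 rows), rerunning the test with `equal_var=False` (Welch's t-test) is a useful robustness check.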