# Import Library

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as st

# Read Dataset

In [None]:
# Load the e-commerce example dataset into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='e-commerce_example_dataset.csv')

In [None]:
# Show the dimensions of the loaded DataFrame as (rows, columns).
df.shape

(100, 3)

In [None]:
# Count how many rows carry each distinct value of the 'discount' column,
# i.e. the size of each group that will be compared.
df.loc[:, 'discount'].value_counts()

discount        65
not-discount    35
Name: discount, dtype: int64

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,gmv,discount,user_id
0,1324911,not-discount,1
1,1625874,not-discount,2
2,1432984,not-discount,3
3,1294427,not-discount,4
4,1635693,not-discount,5


In [None]:
# Preview the last five rows of the dataset.
df.tail(n=5)

Unnamed: 0,gmv,discount,user_id
95,1477384,discount,96
96,1684541,discount,97
97,1087507,discount,98
98,1958701,discount,99
99,1803114,discount,100


# Langkah 1

Apakah rata-rata gmv discount berbeda signifikan dari gmv non-discount?

H0 : rata-rata gmv discount **sama dengan** gmv non-discount <br>
H1 : rata-rata gmv discount **tidak sama dengan** gmv non-discount

In [None]:
# Reference: scipy.stats.ttest_ind(a, b, equal_var=...) runs a two-sample t-test
# on the means of two independent samples; equal_var=False selects Welch's
# t-test, which does not assume equal population variances.
# (The interactive `?st.ttest_ind` help lookup that used to live in this cell was
# removed: it is IPython-only syntax and a development aid, not an analysis step.)

In [None]:
# Split the rows into the two groups being compared.
# Each mask matches its label explicitly, so rows with a missing or unexpected
# 'discount' value are excluded instead of being silently counted as
# "not-discount" (which a `!= 'discount'` mask would do).
discount = df[df['discount'] == 'discount']
not_discount = df[df['discount'] == 'not-discount']

In [None]:
# Sanity-check the discount group: print its (rows, columns) dimensions,
# then display its first five rows.
print(discount.shape)
discount.head(n=5)

(65, 3)


Unnamed: 0,gmv,discount,user_id
35,1915181,discount,36
36,1481660,discount,37
37,1517240,discount,38
38,1705569,discount,39
39,1571139,discount,40


In [None]:
# Sanity-check the not-discount group: print its (rows, columns) dimensions,
# then display its first five rows.
print(not_discount.shape)
not_discount.head(n=5)

(35, 3)


Unnamed: 0,gmv,discount,user_id
0,1324911,not-discount,1
1,1625874,not-discount,2
2,1432984,not-discount,3
3,1294427,not-discount,4
4,1635693,not-discount,5


# Langkah 2

In [None]:
# Mean of the 'gmv' column for the discount group.
discount['gmv'].mean()

1624246.553846154

In [None]:
# Mean of the 'gmv' column for the not-discount group.
not_discount['gmv'].mean()

1400420.4

In [None]:
# Difference between the group means (discount minus not-discount).
# int() truncates the fractional part, so the printed value is a whole number.
print(
    'selisih gmv:',
    int(discount['gmv'].mean() - not_discount['gmv'].mean()),
)

selisih gmv: 223826


# Langkah 3

In [None]:
# Two-sample t-test on the 'gmv' values of both groups.
# equal_var=False requests Welch's t-test (no equal-variance assumption).
ttest = st.ttest_ind(discount['gmv'], not_discount['gmv'], equal_var=False)

In [None]:
# p-value of the t-test result stored in `ttest`.
ttest.pvalue

3.398180189931703e-05

# Langkah 4

In [None]:
# Decision rule for the t-test at the 5% significance level.
# H0 is rejected only when the p-value is strictly below alpha; this keeps the
# original boundary behaviour (p == 0.05 does not reject) while making sure an
# undefined (NaN) p-value can never be reported as a significant difference.
alpha = 0.05
p_value = ttest.pvalue
print('P-Value :',p_value)
if p_value < alpha:
    print('Cukup bukti menyatakan (Tolak H0) bahwa rata-rata gmv discount berbeda signifikan dengan gmv non-discount')
else:
    # Reached for p_value >= alpha and also when p_value is NaN.
    print('Tidak cukup bukti menyatakan bahwa rata-rata gmv discount berbeda signifikan dari gmv non-discount')

P-Value : 3.398180189931703e-05
Cukup bukti menyatakan (Tolak H0) bahwa rata-rata gmv discount berbeda signifikan dengan gmv non-discount


# Case 2 ANOVA

In [None]:
# Load the Iris dataset (relative path, resolved against the working directory).
df2 = pd.read_csv(filepath_or_buffer='Iris.csv')

# One sub-frame per species, selected by an exact match on the 'Species' column.
df_versicolor = df2.loc[df2['Species'].eq('Iris-versicolor')]
df_setosa = df2.loc[df2['Species'].eq('Iris-setosa')]
df_virginica = df2.loc[df2['Species'].eq('Iris-virginica')]


In [None]:
# One-way ANOVA on 'SepalWidthCm' across the three species sub-frames.
# The samples are passed in the order versicolor, setosa, virginica.
anova_test = st.f_oneway(
    *(
        species_frame['SepalWidthCm']
        for species_frame in (df_versicolor, df_setosa, df_virginica)
    )
)

In [None]:
# p-value of the one-way ANOVA result stored in `anova_test`.
anova_test.pvalue

1.3279165184572242e-16

In [None]:
# Decision rule for the ANOVA at the 5% significance level.
# H0 (all species share the same mean SepalWidthCm) is rejected when the
# p-value is at or below alpha, matching the original boundary behaviour.
# Testing for rejection explicitly means an undefined (NaN) p-value is never
# reported as a significant difference.
alpha = 0.05
if anova_test.pvalue <= alpha:
    print('Species memiliki rata-rata SepalWidthCm yang berbeda signifikan')
else:
    # Failing to reject H0 does not prove the means are equal, so the message
    # reports insufficient evidence rather than claiming equality.
    print('Tidak cukup bukti menyatakan bahwa rata-rata SepalWidthCm berbeda antar Species')

Species memiliki rata-rata SepalWidthCm yang berbeda signifikan


In [None]:
# Mean 'SepalWidthCm' per species, to see which groups drive the ANOVA result.
df2.groupby('Species')['SepalWidthCm'].mean()

Species
Iris-setosa        3.418
Iris-versicolor    2.770
Iris-virginica     2.974
Name: SepalWidthCm, dtype: float64

In [None]:
# 1. T-Test (optional: Mann-Whitney U test)
# 2. ANOVA
# 3. Proportion Z-Test
# 4. Chi-Square Test