In [1]:
from User_defined_Data_loader import DataLoader
from scipy.stats import ttest_ind,ttest_1samp,ttest_rel,f_oneway
from scipy.stats import pearsonr,spearmanr,kendalltau
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

dl = DataLoader("restaurant_data.csv")
restaurant_data = dl.read_data()

In [2]:
from scipy.stats import ttest_ind,ttest_1samp,ttest_rel,f_oneway
from statsmodels.multivariate.manova import MANOVA

In [3]:
restaurant_data.head()

Unnamed: 0,Name,Location,Cuisine,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,Parking Availability,Weekend Reservations,Weekday Reservations,Revenue
0,Restaurant 0,Rural,Japanese,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,Yes,13,4,638945.52
1,Restaurant 1,Downtown,Mexican,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,Yes,48,6,490207.83
2,Restaurant 2,Rural,Italian,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,No,27,14,541368.62
3,Restaurant 3,Rural,Italian,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,Yes,9,17,404556.8
4,Restaurant 4,Downtown,Japanese,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,No,37,26,1491046.35


In [4]:
restaurant_data = restaurant_data.rename(
    columns={
        'Seating Capacity':'Seating_Capacity',
        'Average Meal Price':'Average_Meal_Price', 
        'Marketing Budget':'Marketing_Budget', 
        'Social Media Followers':'Social_Media_Followers',
        'Chef Experience Years':'Chef_Experience_Years', 
        'Number of Reviews':'Number_of_Reviews', 
        'Avg Review Length': 'AvgReviewLength',
        'Ambience Score':'AmbienceScore', 
        'Service Quality Score':'ServiceQualityScore', 
        'Parking Availability':'ParkingAvailability',
        'Weekend Reservations':'WeekendReservations', 
        'Weekday Reservations':'WeekdayReservations'})

In [5]:
dl = DataLoader('AirQualityUCI.csv',)
air_quality_data = dl.read_data()
air_quality_data.set_index('Date',inplace=True)
air_quality_data['PT08.S5(O3)'].fillna(air_quality_data['PT08.S5(O3)'].mean(),inplace=True)

# 1. Parametric Statistical Hypothesis Tests
Statistical Test for comaparison between data samples.

## 1.1 One Sample T-Test
mean of a sample is significantly different from a known population mean

#### H0: the mean of sample and population are Same .
#### H1: the mean of sample and population are not Same.

In [6]:
def one_sample_ttest(data,numeric_column):
    sample_data = data[numeric_column].sample(500,random_state=1)
    population_mean = data[numeric_column].mean()
    t_static_value,p_value = ttest_1samp(sample_data,population_mean)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value>0.05:
        print('Propably the mean of sample and population are same')
    else:
        print('Propably the mean of sample and population are not same')

In [7]:
one_sample_ttest(restaurant_data,numeric_column='Revenue')

T-static-Value:-1.464, P-Value:0.144
Propably the mean of sample and population are same


## 1.2 Two Sample T-Test
Average Between two data samples are significantly different.
### Assumption
1. Each data sample's observation are independent and distributed.
2. Observations are normally distributed.
3. Observations have same variance between each other.
### Hypothesis
__H0: the mean between two groups are equal.__

__H1: the mean between two groups are not equal.__

In [8]:
def two_sample_ttest(numeric_column,category_column,data):
    groups = data[category_column].unique().tolist()
    group1 = data[data[category_column]==groups[0]][numeric_column].sample(500,random_state=1)
    group2 = data[data[category_column]==groups[1]][numeric_column].sample(500,random_state=1)
    t_static_value, p_value = ttest_ind(group1, group2)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print('Probably means between 2 groups are equal')
    else:
        print('Probably means between 2 groups are not equal')

In [9]:
two_sample_ttest(numeric_column='AvgReviewLength',category_column='Location',data=restaurant_data)

T-static-Value:0.105, P-Value:0.917
Probably means between 2 groups are equal


## 1.3 Two sampled Paired t-test
A paired samples t-test is used to test the means between the old and New values of the Features are same or not.
### Assumption
1. Each data sample's observation are independent and distributed.
2. Observations are normally distributed.
3. Observations have same variance between each other.
4. Observations are paired.
### Hypothesis
__H0: mean old and new values are equal.__

__H1: mean old and new values are not equal.__

In [10]:
def paired_ttest(numeric_column,category_column,data):
    groups = data[category_column].unique().tolist()
    new = data[data[category_column]==groups[0]][numeric_column].sample(500,random_state=1)
    old = data[data[category_column]==groups[1]][numeric_column].sample(500,random_state=1)
    t_static_value, p_value = ttest_rel(new, old)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print('Probably means of old and new sample data are equal')
    else:
        print('Probably means of old and new sample data are not equal')

In [11]:
paired_ttest(data=restaurant_data,numeric_column='Revenue',category_column='Location')

T-static-Value:-31.193, P-Value:0.000
Probably means of old and new sample data are not equal


## 1.4 Analysis of Variance Test (ANOVA)
test whether there are significant differences between the means of two or more groups.
### Assumption
1. Each data sample's observation are independent and distributed.
2. Observations are normally distributed.
3. Observations have same variance between each other.
### Hypothesis
__H0: the means between two or more groups are equal.__

__H1: the means between two or more groups are not equal.__

In [12]:
def anova_one_way(data,category_column,numeric_column):
    groups = data[category_column].unique().tolist()
    group_data = []
    for i in groups:
        group_data.append(data[data[category_column]==i][numeric_column].sample(500,random_state=1).tolist())
    
    t_static_value, prob_val = f_oneway(*group_data)    
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{prob_val:.3f}')
    if prob_val > 0.05:
        print(f'Probably means of {len(groups)} groups are equal')
    else:
        print(f'Probably means of {len(groups)} groups are not equal')

In [13]:
anova_one_way(restaurant_data,category_column='Location',numeric_column='Revenue')

T-static-Value:512.905, P-Value:0.000
Probably means of 3 groups are not equal


## 1.5 Multivarite Analysis of variance (MANOVA)
Average between two or more paired samples are significantly different.
### Assumption
1. Each data sample's observation are independent and distributed.
2. Observations are normally distributed.
3. Observations have same variance between each other.
4. Observation can be paired.
### Hypothesis
__H0: means of two or more groups on multiple independent variables are Equal.__

__H1: means of two or more groups on multiple independent variables are not equal.__

In [14]:
def manova_test(numerical_column1,numerical_column2,numerical_column3,categorical_column,data):
    user_formula =f'{numerical_column1} + {numerical_column2} + {numerical_column3} ~ {categorical_column}'
    manova_t = MANOVA.from_formula(formula=user_formula,data=data)
    p_value = manova_t.mv_test().results[categorical_column]['stat']['Pr > F'][0]
    print(f"P-value: {p_value:.3f}")
    if p_value > 0.05:
        print('Probably, means of two or more groups on multiple independent variables are Equal')
    else:
        print('Probably, means of two or more groups on multiple independent variables are Not Equal')

In [15]:
result = manova_test(
    numerical_column1='Average_Meal_Price',
    numerical_column2='AvgReviewLength',
    numerical_column3='Revenue',
    categorical_column='Location',
    data=restaurant_data
)

P-value: 0.000
Probably, means of two or more groups on multiple independent variables are Not Equal


# 2. Nonparametric Statistical Hypothesis Tests

## 2.1 Mann-Whitney U Test
Distribution of two data samples are equal or not. And, used for independent samples
### Assumption
1. Each data sample's observation are independent and distributed.
2. Observations in each data samples can be ranked.
### Hypothesis
__H0: the distribution of two samples are equal.__

__H1: the distribution of two samples are not equal.__

In [16]:
from scipy.stats import mannwhitneyu

def mann_whitney_test(data1,data2):
    t_static_value, p_value= mannwhitneyu(data1, data2)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print(f'Distributions of two samples are equal')
    else:
        print(f'Distributions of two samples are not equal')

In [17]:
mann_whitney_test(restaurant_data['Average_Meal_Price'],restaurant_data['Revenue'])

T-static-Value:0.000, P-Value:0.000
Distributions of two samples are not equal


## 2.2 Wilcoxon Signed-Rank Test
Distribution between two paired samples are significantly equal or not. And, used for related samples
### Assumption
1. Each data sample's observation are independent and distributed.
2. Observations can be ranked.
3. Observations are paired.
### Hypothesis
__H0: the distribution of two samples are equal.__

__H1: the distribution of two samples are not equal.__

In [18]:
# Example of the Wilcoxon Signed-Rank Test
from scipy.stats import wilcoxon

def wilcoxon_test(numerical_column,category_column,data):
    groups = data[category_column].unique().tolist()
    group1 = data[data[category_column]==groups[0]][numerical_column].sample(500,random_state=1)
    group2 = data[data[category_column]==groups[1]][numerical_column].sample(500,random_state=1)
    t_static_value, p_value= wilcoxon(group1, group2)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print(f'Distributions of two paired samples are equal')
    else:
        print(f'Distributions of two paired samples are not equal')

In [19]:
wilcoxon_test(numerical_column='Revenue',category_column='Location',data=restaurant_data)

T-static-Value:2455.000, P-Value:0.000
Distributions of two paired samples are not equal


## 2.3 Kruskal-Wallis H Test
Distribution between two or more independent samples are significantly equal or not. And, used for independent samples.
### Assumption
1. Each data sample's observation are independent and distributed.
2. Observations can be ranked.
### Hypothesis
__H0: the distribution of samples are equal.__

__H1: the distribution of samples are not equal.__

In [20]:
from scipy.stats import kruskal

def kruskal_test(data,category_column,numeric_column):
    groups = data[category_column].unique().tolist()
    group_data = []
    for i in groups:
        group_data.append(data[data[category_column]==i][numeric_column].sample(500,random_state=1).tolist())
    
    t_static_value, p_value = kruskal(*group_data)    
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print(f'Distributions of two or more groups samples are equal')
    else:
        print(f'Distributions of two or more groups samples are not equal')

In [21]:
kruskal_test(data=restaurant_data,category_column='Location',numeric_column='AvgReviewLength')

T-static-Value:0.798, P-Value:0.671
Distributions of two or more groups samples are equal


## 2.4 Friedman Test
Distribution between two or more paired samples are significantly equal or not.
### Assumption
1. Each data sample's observation are independent and distributed.
2. Observations can be ranked.
3. Observations can be paired.
### Hypothesis
__H0: the distribution of all samples are equal.__

__H1: the distribution of one or more samples are not equal.__

In [22]:
from scipy.stats import friedmanchisquare

In [23]:
def friedman_test(data1,data2,data3):
    t_static_value, p_value= friedmanchisquare(data1, data2, data3)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print(f'Distributions of two or more independent samples are equal')
    else:
        print(f'Distributions of two or more independent samples are not equal')

In [24]:
friedman_test(restaurant_data['Average_Meal_Price'],restaurant_data['AvgReviewLength'],restaurant_data['Revenue'])

T-static-Value:16395.236, P-Value:0.000
Distributions of two or more independent samples are not equal


# 3. Correlation Tests
Correlation Tests are used to check the correlation between two independent features or variables.

## 3.1 Pearson’s Correlation Coefficient
Tests whether a data features is linearly separable.
### Assumption
1. Observations in each sample are independent and distributed identically.
2. Observations are normally distributed.
3. Similar variance between independent variables
### Hypothesis
__H0: Fetaures are correlated.__

__H1: Features does not have any correlation.__

In [25]:
def pearsons_correlation(data1,data2):
    t_static_value,p_value = pearsonr(data1,data2)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print(f'Probably Features are Correlated')
    else:
        print(f'Probably Features may not have any correlation')

In [26]:
pearsons_correlation(restaurant_data['Average_Meal_Price'],restaurant_data['Revenue'])

T-static-Value:0.686, P-Value:0.000
Probably Features may not have any correlation


## 3.2 Spearman’s Rank Correlation
Tests whether a data sample is montonically separable.
### Assumption
1. Observations in each sample are independent and distributed identically.
2. Observations in each sample are ranked .
### Hypothesis
__H0: the samples are correlated.__

__H1: the sample does not have any correlation.__

In [27]:
def spearmans_correlation(data1,data2):
    t_static_value,p_value = spearmanr(data1,data2)
    print(f'T-static-value : {t_static_value:.3f}, P-value :{p_value:.3f}')
    if p_value > 0.05:
        print(f'Probably Features are Correlated')
    else:
        print(f'Probably Features may not have any correlation')

In [28]:
spearmans_correlation(restaurant_data['Average_Meal_Price'],restaurant_data['Revenue'])

T-static-value : 0.680, P-value :0.000
Probably Features may not have any correlation


## 3.3 Kendall’s Rank Correlation
Tests whether a data sample is montonically separable.
### Assumption
1. Observations in each sample are independent and distributed identically.
2. Observations in each sample are ranked .
### Hypothesis
__H0: the samples are correlated.__

__H1: the sample does not have any correlation.__

In [29]:
def kendalls_correlation(data1,data2):
    t_static_value,p_value = kendalltau(data1,data2)
    print(f'T-static-value : {t_static_value:.3f}, P-value :{p_value:0.3f}')
    if p_value > 0.05:
        print(f'Probably Features are Correlated')
    else:
        print(f'Probably Features may not have any correlation')

In [30]:
kendalls_correlation(restaurant_data['Average_Meal_Price'],restaurant_data['Revenue'])

T-static-value : 0.487, P-value :0.000
Probably Features may not have any correlation


## 3.4 Chi-Squared Test
Tests whether two categorical variables are related to each other.
### Assumption
1. Observations in used in contengency table are Independent.
2. There are more than 25 examples in contengency table .
### Hypothesis
__H0: probably two categorical features are correlated.__

__H1: probably two categorical features may does not have any correlation.__

In [31]:
from scipy.stats import chi2_contingency
def chi2_test(category_data1,category_data2):
    cross_table = pd.crosstab(category_data1,category_data2)
    chi,p_value,dof,expected=chi2_contingency(cross_table)
    print(f'Chi-Square-value : {chi:.3f}, P-value :{p_value:0.3f}')
    if p_value > 0.05:
        print(f'Probably Features are Correlated')
    else:
        print(f'Probably Features may not have any correlation')

In [32]:
chi2_test(restaurant_data['Location'],restaurant_data['Cuisine'])

Chi-Square-value : 15.593, P-value :0.112
Probably Features are Correlated


# 4. Normality Tests
Main obejctive of performing Normality Tests is to validate the Normal distribution of data.
## 4.1 Shapiro-Wilk Test
Tests whether a data sample has a Normal distribution.
### Assumption
Observations in each sample are independent and distributed identically.
### Hypothesis
__H0: the sample has a Normal distribution.__

__H1: the sample does not have a Normal distribution.__

In [33]:
from scipy.stats import shapiro

def shapiro_wilk_test(numeric_data):
    t_static_value, p_value = shapiro(numeric_data)
    print(f'T-static-value : {t_static_value:.3f}, P-value :{p_value:0.3f}')
    if p_value > 0.05:
        print('Probably Normal Distribution')
    else:
        print('Probably not a Normal Distribution')

In [34]:
shapiro_wilk_test(restaurant_data['Revenue'])

T-static-value : 0.953, P-value :0.000
Probably not a Normal Distribution


## 4.2 D'Agostino-Pearson Test
tests the skewness and kurtosis of the data to test for normality.
### Assumption
Observations in each sample are independent and distributed identically.
### Hypothesis
__H0: the sample has a Normal distribution.__

__H1: the sample does not have a Normal distribution.__

In [35]:
from scipy.stats import normaltest

def d_agostino_test(data):
    t_static_value,p_value = normaltest(data)
    print(f'T-static-value : {t_static_value:.3f}, P-value :{p_value:0.3f}')
    if p_value > 0.05:
        print('Probably Normal Distribution')
    else:
        print('Probably not a Normal Distribution')

In [36]:
d_agostino_test(restaurant_data['AvgReviewLength'])

T-static-value : 7096.180, P-value :0.000
Probably not a Normal Distribution


## 4.3 Anderson-Darling Test
Tests whether a data sample has a Normal distribution.
### Assumption
Observations in each sample are independent and distributed identically.
### Hypothesis
__H0: the sample has a Normal distribution.__

__H1: the sample does not have a Normal distribution.__

In [37]:
from scipy.stats import anderson

def anderson_darling_test(data):
    result = anderson(data,dist='norm')
    
    # get the critical value for the chosen significance level 5%
    critical_value = result.critical_values[result.significance_level == 5]
    print(f'T-static-value : {result.statistic:.3f}, Critical-Value :{critical_value}')
    # if the test statistic is greater than the critical value, reject the null hypothesis
    if result.statistic > critical_value:
        print("The data is not normally distributed.")
    else:
        print("The data is normally distributed.")

In [38]:
anderson_darling_test(restaurant_data['Average_Meal_Price'])

T-static-value : 154.289, Critical-Value :[0.787]
The data is not normally distributed.


## 4.4 Kolmogorov-Smirnov Test
This test compares the empirical distribution function of the data to a specified theoretical distribution (in this case, a normal distribution)
### Assumptions
Observations in each sample are independent and distributed identically.
### Hypothesis
__H0: the sample has a Normal distribution.__

__H1: the sample does not have a Normal distribution.__

In [39]:
from scipy.stats import kstest

def kolmogorov_test(data):
    t_static_value,p_value = kstest(data,'norm')
    print(f'T-static-value : {t_static_value:.3f}, P-value :{p_value:0.3f}')
    if p_value > 0.05:
        print('Probably Normal Distribution')
    else:
        print('Probably not a Normal Distribution')

In [40]:
kolmogorov_test(restaurant_data['Average_Meal_Price'])

T-static-value : 1.000, P-value :0.000
Probably not a Normal Distribution


# 5. Stationary Tests
Used for Validating the Time series data trends(Stationary/Not-Stationary).

## 5.1 Augmented Dickey-Fuller Unit Root Test
Tests whether a Time series data has autoregressive trend.
### Assumption
Data Instance have temporality.
### Hypothesis
__H0: the unit root is present.__

__H1: the unit root not present.__

In [41]:
# Example of the Augmented Dickey-Fuller unit root test
from statsmodels.tsa.stattools import adfuller

def augmented_DFU_roottest(data):
    t_static_value, p_value, lags, obs, crit, t = adfuller(data)
    print(f'T-static-value : {t_static_value:.3f}, P-value :{p_value:0.3f}')
    if p_value > 0.05:
        print('Probably not Stationary')
    else:
        print('Probably Stationary')

In [42]:
augmented_DFU_roottest(air_quality_data['PT08.S5(O3)'])

T-static-value : -11.458, P-value :0.000
Probably Stationary


## Kwiatkowski-Phillips-Schmidt-Shin
Tests whether a Time series trend is stationary or not.
### Assumption
Data Instance have temporality.
### Hypothesis
__H0: the stationarity is present.__

__H1: the stationarity not present.__

In [47]:
from statsmodels.tsa.stattools import kpss

def kwiatkowski_test(data):
    t_static_value, p_value, lags, obs = kpss(data,regression='c')
    print(f'T-static-value : {t_static_value:.3f}, P-value :{p_value:0.3f}')
    if p_value > 0.05:
        print('Probably not Stationary')
    else:
        print('Probably Stationary')

In [48]:
kwiatkowski_test(air_quality_data['PT08.S5(O3)'])

T-static-value : 0.345, P-value :0.100
Probably not Stationary


look-up table. The actual p-value is greater than the p-value returned.

  t_static_value, p_value, lags, obs = kpss(data,regression='c')
