In [20]:
from User_defined_Data_loader import DataLoader
from scipy.stats import ttest_ind,ttest_1samp,ttest_rel,f_oneway
from scipy.stats import pearsonr,spearmanr,kendalltau
import warnings
warnings.filterwarnings('ignore')

dl = DataLoader("restaurant_data.csv")
restaurant_data = dl.read_data()

In [2]:
from scipy.stats import ttest_ind,ttest_1samp,ttest_rel,f_oneway
from statsmodels.multivariate.manova import MANOVA

In [3]:
restaurant_data.head()

Unnamed: 0,Name,Location,Cuisine,Rating,Seating Capacity,Average Meal Price,Marketing Budget,Social Media Followers,Chef Experience Years,Number of Reviews,Avg Review Length,Ambience Score,Service Quality Score,Parking Availability,Weekend Reservations,Weekday Reservations,Revenue
0,Restaurant 0,Rural,Japanese,4.0,38,73.98,2224,23406,13,185,161.924906,1.3,7.0,Yes,13,4,638945.52
1,Restaurant 1,Downtown,Mexican,3.2,76,28.11,4416,42741,8,533,148.759717,2.6,3.4,Yes,48,6,490207.83
2,Restaurant 2,Rural,Italian,4.7,48,48.29,2796,37285,18,853,56.849189,5.3,6.7,No,27,14,541368.62
3,Restaurant 3,Rural,Italian,4.4,34,51.55,1167,15214,13,82,205.433265,4.6,2.8,Yes,9,17,404556.8
4,Restaurant 4,Downtown,Japanese,4.9,88,75.98,3639,40171,9,78,241.681584,8.6,2.1,No,37,26,1491046.35


In [4]:
restaurant_data = restaurant_data.rename(
    columns={
        'Seating Capacity':'Seating_Capacity',
        'Average Meal Price':'Average_Meal_Price', 
        'Marketing Budget':'Marketing_Budget', 
        'Social Media Followers':'Social_Media_Followers',
        'Chef Experience Years':'Chef_Experience_Years', 
        'Number of Reviews':'Number_of_Reviews', 
        'Avg Review Length': 'AvgReviewLength',
        'Ambience Score':'AmbienceScore', 
        'Service Quality Score':'ServiceQualityScore', 
        'Parking Availability':'ParkingAvailability',
        'Weekend Reservations':'WeekendReservations', 
        'Weekday Reservations':'WeekdayReservations'})

# 1. Parametric Statistical Hypothesis Tests

#### Statistical Test for comaparison between data samples.

## 1.1 One Sample T-Test
mean of a sample is significantly different from a known population mean

#### H0: the mean of sample and population are Same .
#### H1: the mean of sample and population are not Same.

In [5]:
def one_sample_ttest(data,numeric_column):
    sample_data = data[numeric_column].sample(500)
    population_mean = data[numeric_column].mean()
    t_static_value,p_value = ttest_1samp(sample_data,population_mean)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value>0.05:
        print('Propably the mean of sample and population are same')
    else:
        print('Propably the mean of sample and population are not same')

In [6]:
one_sample_ttest(restaurant_data,numeric_column='Revenue')

T-static-Value:0.623, P-Value:0.534
Propably the mean of sample and population are same


## 1.2 Two Sample T-Test
Average Between two data samples are significantly different.

#### H0: the mean between two groups are equal .
#### H1: the mean between two groups are not equal.

In [7]:
def two_sample_ttest(numeric_column,category_column,data):
    groups = data[category_column].unique().tolist()
    group1 = data[data[category_column]==groups[0]][numeric_column].sample(500)
    group2 = data[data[category_column]==groups[1]][numeric_column].sample(500)
    t_static_value, p_value = ttest_ind(group1, group2)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print('Probably means between 2 groups are equal')
    else:
        print('Probably means between 2 groups are not equal')

In [8]:
two_sample_ttest(numeric_column='AvgReviewLength',category_column='Location',data=restaurant_data)

T-static-Value:0.815, P-Value:0.415
Probably means between 2 groups are equal


## 1.3 Two sampled Paired t-test
A paired samples t-test is used to test the means between the old and New values of the Features are same or not.

#### H0: mean old and new values are equal.
#### H1: mean old and new values are not equal.

In [9]:
def paired_ttest(numeric_column,category_column,data):
    groups = data[category_column].unique().tolist()
    new = data[data[category_column]==groups[0]][numeric_column].sample(500)
    old = data[data[category_column]==groups[1]][numeric_column].sample(500)
    t_static_value, p_value = ttest_rel(new, old)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print('Probably means of old and new sample data are equal')
    else:
        print('Probably means of old and new sample data are not equal')

In [10]:
paired_ttest(data=restaurant_data,numeric_column='Revenue',category_column='Location')

T-static-Value:-31.598, P-Value:0.000
Probably means of old and new sample data are not equal


## 1.4 Analysis of Variance Test (ANOVA)
test whether there are significant differences between the means of two or more groups.

#### H0: the means between two or more groups are equal .
#### H1: the means between two or more groups are not equal.

In [11]:
def anova_one_way(data,category_column,numeric_column):
    groups = data[category_column].unique().tolist()
    group_data = []
    for i in groups:
        group_data.append(data[data[category_column]==i][numeric_column].sample(500).tolist())
    
    t_static_value, prob_val = f_oneway(*group_data)    
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{prob_val:.3f}')
    if prob_val > 0.05:
        print(f'Probably means of {len(groups)} groups are equal')
    else:
        print(f'Probably means of {len(groups)} groups are not equal')

In [12]:
anova_one_way(restaurant_data,category_column='Location',numeric_column='Revenue')

T-static-Value:472.423, P-Value:0.000
Probably means of 3 groups are not equal


## 1.5 Multivarite Analysis of variance (MANOVA)
Average between two or more paired samples are significantly different.

#### H0: means of two or more groups on multiple independent variables are Equal.
#### H1: means of two or more groups on multiple independent variables are not equal.

In [13]:
def manova_test(numerical_column1,numerical_column2,numerical_column3,categorical_column,data):
    user_formula =f'{numerical_column1} + {numerical_column2} + {numerical_column3} ~ {categorical_column}'
    manova_t = MANOVA.from_formula(formula=user_formula,data=data)
    p_value = manova_t.mv_test().results[categorical_column]['stat']['Pr > F'][0]
    print("P-value:", p_value)
    if p_value > 0.05:
        print('Probably, means of two or more groups on multiple independent variables are Equal')
    else:
        print('Probably, means of two or more groups on multiple independent variables are Not Equal')

In [14]:
result = manova_test(
    numerical_column1='Average_Meal_Price',
    numerical_column2='AvgReviewLength',
    numerical_column3='Revenue',
    categorical_column='Location',
    data=restaurant_data
)

P-value: 0.0
Probably, means of two or more groups on multiple independent variables are Not Equal


# 2. Correlation Tests
Correlation Tests are used to check the correlation between two independent features or variables.

## 2.1 Pearson’s Correlation Coefficient
Tests whether a data features is linearly separable.

#### H0: Fetaures are correlated.
#### H1: Features does not have any correlation.

In [17]:
def pearsons_correlation(data1,data2):
    t_static_value,p_value = pearsonr(data1,data2)
    print(f'T-static-Value:{t_static_value:.3f}, P-Value:{p_value:.3f}')
    if p_value > 0.05:
        print(f'Probably Features are Correlated')
    else:
        print(f'Probably Features may not have any correlation')

In [16]:
pearsons_correlation(restaurant_data['Average_Meal_Price'],restaurant_data['Revenue'])

T-static-Value:0.686, P-Value:0.000
Probably Features may not have any correlation


## 2.2 Spearman’s Rank Correlation
Tests whether a data sample is montonically separable.
#### H0: the samples are correlated.
#### H1: the sample does not have any correlation.

In [18]:
def spearmans_correlation(data1,data2):
    t_static_value,p_value = spearmanr(data1,data2)
    print(f'T-static-value : {t_static_value:.3f}, P-value :{p_value:.3f}')
    if p_value > 0.05:
        print(f'Probably Features are Correlated')
    else:
        print(f'Probably Features may not have any correlation')

In [19]:
spearmans_correlation(restaurant_data['Average_Meal_Price'],restaurant_data['Revenue'])

T-static-value : 0.680, P-value :0.000
Probably Features may not have any correlation


## 2.3 Kendall’s Rank Correlation
Tests whether a data sample is montonically separable.
#### H0: the samples are correlated.
#### H1: the sample does not have any correlation.

In [32]:
def kendalls_correlation(data1,data2):
    t_static_value,p_value = kendalltau(data1,data2)
    print(f'T-static-value : {t_static_value:.3f}, P-value :{p_value:0.3f}')
    if p_value > 0.05:
        print(f'Probably Features are Correlated')
    else:
        print(f'Probably Features may not have any correlation')

In [33]:
kendalls_correlation(restaurant_data['Average Meal Price'],restaurant_data['Revenue'])

T-static-value : 0.487, P-value :0.000
Probably Features may not have any correlation


## 2.4 Chi-Squared Test
Tests whether two categorical variables are related to each other.
#### H0: the samples are correlated.
#### H1: the sample does not have any correlation.
