In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Hypothesis Testing Exercise 1

## A F&B manager wants to determine whether there is any significant difference in the diameter of the cutlet between two units. A randomly selected sample of cutlets was collected from both units and measured? Analyze the data and draw inferences at 5% significance level. Please state the assumptions and tests that you carried out to check validity of the assumptions.   

### Minitab File : Cutlets.mtw

In [6]:
# getting the data into a dataframe
df = pd.read_csv('Cutlets.csv')

As we will be comparing difference of two independent quantitative samples, we'll be using a two sample t test i.e. scipy.stats.ttest_ind(a, b)
  
This test has the following assumptions:
- Random Sampling (Satisfied. Mentioned in the problem statement)
- Independence (Satisfied. datapoints are from different units)
- Normality
- Homogeneity of Variances

In [11]:
# Test for normality
print(stats.shapiro(df['Unit A']))
print(stats.shapiro(df['Unit B']))
# as the p value is greater than 0.05 for both units, we consider them to have a normal distribution

ShapiroResult(statistic=0.9649459719657898, pvalue=0.31998491287231445)
ShapiroResult(statistic=0.9727305769920349, pvalue=0.5225146412849426)


In [20]:
# Test for Homogeneity of Variances
# This means that variance of both samples are approximately the same. 
# We consider them to be the same if ratio of larger to smaller is less than 4
print(df.var())
df['Unit B'].var()/df['Unit A'].var() < 4
# As ratio of larger to smaller variance is less than 4, this assumption is also met

Unit A    0.083179
Unit B    0.117924
dtype: float64


True

In [25]:
# Defining Null and alternative hypothesis
H0 = 'There is no difference in the diameter of the cutlet of the two units'
H1 = 'There is significant difference in the diameter of the cutlet of the two units'

In [26]:
# significance value
a = 0.05

In [23]:
# t test for independent samples, which returns the statistic and the p value
t_stat, p_value = stats.ttest_ind(df['Unit A'], df['Unit B'])

In [28]:
# if the p_value is smaller than the significance level, we reject the Null Hypothesis
if p_value > a:
    print(H0)
else:
    print(H1)

There is no difference in the diameter of the cutlet of the two units


# Hypothesis Testing Exercise 2

## A hospital wants to determine whether there is any difference in the average Turn Around Time (TAT) of reports of the laboratories on their preferred list. They collected a random sample and recorded TAT for reports of 4 laboratories. TAT is defined as sample collected to report dispatch. 
## Analyze the data and determine whether there is any difference in average TAT among the different laboratories at 5% significance level.

### Minitab File: LabTAT.mtw

In [30]:
# creating a dataframe from the samples
df = pd.read_csv('LabTAT.csv')

As we want to know if there is any variance between TAT for all labs, we use ANOVA

In [46]:
# defining Null and Alternate Hypothesis
H0 = 'There is no difference in average TAT among the different laboratories'
H1 = 'There is significant difference in average TAT among the different laboratories'

In [35]:
# significance value
a = 0.05

In [38]:
# ANOVA test
f_stat, p_value = stats.f_oneway(df['Laboratory 1'], df['Laboratory 2'], df['Laboratory 3'], df['Laboratory 4'])

In [40]:
# if the p_value is smaller than the significance level, we reject the Null Hypothesis
if p_value > a:
    print(H0)
else:
    print(H1)

There is significant difference in average TAT among the different laboratories


# Hypothesis Testing Exercise 3

## Sales of products in four different regions is tabulated for males and females. Find if male-female buyer rations are similar across regions.

### Buyer Ratio.mtw

In [45]:
# creating a data frame of the contingency table
df = pd.read_csv('BuyerRatio.csv', index_col = 'Observed Values')
df

Unnamed: 0_level_0,East,West,North,South
Observed Values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Males,50,142,131,70
Females,435,1523,1356,750


As we want to know if buyer ratios are same across regions for both sexes, we will use Chi Squared Test for Homogeneity

In [47]:
# Defining Null and Alternate Hypothesis
H0 = 'All proportions are equal'
H1 = 'Not all proportions are equal'

In [48]:
# significance value
a = 0.05

In [52]:
# Chi Squared Test
chi_stat, p_value, dof, expected_freq = stats.chi2_contingency(df)

In [53]:
# if the p_value is smaller than the significance level, we reject the Null Hypothesis
if p_value > a:
    print(H0)
else:
    print(H1)

All proportions are equal


# Hypothesis Testing Exercise 4

##  TeleCall uses 4 centers around the globe to process customer order forms. They audit a certain %  of the customer order forms. Any error in order form renders it defective and has to be reworked before processing.  The manager wants to check whether the defective %  varies by centre. Please analyze the data at 5% significance level and help the manager draw appropriate inferences

### Minitab File: CustomerOrderForm.mtw

In [76]:
# creating a dataframe of the data
df = pd.read_csv('Costomer+OrderForm.csv')

In [77]:
# checking data type, total non-null entries, total entries, total columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Phillippines  300 non-null    object
 1   Indonesia     300 non-null    object
 2   Malta         300 non-null    object
 3   India         300 non-null    object
dtypes: object(4)
memory usage: 9.5+ KB


In [78]:
# checking value counts for each column
print(df.Phillippines.value_counts())
print(df.Indonesia.value_counts())
print(df.Malta.value_counts())
print(df.India.value_counts())

Error Free    271
Defective      29
Name: Phillippines, dtype: int64
Error Free    267
Defective      33
Name: Indonesia, dtype: int64
Error Free    269
Defective      31
Name: Malta, dtype: int64
Error Free    280
Defective      20
Name: India, dtype: int64


In [79]:
df = df.apply(pd.value_counts)
df

Unnamed: 0,Phillippines,Indonesia,Malta,India
Error Free,271,267,269,280
Defective,29,33,31,20


In [80]:
# Defining Null and Alternate Hypothesis
H0 = 'Proportion of defects for each center is the same'
H1 = 'Proportion of defects vary by center'

In [81]:
# Significance value
a = 0.05

In [83]:
chi_stat, p_value, dof, expected_freq = stats.chi2_contingency(df)

In [84]:
# if the p_value is smaller than the significance level, we reject the Null Hypothesis
if p_value > a:
    print(H0)
else:
    print(H1)

Proportion of defects for each center is the same
