In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import scipy.stats as stats # Many functions for probability distributions in python are in scipy.stats
import os
from scipy.stats import variation 
from scipy.stats import ttest_rel
import statistics as s
from statsmodels.formula.api import ols      # For n-way ANOVA
from statsmodels.stats.anova import _get_covariance,anova_lm


In [None]:
# Load the CSV (path is relative to the working directory) into `df`,
# the frame that the later cells read and modify.
df=pd.read_csv('bank_marketing_part1_Data.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:

# Data type pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics; include='all' also reports non-numeric columns.
df.describe(include='all')

In [None]:
# Column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Missing values per column. This repeats the null check run a few cells
# earlier; none of the cells in between modify df.
df.isnull().sum()

In [None]:

# Number of distinct values in each column.
print(df.nunique())

In [None]:
# Flag every row that exactly repeats an earlier row, report how many
# there are, then display the flagged rows.
dups = df.duplicated(keep='first')
print('Number of duplicate rows = %d' % dups.sum())
df.loc[dups]

In [None]:
def univariateAnalysis_numeric(column, nbins):
    """Render a histogram and a box plot for one column of ``df``.

    Parameters
    ----------
    column : str
        Name of the column of the notebook-level DataFrame ``df`` to plot.
    nbins : int
        Number of bins for the histogram.

    Returns
    -------
    None
        A heading is printed before each of the two figures, which are
        shown with ``plt.show()``.
    """
    plt.figure()
    print("Distribution of " + column)
    print("----------------------------------------------------------------------------")
    # `nbins` used to be accepted but ignored; it now sets the bin count.
    # NOTE(review): distplot is deprecated in seaborn >= 0.11 (histplot is
    # the replacement); kept here because the installed version is unknown.
    sns.distplot(df[column], bins=nbins, kde=False, color='g')
    plt.show()

    plt.figure()
    print("BoxPlot of " + column)
    print("----------------------------------------------------------------------------")
    sns.boxplot(x=df[column])
    plt.show()

In [None]:
# Keep only the float64 / int64 columns and record their names.
df_num = df.select_dtypes(include=['float64', 'int64'])
lstnumericcolumns = df_num.columns.tolist()
len(lstnumericcolumns)

In [None]:
# Plot the histogram / box plot pair for every numeric column, passing nbins=10.
for x in lstnumericcolumns:
    univariateAnalysis_numeric(column=x, nbins=10)

In [None]:
# Pairwise Pearson correlation between the numeric columns.
corr = df_num.corr(method='pearson')

In [None]:
# Mask the upper triangle (diagonal included) so each pair appears once.
# The builtin `bool` replaces `np.bool`, an alias removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))
# plt.subplots returns (figure, axes); unpack both and draw on that axes.
fig, ax = plt.subplots(figsize=(25, 15))
# Reuse the Pearson matrix already held in `corr` instead of recomputing it.
sns.heatmap(corr, annot=True, fmt='.2f', mask=mask, ax=ax)
plt.show()

In [None]:
# Skewness of each numeric column. numeric_only=True keeps this working on
# pandas >= 2.0, where skew() raises on non-numeric columns instead of
# silently dropping them.
skewValue = df.skew(axis=0, numeric_only=True)
print("Skew:")

print(skewValue)

### Removing outliers in the probability_of_full_payment variable.

In [None]:
# Quartiles and inter-quartile range of probability_of_full_payment.
# Series.quantile replaces np.percentile(..., interpolation=...), whose
# `interpolation` keyword is deprecated since NumPy 1.22; the 'midpoint'
# rule is the same. Unlike np.percentile, quantile() skips NaN values.
Q1 = df['probability_of_full_payment'].quantile(0.25, interpolation='midpoint')
Q3 = df['probability_of_full_payment'].quantile(0.75, interpolation='midpoint')
IQR = Q3 - Q1
  


In [None]:
# Rows at or beyond Q3 + 1.5*IQR / Q1 - 1.5*IQR are treated as outliers.
# DataFrame.drop removes rows by index *label*, so the labels are taken from
# df.index via a boolean mask. The positional indices returned by np.where
# only match the labels while df still has an untouched 0..n-1 index.
# Upper bound
upper = df.index[df['probability_of_full_payment'] >= (Q3 + 1.5 * IQR)]
# Lower bound
lower = df.index[df['probability_of_full_payment'] <= (Q1 - 1.5 * IQR)]

# Removing the outliers. One drop over the union avoids a KeyError when a
# label falls in both sets (possible when IQR is 0).
df.drop(index=upper.union(lower), inplace=True)

print("New Shape: ", df.shape)

### Removing outliers in the min_payment_amt variable.

In [None]:
# Quartiles and inter-quartile range of min_payment_amt.
# Series.quantile replaces np.percentile(..., interpolation=...), whose
# `interpolation` keyword is deprecated since NumPy 1.22; the 'midpoint'
# rule is the same. Unlike np.percentile, quantile() skips NaN values.
Q1 = df['min_payment_amt'].quantile(0.25, interpolation='midpoint')
Q3 = df['min_payment_amt'].quantile(0.75, interpolation='midpoint')
IQR = Q3 - Q1
  

In [None]:
# Rows at or beyond Q3 + 1.5*IQR / Q1 - 1.5*IQR are treated as outliers.
# DataFrame.drop removes rows by index *label*. Rows dropped earlier can
# leave gaps in df's index, so the positional indices from np.where would
# point at the wrong rows (or at labels that no longer exist). The labels
# are therefore taken from df.index via a boolean mask.
# Upper bound
upper = df.index[df['min_payment_amt'] >= (Q3 + 1.5 * IQR)]
# Lower bound
lower = df.index[df['min_payment_amt'] <= (Q1 - 1.5 * IQR)]

# Removing the outliers. One drop over the union avoids a KeyError when a
# label falls in both sets (possible when IQR is 0).
df.drop(index=upper.union(lower), inplace=True)

print("New Shape: ", df.shape)

In [None]:
# Number of rows for each Occupation value.
# NOTE(review): this and the ANOVA cells assume df has Salary / Education /
# Occupation columns; no visible cell shows the loaded CSV providing them — confirm.
print(df['Occupation'].value_counts())

In [None]:
# Store Occupation as a categorical column.
df['Occupation'] = pd.Categorical(df['Occupation'])

In [None]:
# Store Education as a categorical column.
df['Education'] = pd.Categorical(df['Education'])

## How does ANOVA work?
Check the sample sizes: ideally there is an equal number of observations in each group.

Calculate the Mean Square (MS) for each group: SS of the group / (number of levels - 1), where (number of levels - 1) is the degrees of freedom (df) for the group.

Calculate the Mean Square Error (MSE): SS of the error / df of the residuals.

Calculate the F value: MS of the group / MSE.

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols


## The hypotheses for the one-way ANOVA (Education) are:
## $H_0$: The mean salary of the employees is the same at all 3 levels of Education.

## $H_a$: For at least one level of Education, the mean salary is different.

    


In [None]:
# One-way ANOVA of Salary across the Education levels: fit the OLS model,
# then print its ANOVA table (typ=1 is anova_lm's default).
formula = 'Salary ~  C(Education)'
model = ols(formula=formula, data=df).fit()
aov_table = anova_lm(model, typ=1)
print(aov_table)

We see that the corresponding p-value is greater than alpha (0.05). Thus, we **fail to reject** the **null hypothesis** ($H_0$).

## The hypotheses for the one-way ANOVA (Occupation) are:
## $H_0$: The mean salary of the employees is the same at all 3 levels of Occupation.

## $H_a$: For at least one level of Occupation, the mean salary is different.

In [None]:
# One-way ANOVA of Salary across the Occupation levels: fit the OLS model,
# then print its ANOVA table (typ=1 is anova_lm's default).
formula = 'Salary ~  C(Occupation)'
model = ols(formula=formula, data=df).fit()
aov_table = anova_lm(model, typ=1)
print(aov_table)

We see that the corresponding p-value is greater than alpha (0.05). Thus, we **fail to reject** the **null hypothesis** ($H_0$).

# There are three sets of hypotheses in the two-way ANOVA.

## The null hypotheses for the three sets are:

The population mean salary with respect to Education is equal. 

The population mean salary with respect to Occupation is equal. 

There is no interaction between the two factors.

## The alternative hypotheses for the three sets are:

For at least one level of Education, the mean salary is different.

For at least one level of Occupation, the mean salary is different.

There is an interaction between the two factors.

In [None]:
# Two-way ANOVA of Salary on Occupation and Education, main effects only
# (no interaction term); typ=1 is anova_lm's default.
formula = 'Salary ~ C(Occupation) + C(Education)'
model = ols(formula=formula, data=df).fit()
aov_table = anova_lm(model, typ=1)
print(aov_table)

In [None]:
# Interaction plot: mean Salary per Education level, one line per Occupation.
# ci=None turns off the error bars; the trailing ';' hides the Axes repr.
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of errorbar=None.
sns.pointplot(x='Education', y='Salary', data=df, hue='Occupation',ci=None);

### The plot still suggests some interaction between the two factors, so we add an interaction term when performing the two-way ANOVA.

In [None]:
# Two-way ANOVA of Salary with both main effects and the
# Education x Occupation interaction; the table is shown as the cell output.
formula = 'Salary ~ C(Education) + C(Occupation) + C(Education):C(Occupation)'
model = ols(formula=formula, data=df).fit()
aov_table = anova_lm(model, typ=1)
aov_table

### Due to the inclusion of the interaction term, we can see a slight change in the p-values of the two main effects compared to the two-way ANOVA without the interaction term. The p-value of the interaction term between 'Education' and 'Occupation' suggests that the null hypothesis of no interaction is rejected in this case.

## Key Result: P-Value
### In these results, you can conclude the following, based on the p-values and a significance level of 0.05:
