# Statistical Analysis
## Consumer Purchase Prediction

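This notebook performs descriptive, inferential, and exploratory statistical analysis of the advertisement dataset (`Advertisement.csv`), examining how `Age`, `EstimatedSalary`, and `Gender` relate to the binary `Purchased` outcome.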


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu
import statsmodels.api as sm
from statsmodels.formula.api import ols
import warnings
warnings.filterwarnings('ignore')

# Load the advertisement dataset; every later cell reads from `df`.
df = pd.read_csv('../../data/Advertisement.csv')

# Fail fast if the file lacks a column the analysis below depends on, instead
# of surfacing a KeyError several cells later.
_missing_columns = {'Age', 'EstimatedSalary', 'Gender', 'Purchased'} - set(df.columns)
if _missing_columns:
    raise KeyError(
        f"Advertisement.csv is missing expected columns: {sorted(_missing_columns)}"
    )


## 1. Descriptive Statistics


In [2]:
# Summary statistics for the numeric features: overall first, then per outcome.
rule = "=" * 50

print("DESCRIPTIVE STATISTICS")
print(rule)

# (display label, DataFrame column) pairs, reported in this order.
for label, column in (("Age", "Age"), ("Estimated Salary", "EstimatedSalary")):
    values = df[column]
    print(f"\n{label} Statistics:")
    print(values.describe())
    # Shape of the distribution: asymmetry and tail weight.
    print(f"\nSkewness: {values.skew():.3f}")
    print(f"Kurtosis: {values.kurtosis():.3f}")

# Same summary, one row per value of Purchased.
print(f"\n{rule}")
print("DESCRIPTIVE STATISTICS BY PURCHASE STATUS")
print(rule)
print(df.groupby('Purchased')[['Age', 'EstimatedSalary']].describe())


DESCRIPTIVE STATISTICS

Age Statistics:
count    400.000000
mean      37.655000
std       10.482877
min       18.000000
25%       29.750000
50%       37.000000
75%       46.000000
max       60.000000
Name: Age, dtype: float64

Skewness: 0.231
Kurtosis: -0.623

Estimated Salary Statistics:
count       400.000000
mean      69742.500000
std       34096.960282
min       15000.000000
25%       43000.000000
50%       70000.000000
75%       88000.000000
max      150000.000000
Name: EstimatedSalary, dtype: float64

Skewness: 0.495
Kurtosis: -0.406

DESCRIPTIVE STATISTICS BY PURCHASE STATUS
             Age                                                     \
           count       mean       std   min   25%   50%   75%   max   
Purchased                                                             
0          257.0  32.793774  7.985844  18.0  26.0  34.0  38.0  59.0   
1          143.0  46.391608  8.612172  27.0  39.5  47.0  53.0  60.0   

          EstimatedSalary                              

## 2. Normality Tests


In [3]:
# Shapiro-Wilk normality check for each numeric feature.
# The import stays in this cell: later cells call `shapiro` and rely on it.
from scipy.stats import shapiro

print("NORMALITY TESTS (Shapiro-Wilk)")
print("=" * 50)

for col in ('Age', 'EstimatedSalary'):
    stat, p_value = shapiro(df[col])
    # A p-value above 0.05 means normality is not rejected at the 5% level.
    verdict = (
        "  Result: Data appears to be normally distributed (p > 0.05)"
        if p_value > 0.05
        else "  Result: Data does not appear to be normally distributed (p <= 0.05)"
    )
    print(
        f"\n{col}:",
        f"  Statistic: {stat:.4f}",
        f"  p-value: {p_value:.4f}",
        verdict,
        sep="\n",
    )


NORMALITY TESTS (Shapiro-Wilk)

Age:
  Statistic: 0.9785
  p-value: 0.0000
  Result: Data does not appear to be normally distributed (p <= 0.05)

EstimatedSalary:
  Statistic: 0.9600
  p-value: 0.0000
  Result: Data does not appear to be normally distributed (p <= 0.05)


## 3. Inferential Statistics - Hypothesis Testing


In [4]:
# Test: does Age differ significantly between purchasers and non-purchasers?
print("HYPOTHESIS TESTING")
print("="*50)

# Split Age by outcome (.loc selects rows and column in one step, avoiding
# chained indexing).
age_purchased = df.loc[df['Purchased'] == 1, 'Age']
age_not_purchased = df.loc[df['Purchased'] == 0, 'Age']

# Shapiro-Wilk on each group decides which two-sample test is used below.
_, p_age1 = shapiro(age_purchased)
_, p_age2 = shapiro(age_not_purchased)

if p_age1 > 0.05 and p_age2 > 0.05:
    # Neither group rejects normality: independent-samples t-test.
    t_stat, p_value = ttest_ind(age_purchased, age_not_purchased)
    print(f"\nAge Difference (t-test):")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
else:
    # At least one group rejects normality: rank-based Mann-Whitney U test.
    # `alternative` is explicit because SciPy releases before 1.7 defaulted to
    # a deprecated one-sided mode that halves the p-value, and this notebook
    # silences warnings, so that change would go unnoticed.
    u_stat, p_value = mannwhitneyu(
        age_purchased, age_not_purchased, alternative='two-sided'
    )
    print(f"\nAge Difference (Mann-Whitney U test):")
    print(f"  U-statistic: {u_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")

# Interpret whichever test ran at the 5% significance level.
if p_value < 0.05:
    print(f"  Result: Significant difference (p < 0.05)")
else:
    print(f"  Result: No significant difference (p >= 0.05)")


HYPOTHESIS TESTING

Age Difference (Mann-Whitney U test):
  U-statistic: 31921.0000
  p-value: 0.0000
  Result: Significant difference (p < 0.05)


In [5]:
# Test: does EstimatedSalary differ between purchasers and non-purchasers?
# Split salary by outcome (.loc avoids chained indexing).
salary_purchased = df.loc[df['Purchased'] == 1, 'EstimatedSalary']
salary_not_purchased = df.loc[df['Purchased'] == 0, 'EstimatedSalary']

# Shapiro-Wilk on each group decides which two-sample test is used below.
_, p_sal1 = shapiro(salary_purchased)
_, p_sal2 = shapiro(salary_not_purchased)

if p_sal1 > 0.05 and p_sal2 > 0.05:
    # Neither group rejects normality: independent-samples t-test.
    t_stat, p_value = ttest_ind(salary_purchased, salary_not_purchased)
    print(f"\nSalary Difference (t-test):")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
else:
    # At least one group rejects normality: rank-based Mann-Whitney U test.
    # `alternative` is explicit because SciPy releases before 1.7 defaulted to
    # a deprecated one-sided mode that halves the p-value, and this notebook
    # silences warnings, so that change would go unnoticed.
    u_stat, p_value = mannwhitneyu(
        salary_purchased, salary_not_purchased, alternative='two-sided'
    )
    print(f"\nSalary Difference (Mann-Whitney U test):")
    print(f"  U-statistic: {u_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")

# Interpret whichever test ran at the 5% significance level.
if p_value < 0.05:
    print(f"  Result: Significant difference (p < 0.05)")
else:
    print(f"  Result: No significant difference (p >= 0.05)")



Salary Difference (Mann-Whitney U test):
  U-statistic: 25167.5000
  p-value: 0.0000
  Result: Significant difference (p < 0.05)


## 4. Chi-Square Test for Categorical Variables


In [6]:
# Is Gender associated with the purchase outcome? Chi-square test of independence
# on the Gender x Purchased cross-tabulation.
contingency_table = pd.crosstab(index=df['Gender'], columns=df['Purchased'])
print("CONTINGENCY TABLE: Gender vs Purchased")
print(contingency_table)

# Unpacks to (statistic, p-value, degrees of freedom, expected frequencies).
# Per the SciPy docs, Yates' continuity correction is applied by default when
# the table has one degree of freedom (a 2x2 table).
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print(
    "\nChi-square Test:",
    f"  Chi-square statistic: {chi2:.4f}",
    f"  p-value: {p_value:.4f}",
    f"  Degrees of freedom: {dof}",
    sep="\n",
)

# Interpret at the 5% significance level.
outcome = (
    "Significant association (p < 0.05)"
    if p_value < 0.05
    else "No significant association (p >= 0.05)"
)
print(f"  Result: {outcome}")


CONTINGENCY TABLE: Gender vs Purchased
Purchased    0   1
Gender            
Female     127  77
Male       130  66

Chi-square Test:
  Chi-square statistic: 0.5551
  p-value: 0.4562
  Degrees of freedom: 1
  Result: No significant association (p >= 0.05)


## 5. Correlation Analysis


In [7]:
# Pairwise correlations among the numeric features and the purchase outcome.
corr_columns = ['Age', 'EstimatedSalary', 'Purchased']

# Linear (Pearson) correlation matrix.
pearson_corr = df[corr_columns].corr(method='pearson')
print("PEARSON CORRELATION")
print(pearson_corr)

# Rank-based (Spearman) correlation matrix, a non-parametric counterpart.
spearman_corr = df[corr_columns].corr(method='spearman')
print("\nSPEARMAN CORRELATION")
print(spearman_corr)

# Significance of each feature's Pearson correlation with the outcome.
from scipy.stats import pearsonr

corr_age_purchased, p_age = pearsonr(df['Age'], df['Purchased'])
corr_salary_purchased, p_salary = pearsonr(df['EstimatedSalary'], df['Purchased'])

print("\nCorrelation Tests:")
for label, r_value, p_val in (
    ("Age", corr_age_purchased, p_age),
    ("Salary", corr_salary_purchased, p_salary),
):
    print(f"{label} vs Purchased: r={r_value:.4f}, p={p_val:.4f}")


PEARSON CORRELATION
                      Age  EstimatedSalary  Purchased
Age              1.000000         0.155238   0.622454
EstimatedSalary  0.155238         1.000000   0.362083
Purchased        0.622454         0.362083   1.000000

SPEARMAN CORRELATION
                      Age  EstimatedSalary  Purchased
Age              1.000000         0.124824   0.612323
EstimatedSalary  0.124824         1.000000   0.306858
Purchased        0.612323         0.306858   1.000000

Correlation Tests:
Age vs Purchased: r=0.6225, p=0.0000
Salary vs Purchased: r=0.3621, p=0.0000


## 6. ANOVA (Analysis of Variance)


In [8]:
# One-way ANOVA of each numeric feature against purchase status.
# C(...) tells the formula to treat Purchased as a categorical factor;
# typ=2 requests a Type II sums-of-squares table.

# Age ~ Purchased
model_age = ols('Age ~ C(Purchased)', data=df).fit()
anova_table_age = sm.stats.anova_lm(model_age, typ=2)
print("ANOVA: Age by Purchase Status", anova_table_age, sep="\n")

# EstimatedSalary ~ Purchased
model_salary = ols('EstimatedSalary ~ C(Purchased)', data=df).fit()
anova_table_salary = sm.stats.anova_lm(model_salary, typ=2)
print("\nANOVA: Estimated Salary by Purchase Status", anova_table_salary, sep="\n")


ANOVA: Age by Purchase Status
                    sum_sq     df           F        PR(>F)
C(Purchased)  16988.250031    1.0  251.742061  2.800234e-44
Residual      26858.139969  398.0         NaN           NaN

ANOVA: Estimated Salary by Purchase Status
                    sum_sq     df          F        PR(>F)
C(Purchased)  6.081638e+10    1.0  60.052579  7.772785e-14
Residual      4.030621e+11  398.0        NaN           NaN
