In [1]:
%autosave 0

Autosave disabled


# EXPLORATION PHASE

In [2]:
# imported libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# custom module
import wrangle as w

In [3]:
df = w.wrangle_wine()

In [4]:
df.T

AttributeError: 'tuple' object has no attribute 'T'

In [None]:
# data distribution
for col in df.columns[df.dtypes == 'float64']:
    plt.figure()
    plt.title(f'Distribution of {col}')
    sns.histplot(data=df, x=col)
    plt.show()

In [None]:
df.columns.to_list()

In [None]:
numericals = ['fixed_acidity',
             'volatile_acidity',
             'citric_acid',
             'residual_sugar',
             'chlorides',
             'free_sulfur_dioxide',
             'total_sulfur_dioxide',
             'density',
             'ph',
             'sulphates',
             'alcohol',
             'quality']

In [None]:
# sns.pairplot(data=df[numericals], hue='quality', corner=True)

In [None]:
df.columns.to_list()

In [None]:
fig, axs = plt.subplots(12,1, figsize=(4,25))
for col, ax in zip(numericals, axs):
    ax.hist(df[col])
    ax.set_title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

## scaled features cannot be explored thoroughly, so i am importing only the clean version to explore.

In [None]:
df = w.clean_wine()

In [None]:
df.head(2)

In [None]:
df.dtypes

In [None]:
numericals = ['fixed_acidity',
               'volatile_acidity',
               'citric_acid',
               'residual_sugar',
               'chlorides',
               'free_sulfur_dioxide',
               'total_sulfur_dioxide',
               'density',
               'ph',
               'sulphates',
               'alcohol',
               'quality',
               'total_sulfur_ratio',
               'acidity_level',
               'sugar_alcohol_ratio',
               'chlorides_ph_ratio',
               'density_ph_ratio',
               'sulfur_dioxide_level',
               'sulfates_chlorides_ratio',
               'total_acid',
               'sulfur_dioxide_chlorides_ratio',
               'residual_sugar_ph_ratio',
               'acid_ratio',
               'alcohol_ph_ratio',
               'chlorides_density_ratio',
               'total_sulfur_residual_sugar_ratio',
               'ph_chlorides_ratio',
               'alcohol_sugar_ratio',
               'density_sulfates_ratio',
               'chlorides_sulfates_ratio',
               'residual_sugar_percentage',
               'alcohol_chlorides_ratio',
               'density_sulfur_dioxide_ratio',
               'ph_sulfur_dioxide_ratio',
               'sulfur_dioxide_sugar_ratio'
              ]
categoricals = ['quality_bins', 'alcohol_bins', 'ph_bins']
explore_cols = numericals + categoricals

In [None]:
#sns.pairplot(data=df[numericals], hue='quality', corner=True)

In [None]:
for col in df.columns[df.dtypes == 'category']:
    print(df[col].value_counts())
    print()
    print('--------------------')
    print()

In [None]:
fig, axs = plt.subplots(40,1, figsize=(6,80))
for col, ax in zip(explore_cols, axs):
    ax.hist(df[col])
    ax.set_title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

In [None]:
df['alcohol_bins'] = pd.cut(df['alcohol'],[0, 8, 10, 12, 15], labels=['no_alcohol', 'Low_alcohol','medium_alcohol', 'High_alcohol'])
df['quality_bins'] = pd.cut(df['quality'], [0, 3, 7, 10], labels=['low', 'medium', 'high'])

In [None]:
sns.countplot(data=df, x='alcohol_bins', hue='quality_bins')
plt.show()

In [None]:
def eval_p(p, a=0.05, decimal_places=2):
    """
    Evaluate the p-value and print the result of hypothesis testing.

    Args:
        p (float): The p-value to evaluate.
        a (float, optional): The significance level (default is 0.05).
        decimal_places (int, optional): The number of decimal places for formatting p-value (default is 2).

    Returns:
        None
    """
    formatted_p = "{:.{precision}e}".format(p, precision=decimal_places)
    if p < a:
        print(f'\nWe reject the null hypothesis with a p-value of {formatted_p}.')
    else:
        print(f'\nWe failed to reject the null hypothesis with a p-value of {formatted_p}.')



def chi2_and_visualize(df, cat_var, target, a=0.05, decimal_places=2):
    """
    Perform chi-squared test and visualize the results.

    Args:
        df (DataFrame): The DataFrame containing the data.
        cat_var (str): The categorical variable to be tested.
        target (str): The target variable for the chi-squared test.
        a (float, optional): The significance level (default is 0.05).
        decimal_places (int, optional): The number of decimal places for formatting p-value (default is 2).

    Returns:
        None
    """
    observed = pd.crosstab(df[cat_var], df[target])
    chi2, p, degf, e = stats.chi2_contingency(observed)

    print('\n\n----------------------------------------------------------------------------------------------------\n')
    
    print(f'Chi2 Statistic: {chi2:.2f}\n')
    formatted_p = "{:.{precision}e}".format(p, precision=decimal_places)
    print(f'P-Value: {formatted_p}\n')
    # print(f'Degrees of Freedom: {degf}\n')
    # print(f'Expected: {e}\n')

    # Plotting the countplot
    sns.countplot(data=df, x=cat_var, hue=target)
    plt.title(f'Wine Quality vs. Alcohol Content Range')
    plt.xlabel(f'Alcohol Content')
    plt.ylabel(f'Wine Quality')
    plt.legend(title='Wine Quality', labels=['Low', 'Medium', 'High'])
    plt.show()

    eval_p(p)

    print('\n')

def analysis_1(df, cat_var, target, a=0.05):
    """
    Perform chi-squared test and visualize the results for quality vs. alcohol.

    Args:
        df (DataFrame): The DataFrame containing the data.
        cat_var (str): The categorical variable (alcohol bins) to be tested.
        target (str): The target variable (wine quality) for the chi-squared test.
        a (float, optional): The significance level (default is 0.05).

    Returns:
        None
    """
    chi2_and_visualize(df, cat_var, target, a=0.05)

## created categorical features for chi_2 test

In [None]:
df.columns.to_list()

## Q1. Is alcohol dependent on quality of wine
**(H$0$:) The quality of wine is NOT dependent on the alcohol content.**  
**(H$a$:) The quality of wine is dependent on the alcohol content.**

## Takeaway: 
**We can determine that the quality of wine does is not dependent on the alcohol content of the wine. We can also clearly ee that most of the wine in the dataset is comprised of medium quality wine which ranges from a rank of 4 to 6.**

In [None]:
analysis_1(df, 'alcohol_bins', 'quality_bins')

## Q2. **Is there a statistically significant difference in the mean alcohol percentage between wines of low quality and high quality?** 

> **(H$0$:) The mean alcohol percentage of low-quality wines is equal to the mean alcohol percentage of high-quality wines.**
> 
> **(H$a$:) The mean alcohol percentage of low-quality wines is not equal to the mean alcohol percentage of high-quality wines.**

In [None]:
def eval_p(p, a=0.05, decimal_places=2):
    """
    Evaluate the p-value and print the result of hypothesis testing.

    Args:
        p (float): The p-value to evaluate.
        a (float, optional): The significance level (default is 0.05).
        decimal_places (int, optional): The number of decimal places for formatting p-value (default is 2).

    Returns:
        None
    """
    formatted_p = "{:.{precision}e}".format(p, precision=decimal_places)
    if p < a:
        print(f'\nWe reject the null hypothesis with a p-value of {formatted_p}.')
    else:
        print(f'\nWe failed to reject the null hypothesis with a p-value of {formatted_p}.')

def one_sample_t_test(data, pop_mean, a=0.05):
    t, p = stats.ttest_1samp(data, pop_mean)
    result = eval_p(p)
    
    print(f'T-Statistic: {t}\n')
    print(result)

    # Plot a histogram
    plt.figure(figsize=(8, 6))
    sns.histplot(data)
    
    # Add a vertical line for the population mean
    plt.axvline(x=pop_mean, color='red', linestyle='--', label=f'Population Mean ({pop_mean:.1f})')
    
    plt.title(f'1-Sample t-test Analysis\nData: {data.name}\n')
    plt.xlabel('Values')
    plt.ylabel('Frequency')
    plt.legend()  # Add a legend to label the vertical line
    plt.show()

def analysis_2(df, num_col_name, a=0.05):
    num_col = df[num_col_name]
    pop_mean = df[num_col_name].mean()
    one_sample_t_test(num_col, pop_mean, a=0.05)


In [None]:
# visualize the results
# one_sample_t_test(data, pop_mean)

## Takeaway:

**The mean alcohol percentage of low-quality wines is equal to the mean alcohol percentage of high-quality wines. a larger frequency of wine is more grouped on te 9.5 to 11 range of values, so the mean value make sense for alcohol content.**


# nisha's function stats test

In [None]:
# def get_dummies(df, col_name):
#     # Create dummy variables for the specified column
#     dummies = pd.get_dummies(df[col_name], prefix=col_name)
    
#     # Drop the original column
#     df.drop(col_name, axis=1, inplace=True)
    
#     # Concatenate the dummies with the DataFrame
#     df = pd.concat([df, dummies], axis=1)
    
#     return df

# # Usage
# df = get_dummies(df, 'quality_bins')

In [None]:
# df.columns.to_list()

In [None]:
train, val, test= w.wrangle_wine()

In [None]:
df.head().T