In [None]:
# Import Libraries
import numpy as np
import matplotlib.pyplot as plt  
import pandas as pd
import seaborn as sns

#from warnings import filterwarnings
#filterwarnings(action='ignore')

In [None]:
# Load the combined, red-only and white-only wine datasets (paths are relative
# to the notebook's working directory), in that order.
all_wine_raw_df, red_wine_df, white_wine_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Wine_data_both.csv',
        './Wine_data_red.csv',
        './Wine_data_white.csv',
    )
)

# Preview the first rows of the combined dataset
all_wine_raw_df.head()

In [None]:
# (rows, columns) of the combined dataset
combined_shape = all_wine_raw_df.shape
print(combined_shape)

In [None]:
# Data Exploration
# Summary statistics for every column (include='all' also summarises non-numeric columns)
summary_stats = all_wine_raw_df.describe(include='all')
summary_stats
# Notes:
# - White wine outweighs red: 4898 of 6497 rows (75%) - will need to balance out
# - Quality range is 3-9, with the majority at 6
# - Standard deviation is high for both free sulfur dioxide and total sulfur dioxide
# - Low standard deviation (less than or near 1) for: quality, alcohol, sulphates, pH,
#   density, chlorides, citric acid, volatile acidity, fixed acidity
# - Residual sugar has some variance: 5.4 mean, 0.6 min, 65.8 max

In [None]:
# Count missing values in each column
null_counts = all_wine_raw_df.isnull().sum()
print(null_counts)
# Result: there are no nulls

In [None]:


# Feature correlation matrix
# Work on a copy so the raw frame stays untouched.
combined_data = all_wine_raw_df.copy()
# Correlate numeric columns only: since pandas 2.0, DataFrame.corr() no longer
# silently skips non-numeric columns (e.g. a wine-type label) and raises on them
# instead. The selection is applied to this call only, so combined_data still
# carries every column for the cells that follow.
combined_data.select_dtypes(include='number').corr()
# Moderate correlation: 0.5-0.7
# - Positive, moderate correlations: Total Sulfur Dioxide & Residual Sugar, Density & Fixed Acidity, Density & Residual Sugar
# - Negative, moderate correlations: Density & Alcohol
# - Positive, high correlations: Free Sulfur Dioxide & Total Sulfur Dioxide (consider removing one)
# Correlations vs. Quality that matter (not weak): Density & Alcohol

In [None]:
# Mean of each feature per quality score
# numeric_only=True keeps non-numeric columns out of the aggregation; without it,
# pandas 2.0+ raises when the frame contains a string column.
combined_data.groupby('quality').mean(numeric_only=True)
# Domain notes:
# fixed acidity - organic acids that contribute to taste, color, and stability of wine (data is within average)
# volatile acidity - acetic acid produced by bacteria or yeast during fermentation or aging; considered a wine fault (contributes to pungency & sourness), should be less than 0.4 (higher quality scores show less than 0.4)
# citric acid - not important to wine quality (consumed mostly by bacteria during fermentation)
# residual sugar - enhances wine, balancing sourness caused by acids
# chlorides - not important to wine quality
# free sulfur dioxide - adds freshness & crispness (important)
# total sulfur dioxide - same as above (important)
# density - ideal range (0.99)
# pH - acidity of wine; anything less than 3 needs to be balanced out with other components; acidity seems to be in the ideal range of 3.2 to 3.4 across all quality scores
# sulphates - wines with lower acidity need more sulphates
# alcohol - generally low, but more within range in higher-quality wines (perhaps something reflected in red vs. white?)

In [None]:
# Data Analysis
# Histogram of every column, one figure per column
for feature_name, feature_values in combined_data.items():
    plt.hist(feature_values, bins=10)  # bin count can be tuned as needed
    plt.title(f'Histogram of {feature_name}')
    plt.xlabel(feature_name)
    plt.ylabel('Frequency')
    plt.show()

# pH - normally distributed

In [None]:
# Box plots: separate subplots on a 4x4 grid, x-axes not shared
combined_data.plot.box(subplots=True, layout=(4, 4), sharex=False)

In [None]:
# Density (KDE) plots: separate subplots on a 4x4 grid, x-axes not shared
combined_data.plot.density(subplots=True, layout=(4, 4), sharex=False)

In [None]:
# Heat map of the pairwise feature correlations
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr() raises on
# non-numeric columns instead of silently skipping them.
corr = combined_data.select_dtypes(include='number').corr()
# annot=True writes each correlation coefficient inside its cell
sns.heatmap(corr, annot=True)

In [None]:
# Pair plot: pairwise relationships between the columns of the combined dataset
sns.pairplot(data=combined_data)

In [None]:
# Violin plot: distribution of alcohol at each quality score
sns.violinplot(data=combined_data, x='quality', y='alcohol')

In [None]:
# Violin plot: distribution of density at each quality score
sns.violinplot(data=combined_data, x='quality', y='density')

In [None]:
# Violin plot: distribution of fixed acidity at each quality score
sns.violinplot(data=combined_data, x='quality', y='fixed acidity')

In [None]:
# Violin plot: distribution of residual sugar at each quality score
sns.violinplot(data=combined_data, x='quality', y='residual sugar')

In [None]:
# Violin plot: distribution of free sulfur dioxide at each quality score
sns.violinplot(data=combined_data, x='quality', y='free sulfur dioxide')

In [None]:
# Violin plot: distribution of total sulfur dioxide at each quality score
sns.violinplot(data=combined_data, x='quality', y='total sulfur dioxide')

In [None]:
# Build Model
# Text interpreting and describing the results
# Export Results