# Project 5

### Imports

In [69]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind

import matplotlib.pyplot as plt

### Add CSV

In [None]:
# Load the red and white wine tables (semicolon-separated CSV files).
df_red = pd.read_csv('winequality-red.csv', sep=';')
df_white = pd.read_csv('winequality-white.csv', sep=';')

# Add a column to distinguish between red and white wines
df_red['red'] = 1
df_white['red'] = 0

# Concatenate the two dataframes and get rid of missing data first, so the
# per-quality record counts below are computed on the rows that are kept.
df_wine = pd.concat([df_red, df_white], ignore_index=True).dropna()

# Remove quality levels that have fewer than `min_records` rows.
min_records = 500
quality_counts = df_wine['quality'].value_counts()
valid_quality_levels = quality_counts[quality_counts >= min_records].index

# .copy() makes the filtered result an independent frame rather than a
# selection of the concatenated one.
df_wine = df_wine[df_wine['quality'].isin(valid_quality_levels)].copy()
print(df_wine['quality'].unique())

# Notebook display of the cleaned frame.
df_wine

### Find best features

In [None]:
# Columns of df_wine whose distributions are plotted, one subplot each.
numeric = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'red',
]

# Size the grid from the number of features (3 columns, as many rows as
# needed) so that editing the list above cannot overflow the axes array.
# With the 12 features listed this is the same 4x3 grid at 20x20 inches.
ncols = 3
nrows = (len(numeric) + ncols - 1) // ncols  # ceiling division
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 5 * nrows))
axs = axs.ravel()
fig.suptitle('Numeric Features by Quality')

# One filled KDE per feature, with a separate curve per quality level.
for ax, feature in zip(axs, numeric):
    sns.kdeplot(data=df_wine, x=feature, hue='quality', ax=ax, fill=True)

# Hide any axes left over when the feature count does not fill the grid.
for ax in axs[len(numeric):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Correlation of every column with 'quality', highest first; the
# self-correlation entry for 'quality' is removed.
quality_column = df_wine.corr()['quality']
ranked = quality_column.sort_values(ascending=False)
correlations = ranked.drop(labels='quality')
print(correlations)

In [None]:
# Rank features by the magnitude of their correlation with quality,
# regardless of sign. Series.abs() is the vectorised equivalent of applying
# abs() to each element.
correlations = correlations.abs().sort_values(ascending=False)
correlations

### Perform Gaussian Naive Bayes

In [None]:
# Features in the order given by `correlations`.
list_of_features = correlations.index.tolist()

# Split once, with a fixed seed, so every feature subset below is trained and
# scored on the same rows; otherwise differences between iterations mix the
# effect of the added feature with the randomness of the split. Stratifying
# keeps each quality level's proportion the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    df_wine[list_of_features],
    df_wine['quality'],
    test_size=0.25,
    random_state=42,
    stratify=df_wine['quality'],
)

# Evaluate the model on the first 1, 2, ..., all features.
for i in range(1, len(list_of_features) + 1):
    selected_features = list_of_features[:i]

    # Use Gaussian for datasets with quantitative variables
    clf = GaussianNB()
    clf.fit(X_train[selected_features], y_train)
    y_pred = clf.predict(X_test[selected_features])

    # Passing labels explicitly fixes the order of the per-class arrays, so
    # the class labels printed next line up with the scores one-to-one.
    p, r, f, s = precision_recall_fscore_support(
        y_test, y_pred, labels=clf.classes_
    )
    print(clf.classes_)
    print(f'precision={p}, recall={r}, f-score={f}, support={s}')