In [None]:
import seaborn as sns
sns.set_palette('husl')
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import hopsworks
import pandas as pd

In [None]:
# Log in to Hopsworks; the returned project object is the entry point for the
# rest of the notebook.
project = hopsworks.login()
# Handle to the project's feature store, used further down to create the
# wine feature group.
fs = project.get_feature_store()

In [None]:
# Red-wine samples from the UCI wine-quality dataset; the file is ';'-delimited.
red_wine_df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
    sep=";",
)

red_wine_df

In [None]:
# White-wine samples from the UCI wine-quality dataset; the file is ';'-delimited.
white_wine_df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    sep=";",
)

white_wine_df

In [None]:
# Tag every sample with its colour before merging: 0 = red, 1 = white.
red_wine_df['wine_type'] = 0
white_wine_df['wine_type'] = 1

# Stack red on top of white. ignore_index=True already produces a fresh
# 0..n-1 index, so no separate reset_index step is needed.
wine_df = pd.concat((red_wine_df, white_wine_df), axis=0, ignore_index=True)
wine_df

In [None]:
# Count the missing entries in each column of the combined frame.
missing_per_column = wine_df.isna().sum()
print(missing_per_column)

In [None]:
# Column dtypes and non-null counts of the combined red + white frame.
wine_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
wine_df.describe()

In [None]:
# Number of samples per quality score, to inspect the class balance of the target.
wine_df['quality'].value_counts()

### Exploratory Data Analysis (EDA) of our Wine Data

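Let's look at our wine data - the distribution and range of values for the different features
 * fixed acidity
 * volatile acidity
 * citric acid
 * residual sugar
 * chlorides
 * free sulfur dioxide
 * total sulfur dioxide
 * density
 * pH
 * sulphates
 * alcohol
 * wine_type (0 = red, 1 = white)
 
 and the target variable is `quality`.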

### Visualize range of values 

We want to design a simulator that generates wines for each quality score.
To do this, we can visualize the range of values of each feature for each quality score.

In [None]:
# Feature matrix: every column except the label.
X = wine_df.drop(columns=['quality'])
# Target vector: the quality score of each sample.
Y = wine_df['quality']

In [None]:
from imblearn.over_sampling import SMOTE

# Balance the quality classes by synthesising extra minority-class samples.
# random_state pins the synthetic samples so that a re-run of the notebook
# reproduces the same heatmap, feature selection and uploaded data
# (42 matches the seed used for train_test_split further down).
# k_neighbors=4 is presumably chosen because the rarest quality class has very
# few samples - TODO confirm.
# NOTE(review): SMOTE interpolates every column, so the 0/1 `wine_type` flag can
# come out fractional for synthetic rows; SMOTENC may be a better fit - confirm.
oversample = SMOTE(k_neighbors=4, random_state=42)
# Replace X and Y with their resampled (balanced) versions.
X, Y = oversample.fit_resample(X, Y)

In [None]:
# Re-attach the resampled target to the resampled features.
wine_df = pd.concat([X, pd.DataFrame(Y, columns=['quality'])], axis=1)

# Pairwise correlation between all columns of the resampled data.
# (The previous version called `df.corr()`, but no `df` exists in this
# notebook, so the cell failed with a NameError on a fresh kernel.)
correlation_matrix = wine_df.corr()
corr = correlation_matrix

# Draw the heatmap on a single, explicitly created figure; the earlier extra
# plt.figure() call only produced a stray empty figure.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Feature correlation matrix')
plt.show()

In [None]:
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Recursive feature elimination with cross-validation (RFECV): find the subset
# of wine_df's feature columns that best predicts the `quality` target.
X = wine_df.drop('quality', axis=1)
y = wine_df['quality']

# Split the data into training and testing sets
# NOTE(review): wine_df was already oversampled with SMOTE in an earlier cell,
# so synthetic rows derived from the same originals can land on both sides of
# this split - confirm that is acceptable for the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: fit the scaler on the training split only and reuse it
# for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Estimator used by RFECV to score feature subsets
model = LogisticRegression(max_iter=1000)

# Set up RFECV with the chosen model; step=1 is passed so that one feature is
# removed per elimination round.
rfecv = RFECV(estimator=model, step=1, cv=3)  # cv is the number of cross-validation folds

# Fit RFECV on the scaled training data
rfecv.fit(X_train_scaled, y_train)

# rfecv.support_ is a boolean mask over the input columns; use it to recover
# the names of the selected features.
selected_features = X_train.columns[rfecv.support_]

# Print the selected features
print("Optimal Number of Features: ", rfecv.n_features_)
print("Selected Features: ", selected_features)

In [None]:
# Samples per quality score after oversampling. The frame is called `wine_df`;
# the earlier `df` was never defined and raised a NameError.
wine_df['quality'].value_counts()

In [None]:
# One violin plot per feature, grouped by quality score, each rendered as its
# own figure in the order listed here.
violin_features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
for feature_name in violin_features:
    g = sns.violinplot(y=feature_name, x='quality', data=wine_df, inner='quartile')
    plt.show()

In [None]:
# Drop the feature that is not uploaded to the feature store.
# errors='ignore' keeps this cell re-runnable: after the first run the column
# is already gone (and the remaining names no longer contain spaces), so a
# plain drop would raise a KeyError on a second execution.
wine_df = wine_df.drop(columns=['total sulfur dioxide'], errors='ignore')

# Replace the spaces in the remaining column names with underscores.
wine_df.columns = wine_df.columns.str.replace(' ', '_')

# Guard against the drop being silently skipped (e.g. a renamed source column).
assert 'total_sulfur_dioxide' not in wine_df.columns

### Insert our Wine DataFrame into a FeatureGroup
Note: features whose correlation with `quality` is below 0.1 in absolute value are candidates for dropping. For white wine, pH and citric acid look especially irrelevant. TODO: consider XGBoost?

In [None]:
# Columns that together form the primary key of the feature group.
wine_primary_key = [
    "density",
    "volatile_acidity",
    "chlorides",
    "alcohol",
    "wine_type",
    "fixed_acidity",
    "citric_acid",
    "sulphates",
    "free_sulfur_dioxide",
    "pH",
    "residual_sugar",
]

# Fetch version 1 of the "wine2" feature group, creating it if necessary.
wine_fg = fs.get_or_create_feature_group(
    name="wine2",
    version=1,
    primary_key=wine_primary_key,
    description="Wine dataset",
)
# Write the prepared dataframe into the feature group.
wine_fg.insert(wine_df)

### Data Validation
If you want, you can enable data validation for your feature group.
The code below will prevent wine data from being written to your feature group if you write values outside the expected ranges.

In [None]:
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

def expect(suite, column, min_val, max_val):
    """Add a value-range check for one column to an expectation suite.

    Builds an ``expect_column_values_to_be_between`` expectation for
    ``column`` and passes it to ``suite.add_expectation``.

    Args:
        suite: Expectation suite that receives the new expectation.
        column: Name of the column whose values are checked.
        min_val: Value passed as the expectation's ``min_value``.
        max_val: Value passed as the expectation's ``max_value``.

    Returns:
        None.
    """
    bounds = {
        "column": column,
        "min_value": min_val,
        "max_value": max_val,
    }
    range_check = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs=bounds,
    )
    suite.add_expectation(range_check)

In [None]:
# Expected (min, max) per column, as originally specified per wine type.
red_bounds = {
    "fixed_acidity": (4.6, 15.9),
    "volatile_acidity": (0.12, 1.58),
    "citric_acid": (0, 1),
    "chlorides": (0.012, 0.611),
    "total_sulfur_dioxide": (6, 289),
    "density": (0.99007, 1.00369),
    "sulphates": (0.33, 2),
    "alcohol": (8.4, 14.9),
}
white_bounds = {
    "fixed_acidity": (3.8, 14.2),
    "volatile_acidity": (0.08, 1.1),
    "chlorides": (0.009, 0.346),
    "total_sulfur_dioxide": (9, 440),
    "density": (0.98711, 1.03898),
    "alcohol": (8, 14.2),
}

# Red and white wines are stored together in the single feature group
# `wine_fg` (the `red_wine_fg` / `white_wine_fg` objects referenced before were
# never created). One suite is therefore attached to `wine_fg`, and each bound
# is the union of the red and white ranges so that neither type is rejected.
suite = ExpectationSuite(expectation_suite_name="wine_dimensions")

for column, (red_min, red_max) in red_bounds.items():
    # Only validate columns that are bounded for both wine types and that are
    # actually present in the uploaded dataframe. Red-only bounds would reject
    # valid white wines, and a column that was dropped before upload cannot be
    # validated under the STRICT policy.
    if column not in white_bounds or column not in wine_df.columns:
        continue
    white_min, white_max = white_bounds[column]
    expect(suite, column, min(red_min, white_min), max(red_max, white_max))

# STRICT is passed as the ingestion policy; the accompanying notebook text
# describes this as preventing out-of-range data from being written.
wine_fg.save_expectation_suite(expectation_suite=suite, validation_ingestion_policy="STRICT")
