In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error


In [None]:
# Load the red and white wine-quality tables (semicolon-delimited CSV files)
df_red_wine = pd.read_csv('winequality-red.csv', sep=';')
df_white_wine = pd.read_csv('winequality-white.csv', sep=';')

# Tag every row with its wine colour so the two sources stay distinguishable
# once they are stacked into a single frame
df_red_wine['type'] = 'Red'
df_white_wine['type'] = 'White'

# Stack the two frames. ignore_index=True gives the combined frame a fresh,
# unique RangeIndex; without it each file's 0..n-1 labels are repeated, which
# makes label-based lookups ambiguous and can break index-aligned operations.
df = pd.concat([df_red_wine, df_white_wine], ignore_index=True)

# Plot how many wines fall into each quality score, split by wine colour
plt.figure(figsize=(10, 6))
sns.countplot(x='quality', hue='type', data=df, palette={'Red': 'red', 'White': 'grey'})
plt.xlabel('Quality')
plt.ylabel('Count')
plt.title('Quality Distribution of Red and White Wines')
plt.show()

In [None]:
# Class balance of the target for each wine type: the red counts are printed
# as text, the white counts are rendered as the cell's output value.
red_quality_counts = df_red_wine['quality'].value_counts()
print(red_quality_counts)
df_white_wine['quality'].value_counts()

In [None]:
# Keep both raw frames in one list (red first, then white) so the
# preprocessing loop can treat them uniformly; index 0 = red, index 1 = white.
datasets_list = [df_red_wine, df_white_wine]

In [None]:
from sklearn.preprocessing import StandardScaler

# One scaler object is reused; fit_transform re-fits it on each dataset in turn,
# so after the loop it holds the statistics of the last dataset only.
scaler = StandardScaler()
scaled_datasets = []

for dataset in datasets_list:
    # Target: the quality score
    y = dataset['quality']

    # Features: every column except the target and the wine-colour tag
    X = dataset.drop(columns=['quality', 'type'])

    # Standardize the features, then rebuild a frame with the original column
    # names and re-attach the (unscaled) target positionally
    scaled_X = scaler.fit_transform(X)
    scaled_dataset = pd.DataFrame(scaled_X, columns=X.columns).assign(quality=y.values)
    scaled_datasets.append(scaled_dataset)


# Preview the first scaled dataset (red wine)
scaled_datasets[0].head()
    

In [None]:
# Test correlation on the first scaled dataset
# (index 0 is the red-wine frame, because datasets_list lists red first).
# The frame still contains 'quality', so the matrix includes each feature's
# correlation with the target as well as the feature-to-feature correlations.
correlation_matrix = scaled_datasets[0].corr()
correlation_matrix

In [None]:
from sklearn.utils import resample

# Function to downsample the majority class
def downsample_majority_class(dataset):
    """Shrink the most frequent ``quality`` class so it no longer dominates.

    The majority class is the mode of ``dataset['quality']`` (on ties,
    ``mode()`` returns sorted values, so the smallest label is used). Its rows
    are randomly sampled without replacement, with a fixed seed, down to the
    size of the next most frequent class. Rows of all other classes are kept
    untouched.

    For a two-class target this is the usual "match the minority class"
    behaviour. For a multi-class target, comparing the majority class with
    the *pooled* remaining classes would never remove any row whenever the
    pooled remainder is at least as large as the majority class, so the
    target size is taken from the runner-up class instead.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``quality`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of the non-majority classes followed by the downsampled majority
        rows. If the frame holds fewer than two distinct quality values
        (including an empty frame) there is nothing to balance against, and an
        unchanged copy of the input is returned.
    """
    class_counts = dataset['quality'].value_counts()

    # Nothing to balance against: avoid indexing an empty mode() and avoid
    # sampling the only class down to zero rows.
    if len(class_counts) < 2:
        return dataset.copy()

    # Separate majority and minority classes
    majority_class_label = dataset['quality'].mode()[0]
    is_majority = dataset['quality'] == majority_class_label
    minority_class = dataset[~is_majority]
    majority_class = dataset[is_majority]

    # Target size: the largest class other than the majority one. It can never
    # exceed len(majority_class), so sampling without replacement is valid.
    n_samples = int(class_counts.drop(majority_class_label).max())

    # Downsample majority class
    majority_class_downsampled = resample(majority_class,
                                          replace=False,  # sample without replacement
                                          n_samples=n_samples,  # match the runner-up class
                                          random_state=42)  # reproducible results

    # Combine minority classes with the downsampled majority class
    downsampled_dataset = pd.concat([minority_class, majority_class_downsampled])

    return downsampled_dataset

# Run the majority-class downsampling over every scaled dataset, keeping the
# same order as scaled_datasets
downsampled_datasets = list(map(downsample_majority_class, scaled_datasets))

In [None]:
# Class counts of the original (not downsampled) frames: red is printed as
# text, white is rendered as the cell's output value.
red_quality_counts = df_red_wine['quality'].value_counts()
print(red_quality_counts)
df_white_wine['quality'].value_counts()

In [None]:
# Print the class balance of every downsampled dataset. Iterating the list
# directly (instead of a hard-coded range(2)) keeps this correct if the number
# of datasets ever changes.
for downsampled_dataset in downsampled_datasets:
    print(downsampled_dataset['quality'].value_counts())
    


In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import numpy as np

# Cross-validation scheme: 3 stratified folds, repeated 10 times, fixed seed
cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=3, random_state=423)

# Metrics collected on every fold; recall and F1 use the 'weighted' average
# across classes
scoring = dict(
    accuracy='accuracy',
    recall=make_scorer(recall_score, average='weighted'),
    f1=make_scorer(f1_score, average='weighted'),
)

# Function to evaluate a model with cross-validation
def cross_validate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)``.

    Uses the notebook-level ``cv`` splitter and ``scoring`` mapping and
    returns the result of ``cross_validate`` for that call unchanged.
    """
    return cross_validate(model, X, y, cv=cv, scoring=scoring)

# NOTE(review): the features in scaled_datasets were standardized on each full
# dataset before this cross-validation, so the held-out folds contributed to
# the scaling statistics. Consider moving the scaler into a Pipeline.

# Baseline: plain logistic regression on each scaled dataset
base_metrics = []
for dataset in scaled_datasets:
    y = dataset['quality']
    X = dataset.drop(columns='quality')
    model = LogisticRegression(random_state=42)
    scores = cross_validate_model(model, X, y)
    base_metrics.append(scores)

# Downsampled datasets: same model, but the most frequent class gets half the
# weight of every other class
downsampled_metrics = []
for dataset in downsampled_datasets:
    y = dataset['quality']
    X = dataset.drop(columns='quality')
    majority_label = y.mode()[0]
    class_weight = {cls: (0.5 if cls == majority_label else 1.0) for cls in y.unique()}
    model = LogisticRegression(class_weight=class_weight, random_state=42)
    scores = cross_validate_model(model, X, y)
    downsampled_metrics.append(scores)

# Summarize each metric. The mean and std are taken over all folds of all
# datasets in the group, i.e. red and white results are pooled together.
base_metrics_mean, base_metrics_std = {}, {}
downsampled_metrics_mean, downsampled_metrics_std = {}, {}
for metric in scoring:
    key = 'test_' + metric
    base_values = [m[key] for m in base_metrics]
    downsampled_values = [m[key] for m in downsampled_metrics]
    base_metrics_mean[metric] = np.mean(base_values)
    base_metrics_std[metric] = np.std(base_values)
    downsampled_metrics_mean[metric] = np.mean(downsampled_values)
    downsampled_metrics_std[metric] = np.std(downsampled_values)

# Print the results, one section per group of models
for heading, means, stds in (
    ("Base Models Metrics (Mean ± Std):", base_metrics_mean, base_metrics_std),
    ("\nDownsampled Models Metrics (Mean ± Std):", downsampled_metrics_mean, downsampled_metrics_std),
):
    print(heading)
    for metric in scoring:
        print(f"{metric.capitalize()}: {means[metric]:.4f} ± {stds[metric]:.4f}")