import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import seaborn as sns
import matplotlib.pyplot as plt

# Load both wine-quality datasets (the CSV files are semicolon-delimited)
white_wine = pd.read_csv('/Users/micah/Desktop/winequality-white.csv', sep=';')
red_wine = pd.read_csv('/Users/micah/Desktop/winequality-red.csv', sep=';')

# 1. Report the number of missing values per column, white wine first
for heading, frame in (
    ("Missing values in white wine:", white_wine),
    ("\nMissing values in red wine:", red_wine),
):
    print(heading)
    print(frame.isnull().sum())

# 2. Check for outliers using IQR method
def detect_outliers(df, feature):
    """Count the rows of ``df`` whose ``feature`` value is an IQR outlier.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame containing the column to inspect.
        feature: Label of the column to inspect.

    Returns:
        The number of outlying rows as a plain ``int``.
    """
    values = df[feature]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)

    too_low = values < first_quartile - whisker
    too_high = values > third_quartile + whisker
    # Summing the boolean mask counts the flagged rows directly.
    return int((too_low | too_high).sum())

# Columns inspected for outliers (every column except the 'quality' target)
numeric_features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Print, feature by feature, the white-wine count followed by the red-wine count
print("\nOutlier counts for each feature:")
for feature in numeric_features:
    white_outlier_count = detect_outliers(white_wine, feature)
    print(f"{feature}: {white_outlier_count} outliers in white wine")
    red_outlier_count = detect_outliers(red_wine, feature)
    print(f"{feature}: {red_outlier_count} outliers in red wine")

# 3. Split data into training and testing sets
# The split happens BEFORE any fitting (scaler, feature selector) so that no
# statistic of the held-out rows can influence the preprocessing.

# For white wine
X_white = white_wine.drop('quality', axis=1)
y_white = white_wine['quality']
X_white_train_raw, X_white_test_raw, y_white_train, y_white_test = train_test_split(
    X_white, y_white, test_size=0.2, random_state=42
)

# For red wine
X_red = red_wine.drop('quality', axis=1)
y_red = red_wine['quality']
X_red_train_raw, X_red_test_raw, y_red_train, y_red_test = train_test_split(
    X_red, y_red, test_size=0.2, random_state=42
)


# 4. Feature scaling (fitted on the training partition only)
def scale_train_test(X_train, X_test):
    """Standardize both partitions using statistics learned from X_train only.

    Args:
        X_train: Training feature DataFrame; the scaler is fitted on it.
        X_test: Held-out feature DataFrame; it is only transformed.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled, scaler)``: two DataFrames
        that keep the column labels and row index of their inputs, and the
        fitted ``StandardScaler``.
    """
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = pd.DataFrame(
        scaler.transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )
    return X_train_scaled, X_test_scaled, scaler


# One scaler per dataset, so each keeps the parameters of its own wine type.
X_white_train, X_white_test, white_scaler = scale_train_test(
    X_white_train_raw, X_white_test_raw
)
X_red_train, X_red_test, red_scaler = scale_train_test(
    X_red_train_raw, X_red_test_raw
)


# 5. Feature selection using SelectKBest
def select_features(X, y, k=5):
    """Return the labels of the ``k`` columns of ``X`` ranked best by f_classif.

    Args:
        X: Feature DataFrame; its column labels are what gets returned.
        y: Target values aligned with the rows of ``X``.
        k: Number of features to keep (passed to ``SelectKBest``).

    Returns:
        List of the selected column labels, in the column order of ``X``.
    """
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X, y)
    selected_features = X.columns[selector.get_support()].tolist()
    return selected_features


# Select top 5 features for each dataset, scored on the training rows only so
# the test labels play no part in choosing features.
white_selected_features = select_features(X_white_train, y_white_train)
red_selected_features = select_features(X_red_train, y_red_train)

print("\nTop 5 features for white wine:", white_selected_features)
print("Top 5 features for red wine:", red_selected_features)

# 6. Check class imbalance
# Print the relative frequency of every quality score in each dataset
print("\nClass distribution in white wine:")
white_class_share = y_white.value_counts(normalize=True)
print(white_class_share)
print("\nClass distribution in red wine:")
red_class_share = y_red.value_counts(normalize=True)
print(red_class_share)

# Visualize class distribution: one count plot per dataset, side by side
fig, (white_ax, red_ax) = plt.subplots(1, 2, figsize=(12, 5))

sns.countplot(x=y_white, ax=white_ax)
white_ax.set_title('White Wine Quality Distribution')

sns.countplot(x=y_red, ax=red_ax)
red_ax.set_title('Red Wine Quality Distribution')

fig.tight_layout()
plt.show()

# Save preprocessed data
import pickle

# Bundle the splits and the selected feature names for each wine type
white_wine_bundle = {
    'X_train': X_white_train,
    'X_test': X_white_test,
    'y_train': y_white_train,
    'y_test': y_white_test,
    'selected_features': white_selected_features,
}
red_wine_bundle = {
    'X_train': X_red_train,
    'X_test': X_red_test,
    'y_train': y_red_train,
    'y_test': y_red_test,
    'selected_features': red_selected_features,
}
preprocessed_data = {
    'white_wine': white_wine_bundle,
    'red_wine': red_wine_bundle,
}

# Serialize the whole dictionary to a pickle file in the working directory
with open('preprocessed_data.pkl', 'wb') as pickle_file:
    pickle.dump(preprocessed_data, pickle_file)