In [36]:
# --- Setup & Imports ---
import os
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, trustworthiness
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Optional libs; used if installed
try:
    import umap
    HAVE_UMAP = True
except Exception:
    HAVE_UMAP = False

try:
    import tensorflow as tf
    from tensorflow.keras import layers, models
    HAVE_TF = True
except Exception:
    HAVE_TF = False

# --- Settings ---
RED_CSV = "winequality-red.csv"
WHITE_CSV = "winequality-white.csv"

# Save results in same folder as inputs
INPUT_DIR = os.path.dirname(os.path.abspath(RED_CSV))
OUT_DIR = INPUT_DIR

RANDOM_STATE = 42
SAMPLE_FOR_NONLINEAR = 800
TSNE_PERPLEXITY = 30

# Helper plotting function
def save_scatter(X2, labels, title, fname, legend_title='label'):
    plt.figure(figsize=(7,5))
    unique = np.unique(labels)
    for u in unique:
        mask = (labels == u)
        plt.scatter(X2[mask,0], X2[mask,1], s=8, label=str(u))
    plt.legend(title=legend_title, bbox_to_anchor=(1.05,1), loc='upper left', fontsize='small')
    plt.title(title)
    plt.xlabel('Dim1'); plt.ylabel('Dim2')
    plt.tight_layout()
    path = os.path.join(OUT_DIR, fname)
    plt.savefig(path, dpi=150)
    plt.close()
    print("Saved:", path)


In [37]:
# --- Load Data ---
def read_csv_flex(path):
    try:
        df = pd.read_csv(path, sep=';')
        if df.shape[1] <= 2:
            df = pd.read_csv(path, sep=',')
        return df
    except Exception:
        return pd.read_csv(path, sep=',', engine='python')

red = read_csv_flex(RED_CSV)
white = read_csv_flex(WHITE_CSV)

# Add type labels
red['wine_type'] = 'red'
white['wine_type'] = 'white'
df = pd.concat([red, white], ignore_index=True)

print("Loaded rows:", df.shape[0], "| Columns:", df.shape[1])
display(df.head())


Loaded rows: 6497 | Columns: 14


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5.0,red,
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5.0,red,
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5.0,red,
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6.0,red,
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5.0,red,


In [38]:
# --- Data Cleaning ---
missing_per_col = df.isna().sum()
print("Missing per column:\n", missing_per_col)

# Impute if necessary
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if missing_per_col.sum() > 0:
    imputer = SimpleImputer(strategy='median')
    df[num_cols] = imputer.fit_transform(df[num_cols])
    print("Imputed missing values with median.")

# Duplicates
print("Duplicate rows:", df.duplicated().sum())

# Outlier detection
def iqr_outlier_counts(df, cols):
    counts = {}
    for c in cols:
        q1 = df[c].quantile(0.25)
        q3 = df[c].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5*iqr
        upper = q3 + 1.5*iqr
        counts[c] = int(((df[c] < lower) | (df[c] > upper)).sum())
    return counts

outliers = iqr_outlier_counts(df, num_cols)
print("Outliers per column:", outliers)

# Winsorize extremes
df_winsor = df.copy()
for c in num_cols:
    low = df_winsor[c].quantile(0.01)
    high = df_winsor[c].quantile(0.99)
    df_winsor[c] = df_winsor[c].clip(lower=low, upper=high)

display(df_winsor.describe().T)


Missing per column:
 fixed acidity                                                                                                                                                              4898
volatile acidity                                                                                                                                                           4898
citric acid                                                                                                                                                                4898
residual sugar                                                                                                                                                             4898
chlorides                                                                                                                                                                  4898
free sulfur dioxide                                                                                

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,6497.0,7.998184,0.808717,6.0,7.9,7.9,7.9,12.0
volatile acidity,6497.0,0.521187,0.080859,0.26,0.52,0.52,0.52,0.87
citric acid,6497.0,0.262127,0.094186,0.0,0.26,0.26,0.26,0.6304
residual sugar,6497.0,2.261636,0.477189,1.5,2.2,2.2,2.2,5.6
chlorides,6497.0,0.079973,0.010802,0.052,0.079,0.079,0.079,0.15204
free sulfur dioxide,6497.0,14.388872,4.726653,4.0,14.0,14.0,14.0,37.0
total sulfur dioxide,6497.0,39.890103,15.239453,11.0,38.0,38.0,38.0,119.0
density,6497.0,0.996749,0.000837,0.99346,0.99675,0.99675,0.99675,1.000101
pH,6497.0,3.310089,0.068886,3.05,3.31,3.31,3.31,3.58
sulphates,6497.0,0.627342,0.066887,0.46,0.62,0.62,0.62,0.97


In [39]:
# --- Scaling & Feature Reduction ---
scaler_rob = RobustScaler()
X_scaled = scaler_rob.fit_transform(df_winsor[num_cols])

# Variance threshold
vt = VarianceThreshold(threshold=0.01)
vt.fit(X_scaled)
low_var = [num_cols[i] for i, keep in enumerate(vt.get_support()) if not keep]
print("Low-variance features:", low_var)

# Correlation check
corr = pd.DataFrame(X_scaled, columns=num_cols).corr().abs()
high_corr_pairs = [(a, b, float(corr.loc[a,b]))
                   for i,a in enumerate(corr.columns)
                   for j,b in enumerate(corr.columns) if j>i and corr.loc[a,b]>0.9]
print("Highly correlated pairs >0.90:", high_corr_pairs)

to_drop = set([b for a,b,_ in high_corr_pairs])
features_reduced = [c for c in num_cols if c not in to_drop and c not in low_var]
print("Final features:", features_reduced)

X_reduced = pd.DataFrame(X_scaled, columns=num_cols)[features_reduced].values

# Save reduced dataset
preproc_csv = os.path.join(OUT_DIR, "wine_preprocessed_reduced.csv")
pd.DataFrame(X_reduced, columns=features_reduced).to_csv(preproc_csv, index=False)
print("Saved reduced dataset:", preproc_csv)


Low-variance features: ['volatile acidity', 'citric acid', 'chlorides', 'density', 'pH', 'sulphates']
Highly correlated pairs >0.90: []
Final features: ['fixed acidity', 'residual sugar', 'free sulfur dioxide', 'total sulfur dioxide', 'alcohol', 'quality']
Saved reduced dataset: c:\Users\user\Desktop\data set2\wine_preprocessed_reduced.csv


In [40]:
# --- Dimensionality Reduction ---
pca2 = PCA(n_components=2, random_state=RANDOM_STATE)
emb_pca2 = pca2.fit_transform(X_reduced)

pca3 = PCA(n_components=3, random_state=RANDOM_STATE)
emb_pca3 = pca3.fit_transform(X_reduced)

pca95 = PCA(n_components=0.95, random_state=RANDOM_STATE).fit(X_reduced)
print("PCA components for 95% variance:", pca95.n_components_)

# Sampling
n_samples = min(SAMPLE_FOR_NONLINEAR, X_reduced.shape[0])
if 'quality' in df_winsor.columns:
    sampled_idx = df_winsor.groupby('quality', group_keys=False).apply(
        lambda x: x.sample(max(1, int(np.ceil(n_samples*len(x)/len(df_winsor)))), random_state=RANDOM_STATE)
    ).index
    sampled_idx = np.array(sampled_idx)[:n_samples]
else:
    np.random.seed(RANDOM_STATE)
    sampled_idx = np.random.choice(X_reduced.shape[0], size=n_samples, replace=False)

X_sample = X_reduced[sampled_idx]
labels_sample_quality = df_winsor['quality'].values[sampled_idx]
labels_sample_type = df_winsor['wine_type'].values[sampled_idx]

# Run t-SNE
print("Running t-SNE...")
tsne = TSNE(n_components=2, random_state=RANDOM_STATE, init='pca',
            learning_rate='auto', perplexity=TSNE_PERPLEXITY)
emb_tsne = tsne.fit_transform(X_sample)

# Run UMAP if available
if HAVE_UMAP:
    print("Running UMAP...")
    umap_model = umap.UMAP(n_components=2, random_state=RANDOM_STATE)
    emb_umap = umap_model.fit_transform(X_sample)
else:
    emb_umap = None


PCA components for 95% variance: 1
Running t-SNE...


In [41]:
# --- Visualizations ---
save_scatter(emb_pca2[sampled_idx], labels_sample_quality,
             "PCA (2D) — quality", "pca_quality.png", legend_title='quality')

save_scatter(emb_pca2[sampled_idx], labels_sample_type,
             "PCA (2D) — type", "pca_type.png", legend_title='wine_type')

save_scatter(emb_tsne, labels_sample_quality,
             "t-SNE (2D) — quality", "tsne_quality.png", legend_title='quality')

if emb_umap is not None:
    save_scatter(emb_umap, labels_sample_quality,
                 "UMAP (2D) — quality", "umap_quality.png", legend_title='quality')

# PCA 3D
from mpl_toolkits.mplot3d import Axes3D  # noqa
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
for lab in np.unique(labels_sample_quality):
    mask = (labels_sample_quality == lab)
    ax.scatter(emb_pca3[sampled_idx][mask,0],
               emb_pca3[sampled_idx][mask,1],
               emb_pca3[sampled_idx][mask,2], s=6, label=str(lab))
ax.set_title("PCA (3D) — quality")
ax.set_xlabel("PC1"); ax.set_ylabel("PC2"); ax.set_zlabel("PC3")
ax.legend(bbox_to_anchor=(1.05,1), loc='upper left', fontsize='small')
p3file = os.path.join(OUT_DIR, "pca_3d_quality.png")
plt.tight_layout()
plt.savefig(p3file, dpi=150)
plt.close()
print("Saved:", p3file)


Saved: c:\Users\user\Desktop\data set2\pca_quality.png
Saved: c:\Users\user\Desktop\data set2\pca_type.png
Saved: c:\Users\user\Desktop\data set2\tsne_quality.png
Saved: c:\Users\user\Desktop\data set2\pca_3d_quality.png
