In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

plt.style.use('ggplot')

# Load the wine dataset from a CSV file in the current working directory.
df = pd.read_csv('wine.csv')


print("\nBasic Data Exploration:")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()

# Feature matrix: every column except 'Type' (presumably the known class
# label -- confirm against the dataset description).
X = df.drop(columns='Type')

# Histogram (with KDE overlay) of every feature, saved to a single PNG.
print("\nGenerating distribution plots...")
num_features = X.columns

# Derive the grid from the number of features instead of hard-coding 7 rows,
# so the loop cannot request a subplot index that lies outside the grid.
n_cols = 2
n_rows = max(1, -(-len(num_features) // n_cols))  # ceiling division

# Height scales with the row count; 7 rows still gives the original 15x20 canvas.
fig = plt.figure(figsize=(15, 20 * n_rows / 7))
for i, feature in enumerate(num_features):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    sns.histplot(X[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
fig.tight_layout()
fig.savefig('feature_distributions.png')

# Close (rather than clear) the figure: a cleared figure stays open and the
# notebook renders it as an empty "<Figure ... with 0 Axes>" output.
plt.close(fig)

# Annotated heatmap of the pairwise feature correlations, saved to a PNG.
print("Generating correlation matrix heatmap...")
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Wine Features')
fig.tight_layout()
fig.savefig('correlation_matrix.png')

# Close (rather than clear) the figure so no empty figure is left open and
# echoed as "<Figure ... with 0 Axes>" in the cell output.
plt.close(fig)

# Standardise the features before PCA / K-means so that no single column
# dominates purely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Fit a PCA that keeps every component; it is used only to inspect how the
# variance is distributed across components.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Per-component share of the variance and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance_ratio.cumsum()

# Scree plot: individual and cumulative explained variance per component.
component_numbers = range(1, len(explained_variance_ratio) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
# Labels are attached to the lines themselves so the legend cannot drift out
# of sync with the plotting order.
ax.plot(component_numbers, explained_variance_ratio,
        marker='o', linestyle='--', label='Individual Explained Variance')
ax.plot(component_numbers, cumulative_explained_variance,
        marker='o', linestyle='-', color='red', label='Cumulative Explained Variance')
ax.set_title('Scree Plot and Cumulative Explained Variance')
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Explained Variance Ratio')
ax.legend()
ax.grid(True)
fig.savefig('scree_plot.png')

# Close (rather than clear) the figure so no empty figure is left open and
# echoed as "<Figure ... with 0 Axes>" in the cell output.
plt.close(fig)

# Smallest number of leading components whose cumulative explained variance
# reaches 0.95: argmax on the boolean mask gives the index of the first True,
# and +1 converts that zero-based index into a count.
meets_target = cumulative_explained_variance >= 0.95
n_components = meets_target.argmax() + 1
print(f"Number of components to explain 95% variance: {n_components}")

# Refit PCA restricted to that many components; note this rebinds `pca`,
# replacing the full-rank model fitted earlier.
pca = PCA(n_components=n_components)
X_pca_transformed = pca.fit_transform(X_scaled)
print(f"Shape of PCA-transformed data: {X_pca_transformed.shape}")

# --- K-means (k=3) on the standardised, full-dimensional features ---
kmeans_original = KMeans(n_clusters=3, random_state=42, n_init='auto')
kmeans_original.fit(X_scaled)
labels_original = kmeans_original.labels_
# Stored on df so the scatter plot below and the final CSV export can use it.
df['cluster_original'] = labels_original

# Clusters were fitted on all scaled features; this plot shows them projected
# onto just two of the raw columns ('Color' and 'Alcohol').
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x='Color', y='Alcohol', hue='cluster_original', data=df,
                palette='viridis', style='cluster_original', s=100, ax=ax)
ax.set_title('K-means Clustering on Original Data (Color vs. Alcohol)')
ax.set_xlabel('Color')
ax.set_ylabel('Alcohol')
ax.legend(title='Cluster')
fig.savefig('clustering_original.png')

# Close (rather than clear) the figure so no empty figure is left open and
# echoed as "<Figure ... with 0 Axes>" in the cell output.
plt.close(fig)

# Internal cluster-quality metrics, computed in the same space K-means was fitted in.
silhouette_original = silhouette_score(X_scaled, labels_original)
davies_bouldin_original = davies_bouldin_score(X_scaled, labels_original)
print(f"Silhouette Score (Original Data): {silhouette_original:.4f}")
print(f"Davies-Bouldin Index (Original Data): {davies_bouldin_original:.4f}")

# --- K-means (k=3) on the PCA-reduced features ---
kmeans_pca = KMeans(n_clusters=3, random_state=42, n_init='auto')
kmeans_pca.fit(X_pca_transformed)
labels_pca = kmeans_pca.labels_
# Stored on df so the final CSV export includes the PCA-based assignment.
df['cluster_pca'] = labels_pca

# Scatter of the first two principal components, coloured and marked by cluster.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x=X_pca_transformed[:, 0], y=X_pca_transformed[:, 1],
                hue=labels_pca, palette='viridis', style=labels_pca, s=100, ax=ax)
ax.set_title('K-means Clustering on PCA Data (PC1 vs. PC2)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title='Cluster')
fig.savefig('clustering_pca.png')

# Close (rather than clear) the figure so no empty figure is left open and
# echoed as "<Figure ... with 0 Axes>" in the cell output.
plt.close(fig)

# Internal cluster-quality metrics, computed in the PCA space K-means was fitted in.
silhouette_pca = silhouette_score(X_pca_transformed, labels_pca)
davies_bouldin_pca = davies_bouldin_score(X_pca_transformed, labels_pca)
print(f"Silhouette Score (PCA Data): {silhouette_pca:.4f}")
print(f"Davies-Bouldin Index (PCA Data): {davies_bouldin_pca:.4f}")

# Write the loaded data plus the 'cluster_original' and 'cluster_pca' label
# columns to CSV; index=False omits the DataFrame row index from the file.
df.to_csv('wine_clustered.csv', index=False)


Basic Data Exploration:
   Type  Alcohol  Malic   Ash  Alcalinity  Magnesium  Phenols  Flavanoids  \
0     1    14.23   1.71  2.43        15.6        127     2.80        3.06   
1     1    13.20   1.78  2.14        11.2        100     2.65        2.76   
2     1    13.16   2.36  2.67        18.6        101     2.80        3.24   
3     1    14.37   1.95  2.50        16.8        113     3.85        3.49   
4     1    13.24   2.59  2.87        21.0        118     2.80        2.69   

   Nonflavanoids  Proanthocyanins  Color   Hue  Dilution  Proline  
0           0.28             2.29   5.64  1.04      3.92     1065  
1           0.26             1.28   4.38  1.05      3.40     1050  
2           0.30             2.81   5.68  1.03      3.17     1185  
3           0.24             2.18   7.80  0.86      3.45     1480  
4           0.39             1.82   4.32  1.04      2.93      735  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):



Silhouette Score (Original Data): 0.2849
Davies-Bouldin Index (Original Data): 1.3892




Silhouette Score (PCA Data): 0.2987
Davies-Bouldin Index (PCA Data): 1.3363


<Figure size 1500x2000 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>