In [39]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [40]:
data = pd.read_csv('winequality-white.csv')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [41]:
y = data['quality']
X = data.drop(['quality'], axis=1) 

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [42]:
# Initialize PCA with desired number of components
pca = PCA(n_components=6)  # You can choose the number of components based on explained variance ratio or other criteria

# Fit and transform the data
X_pca = pca.fit_transform(X_scaled)


In [43]:
# Initialize LDA with desired number of components
lda = LDA(n_components=6)  # choose the number of components 

X_lda = lda.fit_transform(X_scaled, y)  # Note: LDA requires the target variable


In [44]:
print("PCA transformed data shape:", X_pca.shape)
print("LDA transformed data shape:", X_lda.shape)


PCA transformed data shape: (4898, 6)
LDA transformed data shape: (4898, 6)


In [45]:
from sklearn.model_selection import train_test_split

# Split the transformed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)


In [46]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)


In [47]:
# Predict quality using the trained model
y_pred = model.predict(X_test)


In [48]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.47959183673469385
Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       1.00      0.04      0.08        25
           5       0.49      0.33      0.40       291
           6       0.48      0.77      0.59       432
           7       0.47      0.21      0.29       192
           8       0.00      0.00      0.00        35

    accuracy                           0.48       980
   macro avg       0.41      0.23      0.23       980
weighted avg       0.47      0.48      0.44       980



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.51785714 0.5127551  0.50255102 0.49297573 0.52490421]
Mean cross-validation score: 0.5102086428441108


In [50]:
from sklearn.svm import SVC

# Initialize and train the SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Predict quality using the trained SVM model
svm_y_pred = svm_model.predict(X_test)

# Evaluate SVM model performance
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:")
print(classification_report(y_test, svm_y_pred))


SVM Accuracy: 0.5295918367346939
SVM Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.60      0.12      0.20        25
           5       0.59      0.51      0.55       291
           6       0.50      0.77      0.61       432
           7       0.56      0.18      0.28       192
           8       0.00      0.00      0.00        35

    accuracy                           0.53       980
   macro avg       0.38      0.26      0.27       980
weighted avg       0.52      0.53      0.49       980



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Predict quality using the trained Random Forest model
rf_y_pred = rf_model.predict(X_test)

# Evaluate Random Forest model performance
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_y_pred))


Random Forest Accuracy: 0.676530612244898
Random Forest Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.58      0.28      0.38        25
           5       0.70      0.69      0.69       291
           6       0.65      0.77      0.70       432
           7       0.71      0.57      0.63       192
           8       0.94      0.43      0.59        35

    accuracy                           0.68       980
   macro avg       0.60      0.46      0.50       980
weighted avg       0.68      0.68      0.67       980

