In [None]:
# Task 1
# # Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

In [None]:
# Load the Wine Quality dataset
data = pd.read_csv('winequality-red.csv')
# Display the first few rows of the dataset
print(data.head())
# Display information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
data.info()

In [None]:
# Bar chart of how many wines received each quality rating
quality_ax = sns.countplot(data=data, x='quality')
quality_ax.set_title('Distribution of Wine Quality Ratings')
plt.show()

In [None]:
# Histogram (with KDE overlay) for a handful of key features
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'alcohol']
fig = plt.figure(figsize=(14, 8))
# Five panels laid out on a 2x3 grid; the sixth slot is intentionally left empty
for position, column in enumerate(features, start=1):
    panel = fig.add_subplot(2, 3, position)
    sns.histplot(data[column], kde=True, ax=panel)
    panel.set_title(f'Distribution of {column}')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise relationships between the features, coloured by the quality target,
# with KDE curves on the diagonal
pairplot_options = {'diag_kind': 'kde', 'hue': 'quality', 'markers': '.'}
sns.pairplot(data, **pairplot_options)
plt.show()

In [None]:
#Task 2
# Feature Engineering: acidity_ratio = fixed acidity divided by volatile acidity
data['acidity_ratio'] = data['fixed acidity'].div(data['volatile acidity'])
# Preview the two source columns next to the derived one
preview_columns = ['fixed acidity', 'volatile acidity', 'acidity_ratio']
print(data[preview_columns].head())

In [None]:
# List of numeric features to scale
numeric_features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'acidity_ratio']

# Apply StandardScaler to numeric features
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

# Display scaled features
print(data[numeric_features].head())

In [None]:
# Define features and target variable
X = data.drop('quality', axis=1)
y = data['quality']
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set shape:", X_train.shape, f"Testing set shape:", X_test.shape)

In [None]:
#Task 3
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Function to evaluate the models
def evaluate_model(y_true, y_pred):
    """Compute the headline classification metrics for one set of predictions.

    Precision, recall and F1 use ``average='weighted'``. All three pass
    ``zero_division=0`` so that a class with no predicted (or no true) samples
    contributes 0 instead of raising an UndefinedMetricWarning.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    # zero_division added for consistency with precision/F1: recall is undefined
    # for a label that is predicted but never occurs in y_true.
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    return accuracy, precision, recall, f1

# Function to print the confusion matrix
def print_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    The axes are labelled with the actual class values (the sorted union of the
    labels present in ``y_true`` and ``y_pred``) rather than positional indices,
    so each row/column can be read directly as a class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # Same label set and ordering confusion_matrix would infer on its own; made
    # explicit here so it can also be used for the tick labels.
    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression: build the classifier and fit it on the training split
# (fit() returns the estimator itself, so construction and training chain)
lr_model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predict the held-out split and score the predictions
lr_predictions = lr_model.predict(X_test)
lr_scores = evaluate_model(y_true=y_test, y_pred=lr_predictions)
lr_accuracy, lr_precision, lr_recall, lr_f1 = lr_scores
print(f"Logistic Regression - Accuracy: {lr_accuracy:.2f}, Precision: {lr_precision:.2f}, Recall: {lr_recall:.2f}, F1 Score: {lr_f1:.2f}")

# Heatmap of actual vs. predicted classes
print_confusion_matrix(y_true=y_test, y_pred=lr_predictions)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree: build the classifier (seeded) and fit it on the training split
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out split and score the predictions
dt_predictions = dt_model.predict(X_test)
dt_scores = evaluate_model(y_true=y_test, y_pred=dt_predictions)
dt_accuracy, dt_precision, dt_recall, dt_f1 = dt_scores
print(f"Decision Tree - Accuracy: {dt_accuracy:.2f}, Precision: {dt_precision:.2f}, Recall: {dt_recall:.2f}, F1 Score: {dt_f1:.2f}")

# Heatmap of actual vs. predicted classes
print_confusion_matrix(y_true=y_test, y_pred=dt_predictions)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest: 100 seeded trees, fitted on the training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split and score the predictions
rf_predictions = rf_model.predict(X_test)
rf_scores = evaluate_model(y_true=y_test, y_pred=rf_predictions)
rf_accuracy, rf_precision, rf_recall, rf_f1 = rf_scores
print(f"Random Forest - Accuracy: {rf_accuracy:.2f}, Precision: {rf_precision:.2f}, Recall: {rf_recall:.2f}, F1 Score: {rf_f1:.2f}")

# Heatmap of actual vs. predicted classes
print_confusion_matrix(y_true=y_test, y_pred=rf_predictions)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: build the classifier and fit it on the training split
nb_model = GaussianNB().fit(X_train, y_train)

# Predict the held-out split and score the predictions
nb_predictions = nb_model.predict(X_test)
nb_scores = evaluate_model(y_true=y_test, y_pred=nb_predictions)
nb_accuracy, nb_precision, nb_recall, nb_f1 = nb_scores
print(f"Naive Bayes - Accuracy: {nb_accuracy:.2f}, Precision: {nb_precision:.2f}, Recall: {nb_recall:.2f}, F1 Score: {nb_f1:.2f}")

# Heatmap of actual vs. predicted classes
print_confusion_matrix(y_true=y_test, y_pred=nb_predictions)

In [None]:
from sklearn.svm import SVC

# Support Vector Machine with an RBF kernel, fitted on the training split
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train, y_train)

# Predict the held-out split and score the predictions
svm_predictions = svm_model.predict(X_test)
svm_scores = evaluate_model(y_true=y_test, y_pred=svm_predictions)
svm_accuracy, svm_precision, svm_recall, svm_f1 = svm_scores
print(f"SVM - Accuracy: {svm_accuracy:.2f}, Precision: {svm_precision:.2f}, Recall: {svm_recall:.2f}, F1 Score: {svm_f1:.2f}")

# Heatmap of actual vs. predicted classes
print_confusion_matrix(y_true=y_test, y_pred=svm_predictions)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-Nearest Neighbours (k = 5), fitted on the training split
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out split and score the predictions
knn_predictions = knn_model.predict(X_test)
knn_scores = evaluate_model(y_true=y_test, y_pred=knn_predictions)
knn_accuracy, knn_precision, knn_recall, knn_f1 = knn_scores
print(f"KNN - Accuracy: {knn_accuracy:.2f}, Precision: {knn_precision:.2f}, Recall: {knn_recall:.2f}, F1 Score: {knn_f1:.2f}")

# Heatmap of actual vs. predicted classes
print_confusion_matrix(y_true=y_test, y_pred=knn_predictions)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting: build the classifier (seeded) and fit it on the training split
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out split and score the predictions
gb_predictions = gb_model.predict(X_test)
gb_scores = evaluate_model(y_true=y_test, y_pred=gb_predictions)
gb_accuracy, gb_precision, gb_recall, gb_f1 = gb_scores
print(f"Gradient Boosting - Accuracy: {gb_accuracy:.2f}, Precision: {gb_precision:.2f}, Recall: {gb_recall:.2f}, F1 Score: {gb_f1:.2f}")

# Heatmap of actual vs. predicted classes
print_confusion_matrix(y_true=y_test, y_pred=gb_predictions)

In [None]:
#Task 4
# Compare model performance
# One row per model: (name, accuracy, precision, recall, F1)
model_scores = [
    ('Logistic Regression', lr_accuracy, lr_precision, lr_recall, lr_f1),
    ('Decision Tree', dt_accuracy, dt_precision, dt_recall, dt_f1),
    ('Random Forest', rf_accuracy, rf_precision, rf_recall, rf_f1),
    ('Naive Bayes', nb_accuracy, nb_precision, nb_recall, nb_f1),
    ('SVM', svm_accuracy, svm_precision, svm_recall, svm_f1),
    ('KNN', knn_accuracy, knn_precision, knn_recall, knn_f1),
    ('Gradient Boosting', gb_accuracy, gb_precision, gb_recall, gb_f1),
]
column_names = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']

# Transpose the rows into per-column lists so `results` is a dict of lists
results = {
    column: list(values)
    for column, values in zip(column_names, zip(*model_scores))
}

results_df = pd.DataFrame(results)
print(results_df)