In [None]:
# Task 1
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import scikit-learn libraries for model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
        roc_auc_score, roc_curve, confusion_matrix, precision_recall_curve
)

In [None]:
# Load the red-wine quality dataset (path is relative to the notebook's working directory)
df = pd.read_csv('winequality-red.csv')

# Explore the first few rows
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line after the summary.
df.info()

In [None]:
# Check for missing values
print(df.isnull().sum())

# Feature engineering: binary target, 1 when quality >= 6, otherwise 0.
# The comparison is vectorised instead of a per-row lambda, and the original
# 'quality' column is dropped so it cannot be used as a feature.
# The guard keeps the cell re-runnable: after the first run 'quality' is
# already gone and an unguarded drop would raise KeyError.
if 'quality' in df.columns:
    df = (
        df.assign(quality_label=(df['quality'] >= 6).astype(int))
          .drop(columns='quality')
    )

# Preview the dataset
print(df.head())

In [None]:
# Target (y) is the binary label; features (X) are every remaining column
y = df['quality_label']
X = df.drop(columns=['quality_label'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Task 2
# Random Forest: 100 trees, depth capped at 10, fixed seed
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Class predictions for the held-out rows
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Gradient Boosting: 100 boosting stages, learning rate 0.1, fixed seed
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Class predictions for the held-out rows
y_pred_gb = gb_model.predict(X_test)

In [None]:
# Score both models on the test set.
# Each scorer is called with its default arguments, in the order
# accuracy, precision, recall, F1, and the results are unpacked into
# one scalar per metric.

# Random Forest
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    scorer(y_test, y_pred_rf)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Gradient Boosting
gb_accuracy, gb_precision, gb_recall, gb_f1 = (
    scorer(y_test, y_pred_gb)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# One summary line per model, four decimals each
print(f"Random Forest - Accuracy: {rf_accuracy:.4f}, Precision: {rf_precision:.4f}, Recall: {rf_recall:.4f}, F1: {rf_f1:.4f}")
print(f"Gradient Boosting - Accuracy: {gb_accuracy:.4f}, Precision: {gb_precision:.4f}, Recall: {gb_recall:.4f}, F1: {gb_f1:.4f}")

In [None]:
# Task 3
# Confusion matrices as annotated heatmaps.
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns; the axis labels make that explicit so the
# off-diagonal cells can be read as false positives / false negatives.

# Confusion Matrix for Random Forest
rf_cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Blues')
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# Confusion Matrix for Gradient Boosting
gb_cm = confusion_matrix(y_test, y_pred_gb)
sns.heatmap(gb_cm, annot=True, fmt='d', cmap='Greens')
plt.title('Gradient Boosting Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# Scores for class 1: column index 1 of each model's predict_proba output
rf_prob = rf_model.predict_proba(X_test)[:, 1]
gb_prob = gb_model.predict_proba(X_test)[:, 1]

# ROC points per model; the third return value (thresholds) is not used
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_prob)
gb_fpr, gb_tpr, _ = roc_curve(y_test, gb_prob)

# Draw both curves on one figure, plus a dashed diagonal for reference
plt.plot(rf_fpr, rf_tpr, label='Random Forest')
plt.plot(gb_fpr, gb_tpr, label='Gradient Boosting')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Random Forest: importances keyed by feature name, ten largest as horizontal bars
rf_feature_importance = pd.Series(data=rf_model.feature_importances_, index=X.columns)
rf_feature_importance.nlargest(10).plot.barh(
    title='Random Forest - Top 10 Important Features',
)
plt.show()

# Gradient Boosting: same view, drawn in green
gb_feature_importance = pd.Series(data=gb_model.feature_importances_, index=X.columns)
gb_feature_importance.nlargest(10).plot.barh(
    title='Gradient Boosting - Top 10 Important Features',
    color='green',
)
plt.show()

In [None]:
# Precision-recall curves for both models.
#
# The curve arrays are stored under dedicated *_pr_* names. rf_precision,
# rf_recall, gb_precision and gb_recall already hold the scalar test-set scores
# from the Task 2 evaluation cell; overwriting them with arrays would make that
# cell's ":.4f" print fail if it were run again afterwards. The thresholds are
# bound to explicit names too, rather than to "_".

# Precision-Recall Curve for Random Forest
rf_prob = rf_model.predict_proba(X_test)[:, 1]
rf_pr_precision, rf_pr_recall, rf_pr_thresholds = precision_recall_curve(y_test, rf_prob)
plt.plot(rf_pr_recall, rf_pr_precision, label='Random Forest')

# Precision-Recall Curve for Gradient Boosting
gb_prob = gb_model.predict_proba(X_test)[:, 1]
gb_pr_precision, gb_pr_recall, gb_pr_thresholds = precision_recall_curve(y_test, gb_prob)
plt.plot(gb_pr_recall, gb_pr_precision, label='Gradient Boosting')

# Plot Precision-Recall Curves
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

In [None]:
# F1-score as a function of the decision threshold, for both models.
# A sample is predicted as class 1 when its score is >= the threshold.
thresholds = np.arange(0.1, 1, 0.1)

f1_scores_rf = [
    f1_score(y_test, (rf_prob >= cutoff).astype(int)) for cutoff in thresholds
]
f1_scores_gb = [
    f1_score(y_test, (gb_prob >= cutoff).astype(int)) for cutoff in thresholds
]

# One line per model on a shared figure
plt.plot(thresholds, f1_scores_rf, label='Random Forest')
plt.plot(thresholds, f1_scores_gb, label='Gradient Boosting', color='green')
plt.xlabel('Threshold')
plt.ylabel('F1-Score')
plt.title('F1-Score vs. Decision Threshold')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

def plot_cumulative_gains(y_true, y_pred_proba, model_name, ax=None):
    """Plot the cumulative gains curve of a binary classifier.

    Samples are ranked by predicted score, highest first. For every fraction of
    the ranked samples examined (x-axis) the curve shows the fraction of all
    positives captured so far (y-axis). Both axes run from 0 to 1 and the curve
    starts at the origin; a dashed diagonal marks random ordering.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels (1 = positive, 0 = negative).
    y_pred_proba : array-like
        Predicted score for the positive class, one per sample.
    model_name : str
        Name used in the legend entry of the curve.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, a new figure is created and shown.

    Raises
    ------
    ValueError
        If the inputs differ in length or ``y_true`` contains no positives
        (the gains fraction would be undefined).
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba)
    n_samples = labels.shape[0]
    if scores.shape[0] != n_samples:
        raise ValueError("y_true and y_pred_proba must have the same length")

    total_positives = labels.sum()
    if total_positives == 0:
        raise ValueError("y_true contains no positive samples; gains are undefined")

    # Rank samples from highest to lowest score.
    ranking = np.argsort(scores)[::-1]

    # Integer arange divided by n gives exactly n + 1 evenly spaced proportions;
    # a float-step arange can come out one element too long or too short.
    proportion_examined = np.arange(n_samples + 1) / n_samples
    gains = np.concatenate(([0.0], np.cumsum(labels[ranking]) / total_positives))

    owns_figure = ax is None
    if owns_figure:
        _, ax = plt.subplots()

    ax.plot(proportion_examined, gains, label=f'Cumulative Gains - {model_name}')
    ax.plot([0, 1], [0, 1], '--', label='Random')
    ax.set_title('Cumulative Gains Chart')
    ax.set_xlabel('Proportion of data examined')
    ax.set_ylabel('Proportion of true positives')
    ax.legend()

    if owns_figure:
        plt.show()

# Cumulative gains chart per model, using the class-1 scores on the test set

# Random Forest
plot_cumulative_gains(y_true=y_test, y_pred_proba=rf_prob, model_name='Random Forest')

# Gradient Boosting
plot_cumulative_gains(y_true=y_test, y_pred_proba=gb_prob, model_name='Gradient Boosting')
        

In [None]:
from math import pi

# Test-set scores per model, in the same order as the `metrics` labels.
#
# The scorers are called with their default (binary, class 1) averaging so the
# radar chart shows the same numbers as the Task 2 evaluation print. With
# average='weighted', recall is mathematically equal to accuracy, which would
# make two of the four radar axes carry identical values.
#
# The predictions computed once in Task 2 (y_pred_rf / y_pred_gb) are reused
# instead of calling predict() again for every metric.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

rf_metrics = [
    scorer(y_test, y_pred_rf)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
]

gb_metrics = [
    scorer(y_test, y_pred_gb)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
]

metrics_data = pd.DataFrame({
    'Metric': metrics,
    'Random Forest': rf_metrics,
    'Gradient Boosting': gb_metrics
})
def spider_plot(metrics_data, ax=None):
    """Draw a radar (spider) chart comparing models across several metrics.

    Parameters
    ----------
    metrics_data : pandas.DataFrame
        Must contain a 'Metric' column with the axis labels. Every other column
        is treated as one model, holding that model's value for each metric.
    ax : polar matplotlib Axes, optional
        Axes to draw on. When omitted, a new 6x6 polar figure is created and
        shown.
    """
    categories = list(metrics_data['Metric'])
    N = len(categories)

    # Every non-'Metric' column is a model, so the chart is not tied to two
    # hard-coded model names.
    model_names = [column for column in metrics_data.columns if column != 'Metric']

    # One spoke per metric, evenly spaced around the circle. The angles do not
    # depend on the model, so they are computed once, before the loop.
    angles = [n / float(N) * 2 * pi for n in range(N)]
    closed_angles = angles + angles[:1]  # repeat the first angle to close the polygon

    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    for model_name in model_names:
        values = metrics_data[model_name].tolist()
        values += values[:1]  # repeat the first value to close the polygon
        ax.plot(closed_angles, values, linewidth=1, linestyle='solid', label=model_name)
        ax.fill(closed_angles, values, alpha=0.1)

    ax.set_xticks(angles)
    ax.set_xticklabels(categories)
    ax.set_title('Model Performance Comparison', size=20)
    ax.legend()

    if owns_figure:
        plt.show()
# Render the radar chart comparing both models
spider_plot(metrics_data=metrics_data)