# Wine Classification

![Wine](http://images-na.ssl-images-amazon.com/images/I/710S5SU6uYL.jpg)

# Data Exploration

In [None]:
# Import the necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Load the wine dataset CSV into a DataFrame.
# NOTE: the path points at a Kaggle input directory; adjust it when running elsewhere.
df=pd.read_csv("/kaggle/input/wine-dataset/Wine dataset.csv")

In [None]:
# Preview the first 5 rows (the pandas default for head()) to sanity-check the load
df.head()

In [None]:
# Preview the last 5 rows (the pandas default for tail())
df.tail()

In [None]:
# Dimensions of the DataFrame as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Total number of cells in the DataFrame, i.e. rows multiplied by columns
df.size

In [None]:
# List the column labels; the plotting and modeling cells refer to columns by these names
df.columns

In [None]:
# Count the missing (null/NaN) values in each column
df.isnull().sum()

In [None]:
# Flag every row that repeats an earlier row, then tally the flags:
# the count for True is the number of duplicate rows, False the number of non-duplicates
df.duplicated().value_counts()

# Data Visualization

In [None]:
# Features to draw, one figure per feature
columns = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
]

# Distinct class labels, in order of first appearance in the data
classes = df['class'].unique()

# Line styles handed out to the classes in turn (reused cyclically if there are more classes)
styles = ['-', '--', ':']

# Pair every class with its rows and its line style once, up front,
# instead of re-filtering the DataFrame for each figure
per_class = [
    (label, df[df['class'] == label], styles[position % len(styles)])
    for position, label in enumerate(classes)
]

# One figure per feature, with one line per class plotted against the row index
for feature in columns:
    fig, ax = plt.subplots()
    for label, subset, dash in per_class:
        ax.plot(subset.index, subset[feature], label=f'class {label}', linestyle=dash)
    ax.set_title(f'Line Plot of {feature} by Class')
    ax.set_xlabel('Index')
    ax.set_ylabel('Value')
    ax.legend()

# Render all the figures created above
plt.show()

# Data Modeling

In [None]:
# Split the data into features (X) and the target variable (y)
X = df.drop('class', axis=1)
y = df['class']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a Random Forest Classifier.
# The forest is built with randomness, so it is seeded to give the same
# accuracy on every run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Create and train a Logistic Regression Classifier.
# The features live on very different scales (the sample rows predicted later
# in this notebook range from about 0.2 to about 1500), which slows the solver
# down; standardizing inside a pipeline lets it converge within the default
# iteration budget and applies the same scaling automatically at predict time.
lr_classifier = make_pipeline(StandardScaler(), LogisticRegression())
lr_classifier.fit(X_train, y_train)

# Create and train a Support Vector Machines Classifier.
# The default RBF kernel is distance-based, so without scaling the
# largest-valued feature dominates; standardize first for the same reason as above.
svm_classifier = make_pipeline(StandardScaler(), SVC())
svm_classifier.fit(X_train, y_train)

# Create and train a Gradient Boosting Classifier (seeded for reproducibility)
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(X_train, y_train)

# Predict the class labels for the test set using each classifier
y_rf_pred = rf_classifier.predict(X_test)
y_lr_pred = lr_classifier.predict(X_test)
y_svm_pred = svm_classifier.predict(X_test)
y_gb_pred = gb_classifier.predict(X_test)

# Calculate the accuracy of each classifier (fraction of test rows predicted correctly)
accuracy_rf = accuracy_score(y_test, y_rf_pred)
accuracy_lr = accuracy_score(y_test, y_lr_pred)
accuracy_svm = accuracy_score(y_test, y_svm_pred)
accuracy_gb = accuracy_score(y_test, y_gb_pred)

# Print the accuracies of each classifier
print(f"Random Forest Classifier Accuracy: {accuracy_rf}")
print(f"Logistic Regression Classifier Accuracy: {accuracy_lr}")
print(f"SVM Classifier Accuracy: {accuracy_svm}")
print(f"Gradient Boosting Classifier Accuracy: {accuracy_gb}")

# Creating Confusion Matrix

In [None]:
# Use one explicit, sorted label set for every matrix. This guarantees that all
# four matrices have the same shape and row/column order, and lets the heatmap
# ticks show the real class values instead of positional indices 0..k-1.
class_labels = np.unique(y)

# Create confusion matrices (rows = true class, columns = predicted class)
rf_cm = confusion_matrix(y_test, y_rf_pred, labels=class_labels)
lr_cm = confusion_matrix(y_test, y_lr_pred, labels=class_labels)
svm_cm = confusion_matrix(y_test, y_svm_pred, labels=class_labels)
gb_cm = confusion_matrix(y_test, y_gb_pred, labels=class_labels)

# Plot confusion matrices on a 2x2 grid, filled row by row
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

panels = [
    ('Random Forest Classifier', rf_cm),
    ('Logistic Regression Classifier', lr_cm),
    ('Support Vector Machines Classifier', svm_cm),
    ('Gradient Boosting Classifier', gb_cm),
]

for ax, (title, matrix) in zip(axes.ravel(), panels):
    sns.heatmap(
        matrix,
        annot=True,
        cmap='Blues',
        fmt='g',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel('Predicted Class')
    ax.set_ylabel('True Class')

fig.tight_layout()
plt.show()

# Determining the best model

In [None]:
# Gather the accuracy scores that were computed with accuracy_score on the test
# set earlier in this notebook. They are deliberately NOT re-typed here as
# literals: hard-coded numbers go stale as soon as the data or models change.
accuracy_scores = {
    'Random Forest': accuracy_rf,
    'Logistic Regression': accuracy_lr,
    'SVM': accuracy_svm,
    'Gradient Boosting': accuracy_gb
}

# Determine the best model based on the highest accuracy score.
# On a tie, max() keeps the first such entry in the dictionary's order.
best_model = max(accuracy_scores, key=accuracy_scores.get)

print(f"The best model is: {best_model}")

# Testing on New Data

In [None]:
# Example input for prediction: two unseen samples, one row each, with 13 feature
# values per row (assumed to be in the same column order as X -- confirm against df.columns)
new_data = [[14.37, 1.95, 2.50, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.80, 0.86, 3.45, 1480],
            [14.13, 4.10, 2.74, 24.5, 96, 2.05, 0.76, 0.56, 1.35, 9.2, 0.61, 1.60, 560]]

# The classifier was fitted on a DataFrame, so give the new samples the same
# column names; passing a bare list makes scikit-learn warn about missing feature names.
new_samples = pd.DataFrame(new_data, columns=X.columns)

# Random Forest
rf_predictions = rf_classifier.predict(new_samples)
# The label names the model that actually produced these predictions
print("Random Forest Predictions:", rf_predictions)