In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Load the wine dataset from CSV into a DataFrame; the later cells read it via `data`.
# NOTE(review): the leading "/" makes this an absolute path at the filesystem root —
# confirm that is where the file lives in the execution environment.
dataset_path = "/wine_data.csv"
data = pd.read_csv(dataset_path)

Question 1

In [None]:
# Summarise the 'quality' column: its most common value and its range.
quality_column = data['quality']

# mode() yields every tied value; the first entry is the one reported.
most_frequent_quality = quality_column.mode().iloc[0]
# Extremes of the quality scale present in the data (reused by a later cell).
highest_quality = quality_column.max()
lowest_quality = quality_column.min()

print(f"The most frequent occuring wine quality is: {most_frequent_quality}")
print(f"Highest quality is: {highest_quality}")
print(f"Lowest quality is: {lowest_quality}")

The most frequent occuring wine quality is: 5
Highest quality is: 8
Lowest quality is: 3


Question 2

In [None]:
# Correlation of selected features with the quality score.
def _correlation_with_quality(feature_name):
    """Return the correlation between the named column of `data` and its 'quality' column."""
    return data[feature_name].corr(data['quality'])


fixed_acidity_correlation = _correlation_with_quality('fixed acidity')
alcohol_correlation = _correlation_with_quality('alcohol')
free_sulphur_dioxide_correlation = _correlation_with_quality('free sulfur dioxide')

print(f"Correlation between fixed acidity and quality: {fixed_acidity_correlation}")
print(f"Correlation between alcohol and quality: {alcohol_correlation}")
print(f"Correlation between free sulphur dioxide and quality: {free_sulphur_dioxide_correlation}")

Correlation between fixed acidity and quality: 0.12405164911322428
Correlation between alcohol and quality: 0.4761663239995365
Correlation between free sulphur dioxide and quality: -0.0506560572442763


Question 3

In [None]:
# Mean residual sugar among the top-rated and the bottom-rated wines.
# highest_quality / lowest_quality come from an earlier cell (max and min of 'quality').
is_best_quality = data['quality'] == highest_quality
is_worst_quality = data['quality'] == lowest_quality

avg_residual_sugar_best = data.loc[is_best_quality, 'residual sugar'].mean()
avg_residual_sugar_worst = data.loc[is_worst_quality, 'residual sugar'].mean()

print(f"The average residual for best quality wines is: {avg_residual_sugar_best}")
print(f"The average residual for worst quality wines is: {avg_residual_sugar_worst}")

The average residual for best quality wines is: 2.5777777777777775
The average residual for worst quality wines is: 2.6350000000000002


Question 4

In [None]:
# How volatile acidity relates to quality, measured as a correlation coefficient.
volatile_acidity_values = data['volatile acidity']
quality_values = data['quality']
volatile_acidity_correlation = volatile_acidity_values.corr(quality_values)
print(f"Correlation between volatile acidity and quality: {volatile_acidity_correlation}")

Correlation between volatile acidity and quality: -0.390557780264007


Question 5

In [None]:
# Target is the quality score; features are every other column.
Y = data['quality']
X = data.drop(columns='quality')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


def _fit_and_score(model):
    """Fit `model` on the training split; return (test predictions, test accuracy)."""
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    return predictions, accuracy_score(Y_test, predictions)


# Decision tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_predictions, dt_accuracy = _fit_and_score(dt_model)
print(f"Decision Tree Accuracy: {dt_accuracy}")

# Random forest
rf_model = RandomForestClassifier(random_state=42)
rf_predictions, rf_accuracy = _fit_and_score(rf_model)
print(f"Random Forest Accuracy: {rf_accuracy}")

# Report which model scored higher on the held-out rows.
if rf_accuracy > dt_accuracy:
    print("Random Forest Model performed better than Decision Tree Model.")
elif dt_accuracy > rf_accuracy:
    print("Decision Tree Model performed better than Random Forest Model.")
else:
    print("Both models have the same accuracy.")

Decision Tree Accuracy: 0.559375
Random Forest Accuracy: 0.659375
Random Forest Model performed better than Decision Tree Model.
