In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier

In [16]:
df = pd.read_csv("/winequality-red.csv")

In [17]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [18]:
df.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1594,6.2,0.6,0.08,2.0,0.09,32.0,44.0,0.9949,3.45,0.58,10.5,5
1595,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
1598,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,6


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [20]:
df['quality'].unique()

array([5, 6, 7, 4, 8, 3])

In [21]:
scaler = StandardScaler()

def ubah_numerik(df, columns):
    for col in columns:
        df[col] = scaler.fit_transform(df[[col]])

scaled_columns = [col for col in df.columns if col != 'quality']
ubah_numerik(df, scaled_columns)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,5
1,-0.298547,1.967442,-1.391472,0.043416,0.223875,0.872638,0.624363,0.028261,-0.719933,0.12895,-0.584777,5
2,-0.298547,1.297065,-1.18607,-0.169427,0.096353,-0.083669,0.229047,0.134264,-0.331177,-0.048089,-0.584777,5
3,1.654856,-1.384443,1.484154,-0.453218,-0.26496,0.107592,0.4115,0.664277,-0.979104,-0.46118,-0.584777,6
4,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,5


In [22]:
x = df.drop(columns=['quality'])
y = df['quality']
x_train, x_test, y_train, y_test = train_test_split(x, y,random_state=42, test_size=0.2)

In [23]:
from imblearn.over_sampling import SMOTE
print("Distribusi sebelum SMOTE:")
print(pd.Series(y_train).value_counts())

smote = SMOTE(random_state=42, k_neighbors=3)  # Coba turunkan k_neighbors jika perlu
X_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

print("Distribusi setelah SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

Distribusi sebelum SMOTE:
quality
5    551
6    506
7    157
4     43
8     13
3      9
Name: count, dtype: int64
Distribusi setelah SMOTE:
quality
6    551
5    551
4    551
7    551
8    551
3    551
Name: count, dtype: int64


In [24]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(x_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.36      0.50      0.42        10
           5       0.72      0.71      0.72       130
           6       0.66      0.63      0.64       132
           7       0.62      0.69      0.65        42
           8       0.20      0.20      0.20         5

    accuracy                           0.66       320
   macro avg       0.43      0.45      0.44       320
weighted avg       0.66      0.66      0.66       320



In [25]:
import pickle
pickle_out = open("QualCheck.pkl","wb")
pickle.dump(model, pickle_out)
pickle_out.close()

In [26]:
# from google.colab import files
# files.download("QualCheck.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>