In [1]:
import pandas as pd
import os

# Load the dataset
os.chdir("..")
data_path = 'datasets'
df = pd.read_csv(os.path.join(data_path, 'wine_quality_transformed.csv'), index_col=0)

# Display basic information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 998 entries, 0 to 997
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   country_encoded    998 non-null    int64  
 1   points_scaled      998 non-null    float64
 2   description_clean  998 non-null    object 
 3   variety_encoded    998 non-null    int64  
 4   length_scaled      998 non-null    float64
 5   price_scaled       998 non-null    float64
dtypes: float64(3), int64(2), object(1)
memory usage: 54.6+ KB


In [2]:
df.head()

Unnamed: 0,country_encoded,points_scaled,description_clean,variety_encoded,length_scaled,price_scaled
0,3,0.354598,good dry creek zin robust dry spicy really get...,104,0.509537,-0.136111
1,0,-1.131082,herbaceous character make seem rather thin sof...,8,-1.219625,-0.493754
2,3,-1.131082,little simple easy wealth raspberry strawberry...,77,-1.219625,-0.575456
3,3,0.057462,dry farmed vineyard treated wild yeast minimal...,61,0.077246,0.542865
4,3,0.94887,site near annapolis show preponderance dark gr...,66,-0.499141,1.345938


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Define numeric and text features
text_feature = "description_clean"
numeric_features = ["points_scaled", "variety_encoded", "length_scaled", "price_scaled"]

# Define column transformer (TF-IDF for text, scaling for numeric features)
preprocessor = ColumnTransformer([
    ("tfidf", TfidfVectorizer(), text_feature)  # Apply TF-IDF to text
    # ("num", StandardScaler(), numeric_features)  # Scale numeric features
])

# Define different models to try
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=500),
    "SVM": SVC(kernel="linear", probability=True),
    "NaiveBayes": MultinomialNB()
}

# Create a pipeline with preprocessing + placeholder classifier
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(random_state=42))  # Placeholder classifier
])

# Define hyperparameter grid with multiple models
param_grid = [
    {
        "clf": [RandomForestClassifier(random_state=42)],
        "clf__n_estimators": [100, 200],
        "clf__max_depth": [None, 10]
    },
    {
        "clf": [LogisticRegression(max_iter=500)],
        "clf__C": [0.01, 0.1, 1, 10],  # Regularization strength
    },
    {
        "clf": [SVC(kernel="linear", probability=True)],
        "clf__C": [0.01, 0.1, 1, 10],
        "clf__kernel": ['linear', 'rbf', 'poly'],
        "clf__gamma": ['scale', 'auto', 0.01, 0,1, 1],
        "clf__degree": [2, 3, 4],
    }
    # {
    #     "clf": [MultinomialNB()],
    #     "clf__alpha": [0.1, 0.5, 1.0]  # Smoothing parameter
    # }
]

# Ensure data is clean
df = df.dropna(subset=["description_clean", "country_encoded"])

# Define X and y correctly
X = df[[text_feature] + numeric_features]
y = df["country_encoded"]

# Perform Grid Search with multiple models
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy", verbose=1, n_jobs=-1)
grid_search.fit(X, y)

# Print best parameters and model
print("Best Model:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 224 candidates, totalling 1120 fits
Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('tfidf', TfidfVectorizer(),
                                                  'description_clean')])),
                ('clf',
                 SVC(C=10, degree=2, kernel='linear', probability=True))])
Best Parameters: {'clf': SVC(kernel='linear', probability=True), 'clf__C': 10, 'clf__degree': 2, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}


In [6]:
import joblib

best_model = grid_search.best_estimator_
joblib.dump(best_model, "wine_model.pkl")

['wine_model.pkl']

In [7]:
X

Unnamed: 0,description_clean,points_scaled,variety_encoded,length_scaled,price_scaled
0,good dry creek zin robust dry spicy really get...,0.354598,104,0.509537,-0.136111
1,herbaceous character make seem rather thin sof...,-1.131082,8,-1.219625,-0.493754
2,little simple easy wealth raspberry strawberry...,-1.131082,77,-1.219625,-0.575456
3,dry farmed vineyard treated wild yeast minimal...,0.057462,61,0.077246,0.542865
4,site near annapolis show preponderance dark gr...,0.948870,66,-0.499141,1.345938
...,...,...,...,...,...
993,made using selection fruit sourced bank serein...,-0.239674,15,-0.931431,0.408828
994,quite green also raisiny really compute seeing...,-1.725354,72,0.365440,-0.661349
995,mountain fruit key understanding cab meant uni...,1.543143,12,1.518215,1.867083
996,hat zonin family among first offer viognier so...,0.057462,100,0.077246,-0.949123


In [13]:
print(X.dtypes)

description_clean     object
points_scaled        float64
variety_encoded        int64
text_length            int64
price_log            float64
price_scaled         float64
dtype: object


In [11]:
# Convert Grid Search results into a DataFrame
results_df = pd.DataFrame(grid_search.cv_results_)

# Select relevant columns for evaluation
results_df = results_df[
    [
        "param_clf",  # Model type
        "param_clf__n_estimators",  # RF: Number of estimators
        "param_clf__max_depth",  # RF: Max depth
        "param_clf__C",  # Logistic Regression / SVM regularization
        # "param_clf__alpha",  # Naive Bayes smoothing
        "mean_test_score",  # Mean accuracy score
        "std_test_score",  # Standard deviation of accuracy score
        "param_clf__kernel",
        "param_clf__gamma",
        "param_clf__degree"
    ]
].sort_values(by="mean_test_score", ascending=False)

results_df


AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [6]:
# save model

In [7]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.model_selection import GridSearchCV

# # Define numeric and text features
# text_feature = "description_clean"
# numeric_features = ["points_scaled", "variety_encoded","text_length","price_log","price_scaled"]

# # Define column transformer (TF-IDF for text, passthrough for numeric)
# preprocessor = ColumnTransformer([
#     ("tfidf", TfidfVectorizer(), text_feature),  # Apply TF-IDF to text
#     ("num", "passthrough", numeric_features)  # Keep numeric features unchanged
# ])

# # Define pipeline with preprocessing + model
# pipeline = Pipeline([
#     ("preprocessor", preprocessor),
#     ("clf", RandomForestClassifier(random_state=42))
# ])

# # Define hyperparameter grid
# param_grid = {
#     "preprocessor__tfidf__max_features": [500, 1000, 2000],  # TF-IDF vocabulary
#     "preprocessor__tfidf__ngram_range": [(1,1), (1,2)],  # Unigrams vs. bigrams
#     "preprocessor__tfidf__min_df": [1, 2],  # Min doc frequency
#     "preprocessor__tfidf__max_df": [0.9, 1.0],  # Max doc frequency
#     "preprocessor__tfidf__stop_words": [None, "english"],  # Stopword removal
#     "clf__n_estimators": [100, 200],  # RandomForest trees
#     "clf__max_depth": [None, 10],  # Max tree depth
# }

# # Ensure data is clean
# df = df.dropna(subset=["description_clean", "country_encoded"])

# # Define X and y correctly
# X = df[[text_feature] + numeric_features]  # Include both text & numeric features
# y = df["country_encoded"]

# # Perform Grid Search with correct feature processing
# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring="accuracy", verbose=1, n_jobs=-1)
# grid_search.fit(X, y)

# # Print best parameters
# print("Best Parameters Found:", grid_search.best_params_)


In [8]:
results = pd.DataFrame(grid_search.cv_results_)
best_three = results.sort_values(by='mean_test_score', ascending=False).head(3)

In [9]:
print(f"TOP 3:\n{best_three['mean_test_score']}")


TOP 3:
170    0.849683
203    0.849683
173    0.849683
Name: mean_test_score, dtype: float64


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV



# Define numeric and text features
text_feature = "description_clean"
numeric_features = ["points_scaled", "variety_encoded", "text_length", "price_log", "price_scaled"]

# Define column transformer (TF-IDF for text, passthrough for numeric)
preprocessor = ColumnTransformer([
    ("tfidf", TfidfVectorizer()),  # Apply TF-IDF to text
    ("num", "passthrough", numeric_features)  # Keep numeric features unchanged
])

# Define different models to try
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=500),
    "SVM": SVC(kernel="linear", probability=True),
    "NaiveBayes": MultinomialNB()
}

# Create a pipeline with a placeholder classifier (will be replaced)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(random_state=42))  # Placeholder, will be changed in GridSearchCV
])

# Define hyperparameter grid for different models
param_grid = [
    {
        "clf": [RandomForestClassifier(random_state=42)],
        "clf__n_estimators": [100, 200],
        "clf__max_depth": [None, 10]
    },
    {
        "clf": [LogisticRegression(max_iter=500)],
        "clf__C": [0.01, 0.1, 1, 10],  # Regularization strength
    },
    {
        "clf": [SVC(kernel="linear", probability=True)],
        "clf__C": [0.01, 0.1, 1, 10]
    },
    {
        "clf": [MultinomialNB()],
        "clf__alpha": [0.1, 0.5, 1.0]  # Smoothing parameter
    }
]

# Perform Grid Search with multiple models
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring="accuracy", verbose=1, n_jobs=-1)

# Ensure data is clean
df = df.dropna(subset=["description_clean", "country_encoded"])

# Define X and y correctly
X = df[[text_feature] + numeric_features]
y = df["country_encoded"]

# Fit grid search
grid_search.fit(X, y)

# Print best parameters and model
print("Best Model:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)


Fitting 3 folds for each of 15 candidates, totalling 45 fits


ValueError: 
All the 45 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 990, in fit_transform
    self._validate_transformers()
  File "/home/jonnyoh/.pyenv/versions/3.12.4/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 517, in _validate_transformers
    names, transformers, _ = zip(*self.transformers)
    ^^^^^^^^^^^^^^^^^^^^^^
ValueError: not enough values to unpack (expected 3, got 2)
