In [1]:
import pandas as pd
import os

# Load the dataset
os.chdir("..")
data_path = 'datasets'
df = pd.read_csv(os.path.join(data_path, 'wine_quality_cleaned.csv'))
df = df.drop(columns=['Unnamed: 0'])
df['description_length'] = df['description'].str.len()

df.head()

Unnamed: 0,country,description,points,price,variety,description_length
0,US,good dry creek zin robust dry spicy really get...,89,25.0,zinfandel,175
1,France,herbaceous character make wine seem rather thi...,84,20.0,bordeaux style white blend,115
2,US,little simple easy wealth raspberry strawberry...,84,19.0,rosé,110
3,US,dry farmed vineyard treated wild yeast minimal...,88,38.0,petite sirah,172
4,US,site near annapolis wine show preponderance da...,91,62.0,pinot noir,169


In [2]:
# Isolating the target variable
X = df.drop(columns = 'country')
y = df['country'].map({'US':1, 'Italy':2, "France":3, "Spain":4})

In [3]:
from sklearn.model_selection import train_test_split

# Split into test/ train, taking into account stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify = y)

## Traditional ML (NLP)

In [4]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


pipeline_naive_bayes = make_pipeline(
    TfidfVectorizer(lowercase=True),
    MultinomialNB()
)

In [5]:
cv_results = cross_validate(pipeline_naive_bayes, X['description'], y, cv = 5, scoring = ["accuracy"])
average_recall = cv_results["test_accuracy"].mean()
round(average_recall,2)

0.64

In [6]:
pipeline_naive_bayes.get_params()

{'memory': None,
 'steps': [('tfidfvectorizer', TfidfVectorizer()),
  ('multinomialnb', MultinomialNB())],
 'transform_input': None,
 'verbose': False,
 'tfidfvectorizer': TfidfVectorizer(),
 'multinomialnb': MultinomialNB(),
 'tfidfvectorizer__analyzer': 'word',
 'tfidfvectorizer__binary': False,
 'tfidfvectorizer__decode_error': 'strict',
 'tfidfvectorizer__dtype': numpy.float64,
 'tfidfvectorizer__encoding': 'utf-8',
 'tfidfvectorizer__input': 'content',
 'tfidfvectorizer__lowercase': True,
 'tfidfvectorizer__max_df': 1.0,
 'tfidfvectorizer__max_features': None,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 1),
 'tfidfvectorizer__norm': 'l2',
 'tfidfvectorizer__preprocessor': None,
 'tfidfvectorizer__smooth_idf': True,
 'tfidfvectorizer__stop_words': None,
 'tfidfvectorizer__strip_accents': None,
 'tfidfvectorizer__sublinear_tf': False,
 'tfidfvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidfvectorizer__tokenizer': None,
 'tfidfvectorizer__use_idf': True

In [7]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'tfidfvectorizer__max_df': (0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67),  # Controls max doc frequency
    'tfidfvectorizer__ngram_range': [(1,1), (1,2), (1,3)],  # Unigrams, bigrams, trigrams
    'multinomialnb__alpha': (0.01, 0.015, 0.02, 0.025)  # Smoothing parameter for Naïve Bayes
}


# Perform Grid Search
grid_search = GridSearchCV(
    pipeline_naive_bayes,
    parameters,
    scoring = "accuracy",
    cv = 5,
    n_jobs=-1,
    verbose=1
)


In [8]:
grid_search.fit(X_train['description'], y_train)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 84 candidates, totalling 420 fits
Best Score = 0.835
Best params = {'multinomialnb__alpha': 0.01, 'tfidfvectorizer__max_df': 0.61, 'tfidfvectorizer__ngram_range': (1, 2)}


In [9]:
irst_param_8362 = {'multinomialnb__alpha': 0.03, 'tfidfvectorizer__max_df': 0.87, 'tfidfvectorizer__min_df': 0.003, 'tfidfvectorizer__ngram_range': (1, 2)}

In [10]:
y_pred = grid_search.best_estimator_.predict(X_test['description'])

In [11]:
pipeline_naive_bayes_finetuned = make_pipeline(
    TfidfVectorizer(lowercase=True,
                    max_df = 0.61,
                    ngram_range=(1,2)),
    MultinomialNB(alpha = 0.015)
)
pipeline_naive_bayes_finetuned.fit(X_train['description'], y_train)

In [12]:
from sklearn.metrics import accuracy_score

y_pred = pipeline_naive_bayes_finetuned.predict(X_test['description'])
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.855


### Removing stopwords

In [13]:
def remove_additional_stopwords(text, extra_stopwords):
    tokenized = text.split()  # Assume text is already tokenized and cleaned
    filtered_text = ' '.join([word for word in tokenized if word not in extra_stopwords])
    return filtered_text

# Define the additional custom stopwords to remove
custom_stopwords = {
    "wine", "flavors", "fruit", "tannins", "cherry",
    "finish", "acidity"}


# Make copies of X_train and X_test before modifying them
X_train_original = X_train.copy()
X_test_original = X_test.copy()

# Apply stopword removal to the "description" column
X_train["description"] = X_train["description"].apply(lambda x: remove_additional_stopwords(x, custom_stopwords))
X_test["description"] = X_test["description"].apply(lambda x: remove_additional_stopwords(x, custom_stopwords))

In [14]:
pipeline_naive_bayes_finetuned.fit(X_train['description'], y_train)

In [15]:
from sklearn.metrics import accuracy_score

y_pred = pipeline_naive_bayes_finetuned.predict(X_test['description'])
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.86


## Traditional ML (multi features)

In [16]:
#robust scaler for price , standard scaler for points& description length
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to compute description length
def add_description_length(df):
    df = df.copy()  # Ensure we don’t modify original data
    df["description_length"] = df["description"].str.len()  # Compute text length
    return df

# Function to concatinate text
def concat_description_variety(df):
    df = df.copy()
    df["combined_text"] = df["description"] + " " + df["variety"]
    return df

desc_length_transformer = FunctionTransformer(add_description_length)
concat_transformer = FunctionTransformer(concat_description_variety)


ct = ColumnTransformer(
    [("ss", StandardScaler(), ["points", 'description_length']),
     ("rs", RobustScaler(), ["price"]),
     ("ohe", OneHotEncoder(handle_unknown="ignore"), ["variety"]),  # OneHotEncoder for categorical variable
     ("tfidf", TfidfVectorizer(lowercase=True,
                    max_df = 0.61,
                    ngram_range=(1,2)), "combined_text")
     ])

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the base pipeline
pipeline = Pipeline([
    ("feature_engineering", desc_length_transformer),
    ("text_concat", concat_transformer),
    ("preprocessor", ct),  # Preprocessing step from your previous code
    ("classifier", LogisticRegression(max_iter=1000))  # Placeholder, will be replaced in GridSearchCV
])

# Define parameter grid with multiple models
param_grid = [
    {
        "classifier": [RandomForestClassifier()],
        "classifier__n_estimators": [100, 200],  # Number of trees
        "classifier__max_depth": [None, 10, 20],  # Tree depth
    },
    {
        "classifier": [GradientBoostingClassifier()],
        "classifier__n_estimators": [100, 200],
        "classifier__learning_rate": [0.05, 0.1],  # Learning rate
    },
    {
        "classifier": [SVC()],
        "classifier__C": [0.1, 1, 10],  # Regularization strength
        "classifier__kernel": ["linear", "rbf"],  # Kernel types
    },
    {
        "classifier": [KNeighborsClassifier()],
        "classifier__n_neighbors": [3, 5, 7],  # Number of neighbors
        "classifier__weights": ["uniform", "distance"],  # Weighting strategy
    }
]

# Set up GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)

# Run grid search
grid_search.fit(X_train_original, y_train)

# Get best model and score
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_

# Print best parameters
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", round(best_score, 4))


Fitting 5 folds for each of 22 candidates, totalling 110 fits
[CV] END classifier=RandomForestClassifier(), classifier__max_depth=10, classifier__n_estimators=100; total time=   1.0s
[CV] END classifier=RandomForestClassifier(), classifier__max_depth=20, classifier__n_estimators=100; total time=   1.3s
[CV] END classifier=RandomForestClassifier(), classifier__max_depth=10, classifier__n_estimators=100; total time=   1.5s
[CV] END classifier=RandomForestClassifier(), classifier__max_depth=10, classifier__n_estimators=100; total time=   1.6s
[CV] END classifier=RandomForestClassifier(), classifier__max_depth=10, classifier__n_estimators=200; total time=   1.6s
[CV] END classifier=RandomForestClassifier(), classifier__max_depth=10, classifier__n_estimators=200; total time=   1.7s
[CV] END classifier=RandomForestClassifier(), classifier__max_depth=None, classifier__n_estimators=100; total time=   2.1s
[CV] END classifier=RandomForestClassifier(), classifier__max_depth=10, classifier__n_est

In [18]:
from sklearn.metrics import accuracy_score

y_pred_grid = best_model.predict(X_test_original)
print("Accuracy: ", accuracy_score(y_test, y_pred_grid))

Accuracy:  0.875


In [19]:
import joblib
joblib.dump(best_model, "artifacts/best_model.pkl")

['artifacts/best_model.pkl']