In [1]:
import pandas as pd
import os

# Load the dataset
os.chdir("..")
data_path = 'datasets'
df = pd.read_csv(os.path.join(data_path, 'wine_quality_1000.csv'), index_col=0)

# Display basic information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 87131 to 31482
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   country      1000 non-null   object 
 1   description  1000 non-null   object 
 2   points       1000 non-null   int64  
 3   price        1000 non-null   float64
 4   variety      1000 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 46.9+ KB


In [2]:
df.head(5)

Unnamed: 0,country,description,points,price,variety
87131,US,"Very good Dry Creek Zin, robust and dry and sp...",89,25.0,Zinfandel
57952,France,A very herbaceous character makes this wine se...,84,20.0,Bordeaux-style White Blend
96046,US,"A little simple and easy, but there's a wealth...",84,19.0,Rosé
31920,US,From a dry-farmed vineyard and treated to wild...,88,38.0,Petite Sirah
6091,US,"From a site near Annapolis, this wine shows a ...",91,62.0,Pinot Noir


In [3]:
df.isnull().sum()

country        0
description    0
points         0
price          0
variety        0
dtype: int64

In [4]:
duplicates = df.duplicated().sum()
duplicates

2

In [5]:
df_cleaned = df.drop_duplicates()

In [6]:
import nltk
nltk.download("punkt")  # Required for word_tokenize
nltk.download("stopwords")  # Required for stopwords
nltk.download("wordnet")  # Required for Lemmatization


[nltk_data] Downloading package punkt to /home/jonnyoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jonnyoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jonnyoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
custom_stopwords = set([
    "wine", "flavor", "taste", "aroma",  # Example wine-related words
    "bottle", "vintage", "palate"       # You can add more words here
])

In [12]:
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

def clean (text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop_words = set(stopwords.words('english')) # Make stopword list
    stop_words.update(custom_stopwords)
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
    cleaned = ' '.join(lemmatized) # Join back to a string
    return cleaned

# Apply to all texts
df_cleaned['description_clean'] = df_cleaned.description.apply(clean)

df_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['description_clean'] = df_cleaned.description.apply(clean)


Unnamed: 0,country,description,points,price,variety,description_clean
87131,US,"Very good Dry Creek Zin, robust and dry and sp...",89,25.0,Zinfandel,good dry creek zin robust dry spicy really get...
57952,France,A very herbaceous character makes this wine se...,84,20.0,Bordeaux-style White Blend,herbaceous character make seem rather thin sof...
96046,US,"A little simple and easy, but there's a wealth...",84,19.0,Rosé,little simple easy wealth raspberry strawberry...
31920,US,From a dry-farmed vineyard and treated to wild...,88,38.0,Petite Sirah,dry farmed vineyard treated wild yeast minimal...
6091,US,"From a site near Annapolis, this wine shows a ...",91,62.0,Pinot Noir,site near annapolis show preponderance dark gr...


In [9]:
df_cleaned.to_csv('datasets/wine_quality_cleaned.csv', index=False)