<a href="https://colab.research.google.com/github/Mjsentiment/python-mastery-roadmap/blob/studyai/module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS

# Load the en_core_web_sm spaCy pipeline once at module level; Refine() below
# calls this shared `nlp` object for every text it processes.
nlp = en_core_web_sm.load()

def Refine(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is lower-cased and passed through the module-level spaCy
    pipeline ``nlp``. Tokens that are not purely alphabetic, or whose text is
    listed in ``STOP_WORDS``, are dropped; the lemmas of the remaining tokens
    are joined with single spaces.

    Args:
        text: Raw text to clean. Non-string values (for example ``None`` or
            the float ``NaN`` that pandas uses for a missing cell) are treated
            as empty text instead of raising ``AttributeError``.

    Returns:
        str: The kept lemmas joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    # Guard first so DataFrame.apply(Refine) survives missing descriptions.
    if not isinstance(text, str):
        return ""

    doc = nlp(text.lower())
    refined_tokens = [
        token.lemma_
        for token in doc
        # The stop-word test uses the (already lower-cased) surface form,
        # not the lemma.
        if token.is_alpha and token.text not in STOP_WORDS
    ]
    return " ".join(refined_tokens)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import re

def CountVectorization(sentences):
    """Build a bag-of-words vocabulary and per-sentence count vectors.

    Every character that is neither a word character nor whitespace is
    stripped from each sentence, and the result is split on whitespace.
    No case folding is applied, so tokens are case-sensitive.

    Args:
        sentences: Iterable of strings (a list, a pandas Series, ...).

    Returns:
        tuple: ``(corpus, vectors)`` where ``corpus`` is the list of unique
        tokens in order of first appearance and ``vectors`` is a list with one
        list of ints per sentence; ``vectors[i][j]`` is the number of times
        ``corpus[j]`` occurs in sentence ``i``.
    """
    tokenized = [
        re.sub(r"[^\w\s]", "", sentence).split()
        for sentence in sentences
    ]

    # A dict keeps first-seen order and gives O(1) membership tests. It also
    # guarantees that a word repeated inside a single sentence is added to the
    # vocabulary only once (filtering against a list that is extended after
    # the filter has run lets such repeats through).
    vocabulary = {}
    for tokens in tokenized:
        for word in tokens:
            vocabulary.setdefault(word, None)
    corpus = list(vocabulary)

    vectors = []
    for tokens in tokenized:
        counts = Counter(tokens)  # built once per sentence, not once per word
        vectors.append([counts[word] for word in corpus])
    return corpus, vectors

def TFIDFVectorization(sentences):
    """Fit a default ``TfidfVectorizer`` on ``sentences`` and return its output.

    Args:
        sentences: Iterable of text documents to fit and transform.

    Returns:
        tuple: ``(feature_names, weights)`` where ``feature_names`` is the
        result of ``get_feature_names_out()`` on the fitted vectorizer and
        ``weights`` is the fitted TF-IDF matrix converted to a dense array
        with ``toarray()``.
    """
    tfidf = TfidfVectorizer()
    dense_weights = tfidf.fit_transform(sentences).toarray()
    feature_names = tfidf.get_feature_names_out()
    return feature_names, dense_weights


In [None]:
import pandas as pd

# Read Wine.csv (path relative to the working directory) into the DataFrame
# that the cells below operate on.
df = pd.read_csv("Wine.csv")


In [None]:
# Lemmatise each review. The loaded frame names this column "description"
# (lowercase); indexing df["Description"] raises KeyError.
df["Refined-Description"] = df["description"].apply(Refine)

# Bag-of-words counts: one list of ints per row, aligned with `corpus`.
corpus, count_vecs = CountVectorization(df["Refined-Description"])
df["CountVectorizer"] = count_vecs

# TF-IDF weights: the dense array is converted to one list per row so it can
# be stored in a single column, aligned with `features`.
features, tfidf_vecs = TFIDFVectorization(df["Refined-Description"])
df["TF-IDF Vectorizer"] = tfidf_vecs.tolist()


In [None]:
# Print the column names of the frame as a plain Python list.
print(df.columns.tolist())


['Unnamed: 0', 'winery', 'country', 'points', 'description', 'designation']


In [None]:
# Preview the first rows of the frame.
df.head()


Unnamed: 0.1,Unnamed: 0,winery,country,points,description,designation
0,0,Heitz,US,96,This tremendous 100% varietal wine hails from ...,Martha's Vineyard
1,1,Bodega Carmen Rodríguez,Spain,96,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva
2,2,Macauley,US,96,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest
3,3,Ponzi,US,96,"This spent 20 months in 30% new French oak, an...",Reserve
4,4,Domaine de la Bégude,France,95,"This is the top wine from La Bégude, named aft...",La Brûlade


In [None]:
# Run Refine over every value of the "description" column and store the
# result in a new "Refined-Description" column.
df["Refined-Description"] = df["description"].apply(Refine)


In [None]:
# Show the raw and refined text side by side for the first rows.
df[["description", "Refined-Description"]].head()


Unnamed: 0,description,Refined-Description
0,This tremendous 100% varietal wine hails from ...,tremendous varietal wine hail oakville age yea...
1,"Ripe aromas of fig, blackberry and cassis are ...",ripe aroma fig blackberry cassis soften sweete...
2,Mac Watson honors the memory of a wine once ma...,mac watson honor memory wine mother tremendous...
3,"This spent 20 months in 30% new French oak, an...",spend month new french oak incorporate fruit p...
4,"This is the top wine from La Bégude, named aft...",wine la bégude name high point vineyard foot s...


In [None]:
# Preview the refined column on its own.
df["Refined-Description"].head()


Unnamed: 0,Refined-Description
0,tremendous varietal wine hail oakville age yea...
1,ripe aroma fig blackberry cassis soften sweete...
2,mac watson honor memory wine mother tremendous...
3,spend month new french oak incorporate fruit p...
4,wine la bégude name high point vineyard foot s...


In [None]:
# Report the row count next to the number of non-null refined descriptions.
row_count = len(df)
refined_present = df["Refined-Description"].notna().sum()
print("Total rows processed:", row_count)
print("Non-null refined descriptions:", refined_present)


Total rows processed: 1990
Non-null refined descriptions: 1990
