In [1]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Fetch the NLTK data packages needed later in the notebook (quiet=True hides the download logs)
import nltk
nltk.download('punkt', quiet=True)      # tokenizer
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)    # WordNet dictionary
nltk.download('omw-1.4', quiet=True)    # additional linguistic data

True

In [51]:
import re
import string
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.decomposition import LatentDirichletAllocation

# (1) Text Preprocessing

## Strip (1)

`strip()` removes all the whitespace at the beginning and at the end of a string

In [4]:
texts = [
    '  Bonjour, comment ca va ?  ',
    '  Heyyyyy, how are you doing ?  ',
    '  Hallo, wie gehts ?  '
]
texts

['  Bonjour, comment ca va ?  ',
 '  Heyyyyy, how are you doing ?  ',
 '  Hallo, wie gehts ?  ']

In [5]:
[text.strip() for text in texts]

['Bonjour, comment ca va ?',
 'Heyyyyy, how are you doing ?',
 'Hallo, wie gehts ?']

## Strip 2

You can also specify a "list" of characters (in the form of a single and unordered string) to be removed at the
beginning and at the end of a string

In [6]:
text = "abcd Who is abcd ? That's not a real name!!! abcd"
print(text)

print(text.strip('bdac'))
print(text.strip('bdac').strip())

abcd Who is abcd ? That's not a real name!!! abcd
 Who is abcd ? That's not a real name!!! 
Who is abcd ? That's not a real name!!!


## Replace

In [7]:
text = "I love koalas, koalas are the cutest animals on Earth."
print(text)

text.replace("koala", "panda")

I love koalas, koalas are the cutest animals on Earth.


'I love pandas, pandas are the cutest animals on Earth.'

## Split

In [8]:
text = "linkin park / metallica /red hot chili peppers"
text.split("/")

['linkin park ', ' metallica ', 'red hot chili peppers']

## Lowercase

In [9]:
text = "i LOVE football sO mUch. FOOTBALL is my passion. Who else loves fOOtBaLL ?"
print(text)
text.lower()

i LOVE football sO mUch. FOOTBALL is my passion. Who else loves fOOtBaLL ?


'i love football so much. football is my passion. who else loves football ?'

## Numbers

In [10]:
text = "i do not recommend this restaurant, we waited for so long, like 30 minutes, this is ridiculous"
print(text)

cleaned_text = ''.join(char for char in text if not char.isdigit())
cleaned_text

i do not recommend this restaurant, we waited for so long, like 30 minutes, this is ridiculous


'i do not recommend this restaurant, we waited for so long, like  minutes, this is ridiculous'

## Punctuation and Symbols

In [11]:
text = "I love bubble tea! OMG so #tasty @channel XOXO @$ ^_^ "
print(text)
string.punctuation

I love bubble tea! OMG so #tasty @channel XOXO @$ ^_^ 


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
for punctuation in string.punctuation:
    text = text.replace(punctuation, '')

text

'I love bubble tea OMG so tasty channel XOXO   '

## Combo: strip + lowercase + numbers + punctuation/symbols

In [13]:
sentences = [
    " I LOVE Pizza 999 @^_^",
    " Le Wagon is amazing, take care - 666"
]

def basic_cleaning(sentence):
    """Return `sentence` lowercased, without digits or punctuation, and trimmed.

    Steps, in order: lowercase, drop digit characters, drop every character
    listed in `string.punctuation`, then strip the surrounding whitespace.
    """
    lowered = sentence.lower()
    without_digits = ''.join(filter(lambda char: not char.isdigit(), lowered))

    # A deletion table removes all the punctuation characters in one pass
    without_punctuation = without_digits.translate(
        str.maketrans('', '', string.punctuation)
    )

    return without_punctuation.strip()

cleaned = [basic_cleaning(sentence) for sentence in sentences]
cleaned

['i love pizza', 'le wagon is amazing take care']

## Removing Tags with RegEx

We can remove HTML tags using RegEx (https://regexr.com/), as shown in the first cell below.

We can also extract e-mail addresses from a text, as shown in the second cell below.

In [14]:
text = """<head><body>Hello Le Wagon!</body></head>"""
cleaned_text = re.sub('<[^<]+?>', '', text)

print(cleaned_text)

Hello Le Wagon!


In [15]:
txt = '''
    This is a random text, authored by darkvador@gmail.com
    and batman@outlook.com, WOW!
'''

re.findall(r'[\w.+-]+@[\w-]+\.[\w.-]+', txt)

['darkvador@gmail.com', 'batman@outlook.com']

## Cleaning with NLTK

Natural Language Toolkit (NLTK) is an NLP library that provides preprocessing and modeling tools for text
data
 - NLTK official website (https://www.nltk.org/)
 - Installation Documentation (https://www.nltk.org/install.html)

### Tokenizing

In [16]:
text = 'It is during our darkest moments that we must focus to see the light'
print(text)

word_tokens = word_tokenize(text)
print(word_tokens) # print displays the words in one line

It is during our darkest moments that we must focus to see the light


['It', 'is', 'during', 'our', 'darkest', 'moments', 'that', 'we', 'must', 'focus', 'to', 'see', 'the', 'light']


### Stopwords

In [17]:
stop_words = set(stopwords.words('english')) # you can also choose other languages

tokens = [
    "i", "am", "going", "to", "go", "to", "the",
    "club", "and", "party", "all", "night", "long"
]

In [18]:
# What stopwords could be removed
stopwords_removed = [w for w in tokens if w in stop_words]
stopwords_removed

['i', 'am', 'to', 'to', 'the', 'and', 'all']

In [19]:
# Meaningful words of the sentence: every token that is absent from the stopword set
tokens_cleaned = [
    token
    for token in tokens
    if token not in stop_words
]
tokens_cleaned

['going', 'go', 'club', 'party', 'night', 'long']

### Lemmatizing

Let's apply the following steps:
1. Basic cleaning
2. Tokenizing
3. Removing stopwords (if not doing sentiment analysis!)
4. Lemmatizing

In [20]:
sentence = 'He was RUNNING and EATING at the same time =[. He has a bad habit of swimming after playing 3 hours in the Sun =/'
sentence

'He was RUNNING and EATING at the same time =[. He has a bad habit of swimming after playing 3 hours in the Sun =/'

In [21]:
# Step 1: Basic Cleaning
cleaned_sentence = basic_cleaning(sentence)
cleaned_sentence

'he was running and eating at the same time  he has a bad habit of swimming after playing  hours in the sun'

In [22]:
# Step 2: Tokenize
tokenized_sentence = word_tokenize(cleaned_sentence)
print(tokenized_sentence)

['he', 'was', 'running', 'and', 'eating', 'at', 'the', 'same', 'time', 'he', 'has', 'a', 'bad', 'habit', 'of', 'swimming', 'after', 'playing', 'hours', 'in', 'the', 'sun']


In [23]:
# Step 3: Remove Stopwords (keep only the tokens that are not in the stopword set)
tokenized_sentence_no_stopwords = [
    token
    for token in tokenized_sentence
    if token not in stop_words
]
print(tokenized_sentence_no_stopwords)

['running', 'eating', 'time', 'bad', 'habit', 'swimming', 'playing', 'hours', 'sun']


In [24]:
# Step 4: Lemmatizing
# WordNetLemmatizer (https://www.nltk.org/_modules/nltk/stem/wordnet.html) (Only supports English)

# Build the lemmatizer once and reuse it for every word, instead of
# instantiating a new WordNetLemmatizer for each token
lemmatizer = WordNetLemmatizer()

# Lemmatizing the verbs
verb_lemmatized = [
    lemmatizer.lemmatize(word, pos='v') # v --> verbs
    for word in tokenized_sentence_no_stopwords
]

# Lemmatizing the nouns (applied on top of the verb-lemmatized tokens)
noun_lemmatized = [
    lemmatizer.lemmatize(word, pos='n') # n --> nouns
    for word in verb_lemmatized
]

In [25]:
original_vs_lemmatized = pd.DataFrame({
    'original word': tokenized_sentence_no_stopwords,
    'lemmatized verbs': verb_lemmatized,
    'lemmatized nouns': noun_lemmatized
})
original_vs_lemmatized.style.hide(axis='index')

original word,lemmatized verbs,lemmatized nouns
running,run,run
eating,eat,eat
time,time,time
bad,bad,bad
habit,habit,habit
swimming,swim,swim
playing,play,play
hours,hours,hour
sun,sun,sun


# (2) Vectorizing

## Bag-of-Words (BoW)

In [26]:
# In Scikit-Learn, there is a tool called CountVectorizer to generate bag-of-words representations of a set of texts

texts = [
    'the young dog is running with the cat',
    'running is good for your health',
    'your cat is young',
    'young young young young young cat cat cat'
]

count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(texts)
X.toarray()

array([[1, 1, 0, 0, 0, 1, 1, 2, 1, 1, 0],
       [0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1],
       [3, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0]])

In [27]:
count_vectorizer.get_feature_names_out()

array(['cat', 'dog', 'for', 'good', 'health', 'is', 'running', 'the',
       'with', 'young', 'your'], dtype=object)

In [28]:
vectorized_texts = pd.DataFrame(
    X.toarray(),
    columns = count_vectorizer.get_feature_names_out(),
    index = texts
)

vectorized_texts

Unnamed: 0,cat,dog,for,good,health,is,running,the,with,young,your
the young dog is running with the cat,1,1,0,0,0,1,1,2,1,1,0
running is good for your health,0,0,1,1,1,1,1,0,0,0,1
your cat is young,1,0,0,0,0,1,0,0,0,1,1
young young young young young cat cat cat,3,0,0,0,0,0,0,0,0,5,0


## TfidfVectorizer

In [29]:
# raw documents matrix of tf-idf features

# Instantiating the TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer()

# Training it on the texts
weighted_words = pd.DataFrame(tf_idf_vectorizer.fit_transform(texts).toarray(), columns=tf_idf_vectorizer.get_feature_names_out())

weighted_words

Unnamed: 0,cat,dog,for,good,health,is,running,the,with,young,your
0,0.227904,0.357056,0.0,0.0,0.0,0.227904,0.281507,0.714112,0.357056,0.227904,0.0
1,0.0,0.0,0.463709,0.463709,0.463709,0.29598,0.365594,0.0,0.0,0.0,0.365594
2,0.470063,0.0,0.0,0.0,0.0,0.470063,0.0,0.0,0.0,0.470063,0.580622
3,0.514496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.857493,0.0


### Controlling the vocabulary size

We can control the number of words to be vectorized (curse of dimensionality (https://www.analyticsvidhya.com/blog/2021/04/the-curse-of-dimensionality-in-machine-learning/)!)

In [30]:
# Scikit-Learn allows us to customize the CountVectorizer and TfidfVectorizer with key parameters to control vocabulary size
# Key parameters of the models: max_df / min_df; max_features

document_frequency = (weighted_words > 0).sum(axis=0)

df_row = pd.DataFrame([document_frequency], index=["document_frequency"])
df_row

Unnamed: 0,cat,dog,for,good,health,is,running,the,with,young,your
document_frequency,3,1,1,1,1,3,2,1,1,3,2


In [31]:
# max_df in practice: with max_df=2, the words "cat", "is" and "young" are removed
# (each of them appears in 3 of the 4 texts)
count_vectorizer = CountVectorizer(max_df=2)

# Fit on the texts and wrap the counts in a labelled DataFrame in one expression
X = pd.DataFrame(
    count_vectorizer.fit_transform(texts).toarray(),
    columns=count_vectorizer.get_feature_names_out(),
    index=texts,
)
X

Unnamed: 0,dog,for,good,health,running,the,with,your
the young dog is running with the cat,1,0,0,0,1,2,1,0
running is good for your health,0,1,1,1,1,0,0,1
your cat is young,0,0,0,0,0,0,0,1
young young young young young cat cat cat,0,0,0,0,0,0,0,0


In [32]:
# max_features in practice: keep only the 3 most frequent words of the corpus
count_vectorizer = CountVectorizer(max_features=3)

# Fit on the texts and wrap the counts in a labelled DataFrame in one expression
X = pd.DataFrame(
    count_vectorizer.fit_transform(texts).toarray(),
    columns=count_vectorizer.get_feature_names_out(),
    index=texts,
)
X

Unnamed: 0,cat,is,young
the young dog is running with the cat,1,1,1
running is good for your health,0,1,0
your cat is young,1,1,1
young young young young young cat cat cat,3,0,5


## N-grams

In [33]:
# Example: the two following sentences have the exact same representation:
actors_movie = [
    "I like the movie but NOT the actors",
    "I like the actors but NOT the movie"
]

# Vectorize the sentences
count_vectorizer = CountVectorizer()
actors_movie_vectorized = count_vectorizer.fit_transform(actors_movie)

# Show the representations in a nice DataFrame
actors_movie_vectorized = pd.DataFrame(
    actors_movie_vectorized.toarray(),
    columns = count_vectorizer.get_feature_names_out(),
    index = actors_movie
)
# Show the vectorized movies
actors_movie_vectorized

Unnamed: 0,actors,but,like,movie,not,the
I like the movie but NOT the actors,1,1,1,1,1,2
I like the actors but NOT the movie,1,1,1,1,1,2


In [34]:
# Vectorize the sentences
tfidf_vectorizer = TfidfVectorizer()
actors_movie_vectorized = tfidf_vectorizer.fit_transform(actors_movie)

# Show the representations in a nice DataFrame
actors_movie_vectorized = pd.DataFrame(
    actors_movie_vectorized.toarray(),
    columns = tfidf_vectorizer.get_feature_names_out(),
    index = actors_movie
)
# Show the vectorized movies
actors_movie_vectorized

Unnamed: 0,actors,but,like,movie,not,the
I like the movie but NOT the actors,0.333333,0.333333,0.333333,0.333333,0.333333,0.666667
I like the actors but NOT the movie,0.333333,0.333333,0.333333,0.333333,0.333333,0.666667


In [35]:
# In both CountVectorizer and TfidfVectorizer, you can specify the length of your sequences with the parameter 
# ngram_range = (min_n, max_n)

# Vectorize the sentences
count_vectorizer_n_gram = CountVectorizer(ngram_range = (2,2)) # BI-GRAMS
actors_movie_vectorized_n_gram = count_vectorizer_n_gram.fit_transform(actors_movie)

# Show the representations in a nice DataFrame
actors_movie_vectorized_n_gram = pd.DataFrame(
    actors_movie_vectorized_n_gram.toarray(),
    columns = count_vectorizer_n_gram.get_feature_names_out(),
    index = actors_movie
)
# Show the vectorized movies with bigrams
actors_movie_vectorized_n_gram


Unnamed: 0,actors but,but not,like the,movie but,not the,the actors,the movie
I like the movie but NOT the actors,0,1,1,1,1,1,1
I like the actors but NOT the movie,1,1,1,0,1,1,1


In [36]:
# Vectorize the sentences
tfidf_vectorizer = TfidfVectorizer(ngram_range = (2,2))
actors_movie_vectorized = tfidf_vectorizer.fit_transform(actors_movie)

# Show the representations in a nice DataFrame
actors_movie_vectorized = pd.DataFrame(
    actors_movie_vectorized.toarray(),
    columns = tfidf_vectorizer.get_feature_names_out(),
    index = actors_movie
)
# Show the vectorized movies
actors_movie_vectorized

Unnamed: 0,actors but,but not,like the,movie but,not the,the actors,the movie
I like the movie but NOT the actors,0.0,0.378632,0.378632,0.532154,0.378632,0.378632,0.378632
I like the actors but NOT the movie,0.532154,0.378632,0.378632,0.0,0.378632,0.378632,0.378632


# (3) (Multinomial) Naive Bayes Algorithm

The Multinomial Naive Bayes algorithm is a classification algorithm based on Bayes' Theorem in probability theory

The Naive Bayes algorithm makes the strong ("naive") assumption that the words in an e-mail are conditionally independent of each other given the class (spam or not spam)


## The E-mail Classification Problem

In [37]:
data = pd.read_csv("../data/nlp/ham_spam_emails.csv")
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [38]:
data.shape

(5728, 2)

In [39]:
round(data["spam"].value_counts(normalize = True), 2)

spam
0    0.76
1    0.24
Name: proportion, dtype: float64

In [42]:
# Feature (raw e-mail text) and target (spam flag)
X, y = data["text"], data["spam"]

# TF-IDF vectorizer chained with a Multinomial Naive Bayes classifier
pipeline_naive_bayes = make_pipeline(TfidfVectorizer(), MultinomialNB())

# 5-fold cross-validation scored on recall
cv_results = cross_validate(
    pipeline_naive_bayes,
    X,
    y,
    cv=5,
    scoring=["recall"],
)

# Mean recall over the folds, rounded to 2 decimals
average_recall = np.mean(cv_results["test_recall"])
np.round(average_recall, 2)

np.float64(0.45)

## Tuning the Vectorizer and the Naive Bayes Algorithm Simultaneously

In [45]:
# Hyperparameter grid: 4 n-gram ranges x 2 smoothing values = 8 candidates
parameters = {
    'tfidfvectorizer__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3)),
    'multinomialnb__alpha': (0.1, 1),
}

# Grid search over the vectorizer and the classifier at the same time,
# scored on recall with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=pipeline_naive_bayes,
    param_grid=parameters,
    scoring="recall",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(data["text"], data["spam"])

# Best cross-validated score, then the parameters that produced it
print(f"Best Score = {grid_search.best_score_}")
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Score = 0.9524932488436137
Best params = {'multinomialnb__alpha': 0.1, 'tfidfvectorizer__ngram_range': (1, 1)}


# (4) Topic Modeling and Latent Dirichlet Allocation

**Inputs**:
- Document-term matrix: documents to be converted using a vectorizer
- Number of topics: number of topics to be discovered within the documents
    - Each "topic" consists of a set of unordered words (bag-of-words format)
- Number of iterations: LDA is an unsupervised, iterative process

**Output**:
- Topics across different documents/pieces of text
    - These topics can be interpreted as "non-linear Principal Components" of the documents in the corpus

In [46]:
documents = pd.DataFrame(
    ["I like mangos and oranges", "Frogs and turtles live in ponds", "Kittens and puppies are fluffy", 
     "I had a spinach and kiwi smoothie", "My kitten loves strawberries"],
    columns = ["documents"]
)
documents

Unnamed: 0,documents
0,I like mangos and oranges
1,Frogs and turtles live in ponds
2,Kittens and puppies are fluffy
3,I had a spinach and kiwi smoothie
4,My kitten loves strawberries


In [48]:
# Cleaning the dataset

# Built once when the cell runs: reloading the stopword list on every call and
# re-creating the lemmatizer for every word is wasted work when `cleaning` is
# applied to a whole column of documents
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def cleaning(sentence):
    """Clean one raw document and return it as a single space-joined string.

    Steps, in order:
      1. strip the surrounding whitespace and lowercase,
      2. remove digit characters,
      3. remove every character listed in `string.punctuation`,
      4. tokenize with `word_tokenize`,
      5. drop the English stopwords,
      6. lemmatize each remaining token as a verb (pos="v").

    Args:
        sentence: the raw text of one document (str).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Basic cleaning
    sentence = sentence.strip().lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    sentence = sentence.translate(_PUNCTUATION_TABLE)  # single-pass punctuation removal

    # Advanced cleaning
    tokenized_sentence = word_tokenize(sentence)
    tokenized_sentence_cleaned = [
        w for w in tokenized_sentence if w not in _STOP_WORDS
    ]

    # Only verb lemmatization is applied here, so noun plurals that are not
    # also verbs in WordNet are left untouched
    lemmatized = [
        _LEMMATIZER.lemmatize(word, pos="v")
        for word in tokenized_sentence_cleaned
    ]

    return ' '.join(lemmatized)

In [49]:
cleaned_documents = documents["documents"].apply(cleaning)
cleaned_documents

0         like mangos oranges
1      frog turtle live ponds
2       kitten puppies fluffy
3       spinach kiwi smoothie
4    kitten love strawberries
Name: documents, dtype: object

In [50]:
# Vectorizing
vectorizer = TfidfVectorizer()
vectorized_documents = vectorizer.fit_transform(cleaned_documents)
vectorized_documents = pd.DataFrame(
    vectorized_documents.toarray(),
    columns = vectorizer.get_feature_names_out()
)
vectorized_documents

Unnamed: 0,fluffy,frog,kitten,kiwi,like,live,love,mangos,oranges,ponds,puppies,smoothie,spinach,strawberries,turtle
0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5
2,0.614189,0.0,0.495524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.614189,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0
4,0.0,0.0,0.495524,0.0,0.0,0.0,0.614189,0.0,0.0,0.0,0.0,0.0,0.0,0.614189,0.0


In [52]:
# Finding the topics
# Instantiate the LDA
n_components = 2

# LDA starts from a random initialisation: fixing random_state keeps the
# discovered topics identical from one "Restart & Run All" to the next
lda_model = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=100,
    random_state=42,
)

# Fit the LDA on the vectorized documents
lda_model.fit(vectorized_documents)

0,1,2
,n_components,2
,doc_topic_prior,
,topic_word_prior,
,learning_method,'batch'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,100
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [53]:
# Document Mixture (of Topics)
document_topic_mixture = lda_model.transform(vectorized_documents)

document_topic_mixture

array([[0.78813743, 0.21186257],
       [0.80248147, 0.19751853],
       [0.79367021, 0.20632979],
       [0.19156771, 0.80843229],
       [0.79366831, 0.20633169]])

In [56]:
# One column per topic: topic_0, topic_1, ...
topic_columns = [f"topic_{i}" for i in range(n_components)]

# Build the result table in one chain:
#   - topic proportions of each document,
#   - plus the original text of the document,
#   - with rows labelled "sentence 0", "sentence 1", ...
topics_df = (
    pd.DataFrame(document_topic_mixture, columns=topic_columns)
    .assign(**{"Original Text": documents["documents"]})
    .set_axis([f"sentence {i}" for i in range(len(documents))], axis=0)
)

# Display the final result
topics_df

Unnamed: 0,topic_0,topic_1,Original Text
sentence 0,0.788137,0.211863,I like mangos and oranges
sentence 1,0.802481,0.197519,Frogs and turtles live in ponds
sentence 2,0.79367,0.20633,Kittens and puppies are fluffy
sentence 3,0.191568,0.808432,I had a spinach and kiwi smoothie
sentence 4,0.793668,0.206332,My kitten loves strawberries


In [57]:
# Topic Mixture (of Words)
topic_word_mixture = pd.DataFrame(
    lda_model.components_,
    columns = vectorizer.get_feature_names_out()
)
topic_word_mixture

Unnamed: 0,fluffy,frog,kitten,kiwi,like,live,love,mangos,oranges,ponds,puppies,smoothie,spinach,strawberries,turtle
0,1.089751,0.976915,1.465164,0.507777,1.051149,0.976915,1.089738,1.051149,1.051149,0.976915,1.089751,0.507777,0.507777,1.089738,0.976915
1,0.524438,0.523085,0.525883,1.069573,0.526201,0.523085,0.524451,0.526201,0.526201,0.523085,0.524438,1.069573,1.069573,0.524451,0.523085


In [58]:
# What are the five most relevant words for each topic?
def print_topics(lda_model, vectorizer, top_words):
    """Print, for every topic of a fitted LDA model, its highest-weighted words.

    Args:
        lda_model: fitted model exposing `components_` (one row of word weights per topic).
        vectorizer: fitted vectorizer whose `get_feature_names_out()` labels the columns.
        top_words: number of words to display for each topic.

    Returns:
        None. The words and their weights (rounded to 3 decimals) are printed.
    """
    # Word weights of every topic: one row per topic, one column per word
    vocabulary = vectorizer.get_feature_names_out()
    weights_per_topic = pd.DataFrame(lda_model.components_, columns=vocabulary)

    # Walk through the topics row by row and show the heaviest words of each one
    for topic_id, word_weights in weights_per_topic.iterrows():
        strongest = word_weights.sort_values(ascending=False).head(top_words)
        print("-" * 10)
        print(f"For topic {topic_id}, here are the top {top_words}, words with weights:")
        print(strongest.round(3))
        
print_topics(lda_model, vectorizer, 5)

----------
For topic 0, here are the top 5, words with weights:
kitten          1.465
fluffy          1.090
puppies         1.090
strawberries    1.090
love            1.090
Name: 0, dtype: float64
----------
For topic 1, here are the top 5, words with weights:
kiwi        1.070
smoothie    1.070
spinach     1.070
oranges     0.526
mangos      0.526
Name: 1, dtype: float64
