# **Text Processing**

In [64]:
# Import necessary libraries
import pandas as pd
import numpy as np

## **SECTION 0: Preparing Dataset for Preprocessing**

In [65]:
data = [
    "When life gives you lemons, make lemonade! 🙂",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon — there's a great sale today.",
    "Does Maven Market carry Eureka lemons or Meyer lemons?",
    "An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",
    "iced tea is my favorite"
]

In [66]:
data

['When life gives you lemons, make lemonade! 🙂',
 'She bought 2 lemons for $1 at Maven Market.',
 'A dozen lemons will make a gallon of lemonade. [AllRecipes]',
 'lemon, lemon, lemons, lemon, lemon, lemons',
 "He's running to the market to get a lemon — there's a great sale today.",
 'Does Maven Market carry Eureka lemons or Meyer lemons?',
 'An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]',
 'iced tea is my favorite']

In [67]:
# converting list to dataframe
data_df = pd.DataFrame(data, columns=['Sentence'])
data_df

Unnamed: 0,Sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


*Purpose: Convert the list into a pandas DataFrame for easier analysis.*

In [68]:
# Display full text in DataFrame
pd.set_option('display.max_colwidth', None)
data_df

Unnamed: 0,Sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


*Purpose: Configure pandas to display complete sentence content without truncation, which is essential for preprocessing*

## **SECTION 1: Pre-processing**

In [69]:
# Copy the original data
spacy_df = data_df.copy()
spacy_df

Unnamed: 0,Sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


*Purpose: Work on a copy so that the original data is preserved.*

### **1.1 Normalization**

In [70]:
# lowercasing the text for consistency and store the results in a new column called 'clean_sentence'
spacy_df['clean_sentence'] = spacy_df['Sentence'].str.lower()
spacy_df

Unnamed: 0,Sentence,clean_sentence
0,"When life gives you lemons, make lemonade! 🙂","when life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade. [allrecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.,he's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea. [wikipedia]"
7,iced tea is my favorite,iced tea is my favorite


### **1.2 Text Cleaning**

In [71]:
# Remove the literal "[wikipedia]" citation tag from the lowercased text.
# regex=False is passed explicitly: pandas < 2.0 defaulted to regex=True, in which
# case '[wikipedia]' is a character class and every w/i/k/p/e/d/a letter would be
# deleted instead of the tag. Other bracketed tags (e.g. "[allrecipes]") are not
# affected by this step.
spacy_df['clean_sentence'] = spacy_df['clean_sentence'].str.replace('[wikipedia]', '', regex=False)
spacy_df

Unnamed: 0,Sentence,clean_sentence
0,"When life gives you lemons, make lemonade! 🙂","when life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade. [allrecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.,he's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea."
7,iced tea is my favorite,iced tea is my favorite


In [72]:
# Advanced cleaning using regex

# Alternatives, in order: http(s) URLs, www. URLs, <...> tags, e-mail addresses,
# @mentions, #hashtags, then any single character that is not an ASCII letter,
# a digit, or whitespace.
combined = r'https?://\S+|www\.\S+|<.*?>|\S+@\S+\.\S+|@\w+|#\w+|[^A-Za-z0-9\s]'

# Replace every match with a space, squeeze whitespace runs down to one space,
# then trim leading/trailing whitespace.
spacy_df['clean_sentence'] = (
    spacy_df['clean_sentence']
    .str.replace(combined, ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

spacy_df

Unnamed: 0,Sentence,clean_sentence
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade allrecipes
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,he s running to the market to get a lemon there s a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


*Purpose: Use regular expressions to remove URLs, HTML tags, email addresses, @mentions, hashtags, and any remaining non-alphanumeric characters, then collapse repeated whitespace.*

### **1.3 Advanced Text Processing with spaCy**

In [73]:
import spacy

In [74]:
# Load the pre-trained pipeline

nlp = spacy.load('en_core_web_sm')

### **1.3.1 Tokenization**

**Tokenization** splits text into individual units (tokens) such as words, punctuation marks, or numbers. Modern tokenizers handle complex cases like contractions, compound words, and special characters intelligently.

In [75]:

# Create a spaCy doc object from the first sentence
doc = nlp(spacy_df.loc[0, 'clean_sentence'])

# Extract tokens as text strings
[token.text for token in doc]
# Output: ['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade']

# Extract tokens as spaCy objects (with linguistic attributes)
[token for token in doc]
# Output: [when, life, gives, you, lemons, make, lemonade]


[when, life, gives, you, lemons, make, lemonade]

### **1.3.2 Lemmatization**

**Lemmatization** reduces words to their base or root form (lemma) using linguistic knowledge. Unlike stemming, which simply removes suffixes, lemmatization considers the word's part of speech and meaning to find the correct root form.

In [76]:

# Extract lemmatized forms

[token.lemma_ for token in doc]

# Output: ['when', 'life', 'give', 'you', 'lemon', 'make', 'lemonade']


['when', 'life', 'give', 'you', 'lemon', 'make', 'lemonade']

### **1.3.3 Stop Words Removal**

**Stop words** are common words that carry little semantic meaning and are often filtered out to focus on more meaningful content. Examples include "the", "and", "is", "in", etc.

In [77]:

# View all English stop words in spaCy
# (a bare expression that is not the last line of a cell is evaluated but not
# displayed, so this list is not rendered when the whole cell runs)

list(nlp.Defaults.stop_words)

print(f"Total stop words: {len(list(nlp.Defaults.stop_words))}") # 326 stop words



# Remove stop words: keep only the tokens of `doc` whose is_stop flag is False

[token for token in doc if  not token.is_stop]

# Output: [life, gives, lemons, lemonade]



# Combine lemmatization and stop word removal: take lemma_ of each kept token

[token.lemma_ for token in doc if  not token.is_stop]

# Output: ['life', 'give', 'lemon', 'lemonade']



# Convert back to sentence format: join the kept lemmas with single spaces.
# This is the last expression in the cell, so it is the value that is displayed.

norm = [token.lemma_ for token in doc if  not token.is_stop]

' '.join(norm) # Output: 'life give lemon lemonade'


Total stop words: 326


'life give lemon lemonade'

## **Section 2: Creating Reusable Functions**

Creating modular, reusable functions is essential for maintainable code and consistent preprocessing across different datasets.

In [78]:

# Function for lemmatization and stop word removal

def token_lemma_stopw(text):
    """Lemmatize ``text`` with the global spaCy pipeline and drop stop words.

    The string is passed through ``nlp``; tokens whose ``is_stop`` flag is set
    are skipped, and the ``lemma_`` of every remaining token is joined with
    single spaces into one string, which is returned.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)



# Apply to entire dataset

spacy_df.clean_sentence.apply(token_lemma_stopw)


0                       life give lemon lemonade
1                     buy 2 lemon 1 maven market
2          dozen lemon gallon lemonade allrecipe
3            lemon lemon lemon lemon lemon lemon
4          s run market lemon s great sale today
5    maven market carry eureka lemon meyer lemon
6       arnold palmer half lemonade half ice tea
7                               ice tea favorite
Name: clean_sentence, dtype: object

## **Section 3: Complete NLP Pipeline**

An **NLP pipeline** combines multiple preprocessing steps into a single, streamlined workflow. This approach ensures consistency and makes it easy to apply the same transformations to new data.

In [79]:

def lower_replace(series):
    """Lowercase a Series of strings and strip noise with the cleaning regex.

    Mirrors the step-by-step cleaning shown earlier in the notebook: lowercase,
    replace every regex match with a space, collapse whitespace runs to a single
    space, and trim both ends. Without the last two steps the replacement spaces
    pile up and are handed to the tokenizer as extra whitespace.

    Parameters
    ----------
    series : pandas.Series
        Raw text, one document per element.

    Returns
    -------
    pandas.Series
        Cleaned text with the same index as ``series``.

    Notes
    -----
    Bracketed citation tags are not removed as a unit here: only the brackets
    are replaced, so "[Wikipedia]" leaves the word "wikipedia" in the output.
    """
    # Alternatives, in order: http(s) URLs, www. URLs, <...> tags, e-mail addresses,
    # @mentions, #hashtags, then any single character that is not an ASCII letter,
    # a digit, or whitespace.
    combined = r'https?://\S+|www\.\S+|<.*?>|\S+@\S+\.\S+|@\w+|#\w+|[^A-Za-z0-9\s]'

    output = series.str.lower()
    output = output.str.replace(combined, ' ', regex=True)

    # Normalize whitespace left behind by the substitutions above.
    output = output.str.replace(r'\s+', ' ', regex=True).str.strip()

    return output


def nlp_pipeline(series):
    """Run the full preprocessing chain on a Series of raw text.

    First applies ``lower_replace`` to the whole Series, then maps
    ``token_lemma_stopw`` over each element, and returns the resulting Series.
    """
    return lower_replace(series).apply(token_lemma_stopw)


# Apply complete pipeline

cleaned_text = nlp_pipeline(data_df.Sentence)


# Save processed data for future use

pd.to_pickle(cleaned_text, 'preprocessed_text.pkl')


## **Section 4: Word Representation (Vectorization)**

**Vectorization** converts preprocessed text into numerical representations that machine learning algorithms can process. Text must be transformed into vectors (arrays of numbers) because algorithms cannot directly work with text strings.

### **Count Vectorization (Bag of Words)**

**Count Vectorization** creates a matrix where each row represents a document and each column represents a unique word in the corpus. Cell values indicate how many times each word appears in each document. This approach ignores word order but captures word frequency.

In [80]:

%pip install scikit-learn

# Load preprocessed data
# NOTE: unpickling can execute arbitrary code. This is acceptable here only
# because 'preprocessed_text.pkl' is the file this notebook wrote itself in the
# pipeline section; do not point this at a pickle from an untrusted source.

import pandas as pd

series = pd.read_pickle('preprocessed_text.pkl')

from sklearn.feature_extraction.text import CountVectorizer

# Create Count Vectorizer with default settings and fit it on the loaded text;
# `bow` is the resulting document-term count matrix.

cv = CountVectorizer()
bow = cv.fit_transform(series)

# Convert to DataFrame for visualization: one row per document, one column per
# vocabulary term (column labels come from the fitted vectorizer).

pd.DataFrame(bow.toarray(), columns=cv.get_feature_names_out())


Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,allrecipe,arnold,buy,carry,dozen,eureka,favorite,gallon,give,great,...,life,market,maven,meyer,palmer,run,sale,tea,today,wikipedia
0,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
2,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,1,0,1,0
5,0,0,0,1,0,1,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
7,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### **Advanced Count Vectorization**

In [81]:

# Count Vectorizer with filtering:
#   stop_words='english' -> remove English stop words
#   ngram_range=(1,1)    -> use only single words (unigrams)
#   min_df=2             -> include words that appear in at least 2 documents
cv1 = CountVectorizer(stop_words='english', ngram_range=(1,1), min_df=2)

# Fit on the preprocessed text and wrap the counts in a labelled DataFrame.
bow1 = cv1.fit_transform(series)
bow1_df = pd.DataFrame(bow1.toarray(), columns=cv1.get_feature_names_out())

# Calculate term frequencies: column-wise sum gives each term's total count.
term_freq = bow1_df.sum()


## **Section 5: TF-IDF (Term Frequency-Inverse Document Frequency)**

**TF-IDF** addresses a key limitation of simple count vectorization by considering both term frequency (how often a word appears in a document) and inverse document frequency (how rare the word is across the entire corpus).

- **Formula:**  
  $$ TF\text{-}IDF = TF \times IDF $$

- **TF (Term Frequency):**  
  $$
  TF = \frac{\text{Number of times word appears in document}}{\text{Total words in document}}
  $$

- **IDF (Inverse Document Frequency):**  
  $$
  IDF = \log \left( \frac{\text{Total documents}}{\text{Documents containing the word}} \right)
  $$

In [82]:

from sklearn.feature_extraction.text import TfidfVectorizer



# Basic TF-IDF vectorization with default settings.
# Every statement in this cell is an assignment, so nothing is displayed;
# inspect tvidf_df / tvidf1_df in a following cell to see the weights.

tv = TfidfVectorizer()

tvidf = tv.fit_transform(series)

tvidf_df = pd.DataFrame(tvidf.toarray(), columns=tv.get_feature_names_out())



# TF-IDF with filtering: same steps, but with a minimum document frequency

tv1 = TfidfVectorizer(min_df=2) # Words must appear in at least 2 documents

tvidf1 = tv1.fit_transform(series)

tvidf1_df = pd.DataFrame(tvidf1.toarray(), columns=tv1.get_feature_names_out())


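*Note:*
- Values closer to 1 indicate words that are highly distinctive for that document.

- Values closer to 0 indicate words that are common across the corpus; a value of exactly 0 means the word does not appear in the document.

- scikit-learn's `TfidfVectorizer` uses a smoothed IDF and L2-normalizes each row by default, so the numbers will not match the textbook formulas above exactly.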

### **N-gram Analysis**

In [83]:

# Unigram + bigram TF-IDF (single words and pairs of consecutive words)

tv2 = TfidfVectorizer(ngram_range=(1,2)) # Include both unigrams and bigrams

tvidf2 = tv2.fit_transform(series)

tvidf2_df = pd.DataFrame(tvidf2.toarray(), columns=tv2.get_feature_names_out())



# Analyze feature importance: sum each feature's TF-IDF weight over all
# documents and list the features from highest to lowest total.

tvidf2_df.sum().sort_values(ascending=False)


lemon                 1.583310
lemon lemon           0.857624
market                0.767950
lemonade              0.743321
ice tea               0.625522
ice                   0.625522
tea                   0.625522
maven                 0.621858
maven market          0.621858
half                  0.505881
favorite              0.493436
tea favorite          0.493436
lemon maven           0.439482
buy                   0.439482
buy lemon             0.439482
give lemon            0.416207
life                  0.416207
lemon lemonade        0.416207
give                  0.416207
life give             0.416207
gallon lemonade       0.358685
dozen lemon           0.358685
allrecipe             0.358685
dozen                 0.358685
gallon                0.358685
lemonade allrecipe    0.358685
lemon gallon          0.358685
sale today            0.319884
today                 0.319884
great sale            0.319884
great                 0.319884
market lemon          0.319884
lemon gr

*Purpose:*

- Capture phrase-level information with bigrams

- Examples: "arnold palmer", "buy lemon", "ice tea"

- Preserve some context that unigrams lose