In [82]:
import pandas as pd
import regex as re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [83]:
doc_1 = '''<b>Betty Botter bought a bit of butter</b>.
The butter Betty Botter bought was a bit bitter
And made her batter bitter.
But a bit of better butter makes better batter.
So Betty Botter bought a bit of better butter
Making Betty Botter’s bitter batter better'''


doc_2 = '''1234567890 Whether the weather be fine,
Or whether the weather be not,
Whether the weather be cold,
Or whether the weather be hot,
We’ll weather the weather,
Whatever the weather,
Whether we like it or not.'''


doc_3 = '''To sit in solemn silence in a dull, dark dock,
In a pestilential prison, with a life-long lock,
Awaiting the sensation of a short, sharp shock,
From a cheap and chippy chopper on a big black block!
To sit in solemn silence in a dull, dark dock,
In a pestilential prison, with a life-long lock,
Awaiting the sensation of a short, sharp shock,
From a cheap and chippy chopper on a big black block!
A dull, dark dock, a life-long lock,
A short, sharp shock, a big black block!
To sit in solemn silence in a pestilential prison,
And awaiting the sensation
From a cheap and chippy chopper on a big black block!
https://speakingtongue.com/long-tongue-twisters/'''


doc_4 = '''A tree toad loved a she-toad
Who lived up in a tree.
He was a two-toed tree toad
But a three-toed toad was she.
The two-toed tree toad tried to win
The three-toed she-toad’s heart,
For the two-toed tree toad loved the ground
That the three-toed tree toad trod.
But the two-toed tree toad tried in vain.
He couldn’t please her whim.
From her tree toad bower
With her three-toed power
The she-toad vetoed him.'''

doc_5 = '''Yellow butter, purple jelly, red jam, black bread.
Spread it thick, say it quick!
Yellow butter, purple jelly, red jam, black bread.
Spread it thicker, say it quicker!
Yellow butter, purple jelly, red jam, black bread.
Don’t eat with your mouth full!'''

In [84]:
df_org = pd.DataFrame({'docs' : [doc_1 , doc_2 , doc_3 , doc_4 , doc_5]})

In [85]:
df = df_org.copy()

In [86]:
df

Unnamed: 0,docs
0,<b>Betty Botter bought a bit of butter</b>.\nT...
1,"1234567890 Whether the weather be fine,\nOr wh..."
2,"To sit in solemn silence in a dull, dark dock,..."
3,A tree toad loved a she-toad\nWho lived up in ...
4,"Yellow butter, purple jelly, red jam, black br..."


In [87]:
def cleaning_text(data , stem = False):
    
    # Removing htnl tags
    html_tag = re.sub(r'<.*?>' , '' , data)
    
    # Removing special char and --> number are depends upon data
    spcl_chr = re.sub(r'[^A-z ]','',html_tag)
    
    # handling newline char(\n)
    newline_tag = re.sub(r'\n',' ',spcl_chr)
    
    # Converting uniform case --> preferble (lower)
    sentns = newline_tag.lower()
    
    # spliting the data
    tokens = sentns.split()
    
    # Removing stopwords
    clean_tokens = [t for t in tokens if t not in stopwords.words('english')]
    
    # stemming or lemmatize
    if stem:
        doc_list = [SnowballStemmer('english').stem(i) for i in clean_tokens]
    else:
        doc_list = [WordNetLemmatizer().lemmatize(i) for i in clean_tokens]
    
    return ' '.join(doc_list)

In [88]:
df['Text_cleaned_lemmatizer'] = df['docs'].apply(cleaning_text)

In [89]:
df

Unnamed: 0,docs,Text_cleaned_lemmatizer
0,<b>Betty Botter bought a bit of butter</b>.\nT...,betty botter bought bit butterthe butter betty...
1,"1234567890 Whether the weather be fine,\nOr wh...",whether weather fineor whether weather notwhet...
2,"To sit in solemn silence in a dull, dark dock,...",sit solemn silence dull dark dockin pestilenti...
3,A tree toad loved a she-toad\nWho lived up in ...,tree toad loved shetoadwho lived treehe twotoe...
4,"Yellow butter, purple jelly, red jam, black br...",yellow butter purple jelly red jam black bread...


In [90]:
df['Text_cleaned_stemmer'] = df['docs'].apply(cleaning_text,stem = True)

In [91]:
df

Unnamed: 0,docs,Text_cleaned_lemmatizer,Text_cleaned_stemmer
0,<b>Betty Botter bought a bit of butter</b>.\nT...,betty botter bought bit butterthe butter betty...,betti botter bought bit butterth butter betti ...
1,"1234567890 Whether the weather be fine,\nOr wh...",whether weather fineor whether weather notwhet...,whether weather fineor whether weather notwhet...
2,"To sit in solemn silence in a dull, dark dock,...",sit solemn silence dull dark dockin pestilenti...,sit solemn silenc dull dark dockin pestilenti ...
3,A tree toad loved a she-toad\nWho lived up in ...,tree toad loved shetoadwho lived treehe twotoe...,tree toad love shetoadwho live treeh twoto tre...
4,"Yellow butter, purple jelly, red jam, black br...",yellow butter purple jelly red jam black bread...,yellow butter purpl jelli red jam black breads...
