##                                                           Data Preprocessing

* Step 1 : Cleaning
* Step 2 : Tokenization
* Step 3 : Stop words removal
* Step 4 : Stemming
* Step 5 : Lemmatization

In [14]:
import warnings
# Install a global "ignore" filter: no warning of any category is shown after this point.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

##### Reading Data

In [2]:
import pandas as pd

# Toy corpus for the walkthrough: a sentence with a URL, an HTML snippet,
# a plain sentence and two lines ending in emoticons.
sample_sentences = [
    "Check out my Comment in this link: https://example.com",
    "<p>Running is fun!</p>",
    "The cats are sitting on the mats.",
    "I'm Fine :>",
    "Felling Worried :(",
]

# Single-column frame; the raw strings stay in 'text'.
data = {'text': sample_sentences}
df = pd.DataFrame(data)
df

Unnamed: 0,text
0,Check out my Comment in this link: https://exa...
1,<p>Running is fun!</p>
2,The cats are sitting on the mats.
3,I'm Fine :>
4,Felling Worried :(


# Step 1 : Cleaning

##### 1.1 Convert to lowercase

In [3]:
# Store a lower-cased copy of 'text' in a new 'clean_text' column; 'text' itself is not modified.
df['clean_text'] = df['text'].str.lower()
df.head()

Unnamed: 0,text,clean_text
0,Check out my Comment in this link: https://exa...,check out my comment in this link: https://exa...
1,<p>Running is fun!</p>,<p>running is fun!</p>
2,The cats are sitting on the mats.,the cats are sitting on the mats.
3,I'm Fine :>,i'm fine :>
4,Felling Worried :(,felling worried :(


##### 1.2 Removing URLs

In [4]:
import re

def clean_text(text):
    """Remove URLs from ``text`` and return the remaining string.

    A URL is taken to be a run of non-whitespace characters that begins, at a
    word boundary, with ``http://``, ``https://`` or ``www.``. Requiring the
    full prefix at a word boundary keeps ordinary words that merely contain
    those letters (for example ``awww``) intact. Matching is case-insensitive,
    so the function does not depend on the text having been lower-cased first.

    Whitespace surrounding a removed URL is left in place.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with every URL replaced by the empty string.
    """
    return re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)

df['clean_text'] = df['clean_text'].apply(clean_text)

df.head()

Unnamed: 0,text,clean_text
0,Check out my Comment in this link: https://exa...,check out my comment in this link:
1,<p>Running is fun!</p>,<p>running is fun!</p>
2,The cats are sitting on the mats.,the cats are sitting on the mats.
3,I'm Fine :>,i'm fine :>
4,Felling Worried :(,felling worried :(


##### 1.3 Removing HTML Tags

In [5]:
from bs4 import BeautifulSoup

def clean_text(text):
    """Return the text of ``text`` with HTML markup stripped.

    The string is parsed by BeautifulSoup with the ``html.parser`` backend and
    the result of ``get_text()`` on the parsed document is returned.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Apply the cleaning function to the 'clean_text' column
df['clean_text'] = df['clean_text'].apply(clean_text)

df

Unnamed: 0,text,clean_text
0,Check out my Comment in this link: https://exa...,check out my comment in this link:
1,<p>Running is fun!</p>,running is fun!
2,The cats are sitting on the mats.,the cats are sitting on the mats.
3,I'm Fine :>,i'm fine :>
4,Felling Worried :(,felling worried :(


##### 1.4 Remove unwanted characters

* Remove punctuation, symbols, etc. to reduce noise in the data

In [6]:
import re

def clean_text(tweet):
    """Drop every character that is not an ASCII letter, a digit or whitespace.

    Punctuation and symbols are deleted outright (not replaced by a space), so
    ``i'm`` becomes ``im``. Whitespace is kept exactly as it was.
    """
    unwanted_chars = re.compile(r'[^A-Za-z0-9\s]')
    return unwanted_chars.sub('', tweet)

df['clean_text'] = df['clean_text'].apply(clean_text)

df

Unnamed: 0,text,clean_text
0,Check out my Comment in this link: https://exa...,check out my comment in this link
1,<p>Running is fun!</p>,running is fun
2,The cats are sitting on the mats.,the cats are sitting on the mats
3,I'm Fine :>,im fine
4,Felling Worried :(,felling worried


# Step 2 : Tokenization

Tokenization splits the text into individual words (tokens).

In [7]:
from nltk.tokenize import word_tokenize

In [8]:
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

df['clean_text'] = df['clean_text'].apply(tokenize)
df

Unnamed: 0,text,clean_text
0,Check out my Comment in this link: https://exa...,"[check, out, my, comment, in, this, link]"
1,<p>Running is fun!</p>,"[running, is, fun]"
2,The cats are sitting on the mats.,"[the, cats, are, sitting, on, the, mats]"
3,I'm Fine :>,"[im, fine]"
4,Felling Worried :(,"[felling, worried]"


# Step 3 : Removal of Stop words

In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Hold NLTK's English stop-word list as a set; the stop-word filter later tests membership with `in`.
stop_words = set(stopwords.words('english'))
print(stop_words)

{'when', 'y', 'we', 'what', "doesn't", 'were', 'had', 'while', 'once', 'do', "she's", 'the', 'they', 'about', 'myself', 'was', 'all', 'our', 'her', 'is', 'be', 'after', "aren't", 'both', "that'll", 'you', "mustn't", 'same', 'did', 'then', 'more', "hasn't", 'their', "couldn't", 'with', "you'll", 'll', 'o', 'mightn', 're', "mightn't", 'a', 'can', 'hers', "didn't", 'but', 'just', 'yourself', 'weren', 'does', 'until', 'shan', 'by', 'why', 'above', 'mustn', 'now', 'm', "wouldn't", 'on', "needn't", 'haven', 'hasn', 'ourselves', "you've", 'am', 'his', 'yours', 'me', 'some', "weren't", 'there', 'shouldn', 'don', 'because', 'before', 'few', 'ain', 'yourselves', 'during', 'to', 'i', 'needn', "you're", 'who', 'nor', 'down', 'for', 'not', 'its', 'from', 'my', 'as', 'which', 'whom', 've', 'she', 'ours', 'through', 'out', 'being', 'into', "you'd", 'up', 'these', 'd', 'most', 'your', 'where', 'couldn', 'over', 'so', "haven't", 'if', 'himself', 'of', 'hadn', 'below', 'them', 'too', "should've", 'doesn

In [10]:
def remove_stop_words(text):
    """Return the tokens of ``text`` that are not in ``stop_words``.

    ``text`` is iterated token by token; the surviving tokens are returned as
    a new list in their original order.
    """
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

df['clean_text'] = df['clean_text'].apply(remove_stop_words)

df

Unnamed: 0,text,clean_text
0,Check out my Comment in this link: https://exa...,"[check, comment, link]"
1,<p>Running is fun!</p>,"[running, fun]"
2,The cats are sitting on the mats.,"[cats, sitting, mats]"
3,I'm Fine :>,"[im, fine]"
4,Felling Worried :(,"[felling, worried]"


# Step 4 : Stemming

It reduces each word to its stem, e.g. running becomes run (a stem is not always a dictionary word: worried becomes worri)

In [11]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stemming(tokens):
    """Return a list holding the stem of each token, in the same order.

    Each token is passed to ``stemmer.stem``.
    """
    return list(map(stemmer.stem, tokens))

df['Stemmed_text'] = df['clean_text'].apply(stemming)
    
df  

Unnamed: 0,text,clean_text,Stemmed_text
0,Check out my Comment in this link: https://exa...,"[check, comment, link]","[check, comment, link]"
1,<p>Running is fun!</p>,"[running, fun]","[run, fun]"
2,The cats are sitting on the mats.,"[cats, sitting, mats]","[cat, sit, mat]"
3,I'm Fine :>,"[im, fine]","[im, fine]"
4,Felling Worried :(,"[felling, worried]","[fell, worri]"


# Step 5 : Lemmatization

In [15]:
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jemim\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [16]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN,"V" : wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize(tokens):
    """Lemmatize ``tokens`` using the part of speech that ``pos_tag`` assigns.

    Each token is tagged, the first character of its tag is looked up in
    ``wordnet_map``, and the token is lemmatized with the resulting WordNet
    part of speech. Returns the lemmas as a list in the original order.
    """
    lemmas = []
    for word, tag in pos_tag(tokens):
        # Only the tag's first letter is used; initials missing from the map fall back to NOUN.
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return lemmas

# Apply lemmatization to the DataFrame
df['lemmatized_text'] = df['clean_text'].apply(lemmatize)

df

Unnamed: 0,text,clean_text,Stemmed_text,lemmatized_text
0,Check out my Comment in this link: https://exa...,"[check, comment, link]","[check, comment, link]","[check, comment, link]"
1,<p>Running is fun!</p>,"[running, fun]","[run, fun]","[run, fun]"
2,The cats are sitting on the mats.,"[cats, sitting, mats]","[cat, sit, mat]","[cat, sit, mat]"
3,I'm Fine :>,"[im, fine]","[im, fine]","[im, fine]"
4,Felling Worried :(,"[felling, worried]","[fell, worri]","[fell, worry]"
