<a href="https://colab.research.google.com/github/JamshedAli18/-Natural-Language-Processing-NLP-/blob/main/Text%20Preprocessing/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lowercasing

In [None]:
import pandas as pd

In [None]:
df_lower = pd.DataFrame({
    "review": [
        "The Quick Brown FOX Jumps Over The LAZY Dog While The SKY Turns ORANGE In The EVENING."
    ],
    "sentiment": ["neutral"]
})

In [None]:
df_lower.head()

Unnamed: 0,review,sentiment
0,The Quick Brown FOX Jumps Over The LAZY Dog Wh...,neutral


In [None]:
df_lower['review'] = df_lower['review'].str.lower()

In [None]:
df_lower.head()

Unnamed: 0,review,sentiment
0,the quick brown fox jumps over the lazy dog wh...,neutral


# Remove HTML Tags

In [None]:

text = "<html><head><title>Sample Page</title></head><body><h1>Welcome to My Page</h1><p>This is a <b>bold</b> paragraph with <a href='https://example.com'>a link</a>.</p></body></html>"


In [None]:
text

"<html><head><title>Sample Page</title></head><body><h1>Welcome to My Page</h1><p>This is a <b>bold</b> paragraph with <a href='https://example.com'>a link</a>.</p></body></html>"

In [None]:
import re

def remove_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'', text)

In [None]:
remove_tags(text)

'Sample PageWelcome to My PageThis is a bold paragraph with a link.'

In [None]:
df_html = pd.DataFrame({
    "review": [
        "<html><head><title>Sample Page</title></head><body><h1>Welcome to My Page</h1><p>This is a <b>bold</b> paragraph with <a href='https://example.com'>a link</a>.</p></body></html>"
    ],
    "sentiment": ["neutral"]
})


In [None]:
df_html


Unnamed: 0,review,sentiment
0,<html><head><title>Sample Page</title></head><...,neutral


In [None]:
df_html['review'] = df_html['review'].apply(remove_tags)

In [None]:
df_html

Unnamed: 0,review,sentiment
0,Sample PageWelcome to My PageThis is a bold pa...,neutral


#  Remove URLs

In [None]:
df_url = pd.DataFrame({
    "review": [
        "Visit https://www.learn-nlp.com for tutorials. Also check http://data.com/info and follow us at www.twitter.com/nlp_buddy."
    ],
    "sentiment": ["positive"]
})


In [None]:
df_url

Unnamed: 0,review,sentiment
0,Visit https://www.learn-nlp.com for tutorials....,positive


In [None]:
import re

def remove_urls(text):
  pattern = re.compile(r'https?://\S+|www\.\S+')
  return pattern.sub(r'', text)

In [None]:
url = "Visit https://www.learn-nlp.com for tutorials. Also check http://data.com/info and follow us at www.twitter.com/nlp_buddy."

In [None]:
url

'Visit https://www.learn-nlp.com for tutorials. Also check http://data.com/info and follow us at www.twitter.com/nlp_buddy.'

In [None]:
remove_urls(url)

'Visit  for tutorials. Also check  and follow us at '

In [None]:
df_url['review'] = df_url['review'].apply(remove_urls)

In [None]:
df_url

Unnamed: 0,review,sentiment
0,Visit for tutorials. Also check and follow u...,positive


# Remove Punctuations

In [None]:
df_punct = pd.DataFrame({
    "review": [
        "Hello, world!!! It's been a long, long day; hasn't it? Well... let's get to work."
    ],
    "sentiment": ["positive"]
})


In [None]:
import string

# Remove punctuation from 'review' column
df_punct['review'] = df_punct['review'].apply(
    lambda text: text.translate(str.maketrans('', '', string.punctuation))
)



In [None]:
df_punct

Unnamed: 0,review,sentiment,clean_review
0,Hello world Its been a long long day hasnt it ...,positive,Hello world Its been a long long day hasnt it ...


 # Chat Word Treatment

In [None]:
df_chat = pd.DataFrame({
    "review": [
        "hey bro wht u doin rn? lol tbh idk if imma go out 2nite, but brb gotta eat smthin. gn 😴"
    ],
    "sentiment": ["neutral"]
})


In [None]:
text = "hey bro wht u doin rn? lol tbh idk if imma go out 2nite, but brb gotta eat smthin. gn 😴"

In [None]:
chat_dict = {
    "u": "you",
    "ur": "your",
    "r": "are",
    "lol": "laughing out loud",
    "brb": "be right back",
    "idk": "I don't know",
    "tbh": "to be honest",
    "imma": "I am going to",
    "rn": "right now",
    "wht": "what",
    "tho": "though",
    "smthin": "something",
    "gn": "good night",
    "rn?": "right now"
}


In [None]:
def clean_chat_words(text):
    words = text.split()
    cleaned = [chat_dict.get(word.lower(), word) for word in words]
    return " ".join(cleaned)


In [None]:
text

'hey bro wht u doin rn? lol tbh idk if imma go out 2nite, but brb gotta eat smthin. gn 😴'

In [None]:
clean_chat_words(text)

"hey bro what you doin right now laughing out loud to be honest I don't know if I am going to go out 2nite, but be right back gotta eat smthin. good night 😴"

In [None]:
df_chat['review'] = df_chat['review'].apply(clean_chat_words)

In [None]:
df_chat

Unnamed: 0,review,sentiment
0,hey bro what you doin right now laughing out l...,neutral


# Spelling Correction

In [None]:
text = "HHello Ths textt contanes severl speling erors and inaccuraacies tht ned to be corected manully or using tools."

In [None]:
from textblob import TextBlob

In [None]:
textblob = TextBlob(text)

textblob.correct().string

'hello The text contents several spelling errors and inaccuraacies the ned to be corrected manfully or using tools.'

#  Removing Stop Words

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stop_words = set(stopwords.words('english'))


In [None]:
def remove_stopwords_nltk(text):
    words = text.split()
    filtered = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered)


In [None]:
df_stopwords = pd.DataFrame({
    "review": [
        "It is often said that the best way to learn is by doing, but there are times when reading the documentation helps."
    ],
    "sentiment": ["positive"]
})


In [None]:
df_stopwords['review']

Unnamed: 0,review
0,It is often said that the best way to learn is...


In [None]:
df_stopwords['review'] = df_stopwords['review'].apply(remove_stopwords_nltk)

In [None]:
df_stopwords['review']

Unnamed: 0,review
0,"often said best way learn doing, times reading..."
