# 1. Text Preprocessing 

### --> Lower Casing

In [1]:
import pandas as pd 

# Location of the IMDB reviews CSV.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# on the machine that runs the notebook.
imdb_csv_path = '/Users/mukund/Documents/GitHub/NLP/NLP-Begining-to-End/Dataset/IMDB Dataset.csv'

# Read the dataset and lower-case the review text in one chained expression.
df = pd.read_csv(imdb_csv_path).assign(
    review=lambda frame: frame['review'].str.lower()
)

# Preview the first rows.
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### --> Removing unwanted tags (HTML)

In [2]:
import re

def remove_html_tags(text):
    """Strip HTML-style tags from ``text``.

    Every shortest ``<...>`` span on a single line (for example ``<br />``)
    is deleted; the text between tags is returned unchanged.
    """
    return re.sub('<.*?>', '', text)

# Clean every review, then write the cleaned column back into the frame.
reviews_without_html = df['review'].apply(remove_html_tags)
df['review'] = reviews_without_html
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### --> Removing URLs

In [5]:
def remove_urls(text):
    """Remove URLs from ``text``.

    Two forms are deleted, each running up to the next whitespace character:
    links that start with ``http://`` or ``https://``, and scheme-less links
    that start with ``www.``.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with every matched URL replaced by the empty string.
    """
    # `www\.` matches the literal prefix "www." of scheme-less links.
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Strip URLs from every review and store the result back in the same column.
reviews_without_urls = df['review'].apply(remove_urls)
df['review'] = reviews_without_urls
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### --> Removing Punctuation

In [6]:
import string 
import time


# Slower method: inspect the text one character at a time.
to_remove = string.punctuation


def remove_punctuations(text):
    """Return ``text`` with every character found in ``to_remove`` dropped."""
    kept_chars = [char for char in text if char not in to_remove]
    return ''.join(kept_chars)

# Faster method: a single translate() pass over the text.
def remove_punctuations_fast(text):
    """Return ``text`` with every character listed in ``to_remove`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, to_remove))
    return text.translate(deletion_table)

# Alternative (slower) implementation, kept for comparison:
# df['review'] = df['review'].apply(remove_punctuations)
reviews_without_punctuation = df['review'].apply(remove_punctuations_fast)
df['review'] = reviews_without_punctuation
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### --> Spelling Correction

In [7]:
from textblob import TextBlob


def _correct_spelling(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain ``str``."""
    return str(TextBlob(text).correct())


# NOTE(review): the corrector runs once per review — presumably slow on the
# full dataset; consider timing it on a small sample first.
df['review'] = df['review'].apply(_correct_spelling)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


### --> Removing Stop Words (A, the, of, are, my, etc.)

In [8]:
from nltk.corpus import stopwords

# Load the English stop-word list once and keep it as a set, so the corpus is
# not re-read for every word of every review and membership tests are cheap.
english_stop_words = frozenset(stopwords.words('english'))


def remove_stowords(text):
    """Remove English stop words from ``text``.

    The text is split on whitespace, every token found in NLTK's English
    stop-word list is dropped, and the remaining tokens are joined with single
    spaces. Matching is exact (case-sensitive); this notebook lower-cases the
    reviews before this step.

    Args:
        text: the string to clean.

    Returns:
        str: the remaining words separated by single spaces. The result is an
        empty string when ``text`` is empty or consists only of stop words.
    """
    # Dropping stop words outright (rather than substituting '') avoids runs
    # of extra spaces in the joined result.
    return " ".join(word for word in text.split() if word not in english_stop_words)

# Drop stop words from every review and store the result back in the column.
reviews_without_stopwords = df['review'].apply(remove_stowords)
df['review'] = reviews_without_stopwords
df.head()

### --> Handling emojis - Removing

In [5]:
def remove_emoji(text):
    """Delete emoji and related symbol characters from ``text``.

    Any run of characters that falls inside one of the code-point ranges
    listed below is replaced by the empty string.

    NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    covers non-emoji characters such as CJK ideographs — confirm that is
    acceptable for the data being cleaned.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    # Assemble one character class that matches one or more of those characters.
    character_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(character_class, '', text, flags=re.UNICODE)

# Strip emoji from every review and store the result back in the column.
reviews_without_emoji = df['review'].apply(remove_emoji)
df['review'] = reviews_without_emoji
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### --> Adding meaning to the emojis

In [8]:
import emoji

# Replace each emoji with its textual name via emoji.demojize.
# NOTE(review): if remove_emoji was already applied to this column there may be
# no emoji left to convert — confirm the intended cell order.
reviews_with_emoji_names = df['review'].apply(emoji.demojize)
df['review'] = reviews_with_emoji_names
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# --> Tokenization (the process of breaking a text document into smaller parts, called tokens)

### Tokenization is a common first step for tasks such as text classification
#### e.g. classifying whether a query is Support-related or Sales-related

1. Word Tokenization -> Original - I am an Indian
                        Word Tokenized - ['I', 'am', 'an', 'Indian']
2. Sentence Tokenization -> Original - I am an Indian. I work as a Data Scientist
                        Sentence Tokenized - ['I am an Indian', 'I work as a Data Scientist']

In [5]:
import spacy

# Sanity-check that spaCy's Language class can be imported; report the error
# instead of stopping the cell if it cannot.
try:
    from spacy.language import Language
    print("Language module imported successfully.")
except Exception as e:
    print("Error importing Language module:", e)

# nlp = spacy.load('en_core_web_sm')


# The text column is named 'review' (singular) everywhere else in this
# notebook; indexing df['reviews'] raises KeyError.
reviews = df['review'].values
reviews
# df['review'] = df['review'].apply(nlp)

ValueError: 'in' is not a valid parameter name