# Text Preprocessing

1. Lowercasing
2. Remove HTML Tags => using regex
3. Remove URLs
4. Remove Punctuation
5. Chat word treatment => change ASAP to (As Soon As Possible)
6. Spelling Correction
7. Remove Stop words
8. Handling Emojis => Replace emoji with meaning
9. Tokenization
10. Stemming => Bringing word to its root word
11. Lemmatization


### 4. Remove Punctuation

In [1]:
import string, time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [2]:
exclude = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))

In [3]:
text = "string. with@ Punctuation!"

text = remove_punc(text)
text

'string with Punctuation'

# 5. Chat word Treatment

In [4]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek You (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That Feeling When",
    "MFW": "My Face When",
    "MRW": "My Reaction When",
    "IFYP": "I Feel Your Pain",
    "TNTL": "Trying Not To Laugh",
    "JK": "Just Kidding",
    "IDC": "I Don’t Care",
    "ILY": "I Love You",
    "IMU": "I Miss You",
    "ADIH": "Another Day In Hell",
    "ZZZ": "Sleeping, Bored, Tired",
    "WYWH": "Wish You Were Here",
    "TIME": "Tears In My Eyes",
    "BAE": "Before Anyone Else",
    "FIMH": "Forever In My Heart",
    "BSAAW": "Big Smile And A Wink",
    "BWL": "Bursting With Laughter",
    "BFF": "Best Friends Forever",
    "CSL": "Can’t Stop Laughing"
}


In [None]:
def chat_conversation(text):
    new_text = []
    for word in text.split():
        if word.upper() in chat_words:
            new_text.append(chat_words[word.upper()])
        else:
            new_text.append(word)
    return " ".join(new_text)

In [6]:
chat_conversation("IMHO Karachi is best")

'In My Honest/Humble Opinion Karachi is best'

# 6. Spelling Correction

In [9]:
from textblob import TextBlob
incorrect = "tooday is veryya wunderfull dai"
textblb = TextBlob(incorrect)

textblb.correct().string

'today is very wonderful day'

## 7. Handling Emojis

In [8]:
import re

text = u'This is a smiley face \U0001f602'
print(text) # with emoji

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

print(deEmojify(text))

This is a smiley face 😂
This is a smiley face 


# 9. Tokenization

## Spacy

In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [11]:
sent1 ="I have a Ph.D in A.I"
sent2 = "We're here to help! mail us at nltk@gmail.com"
sent3 = "A 5km ride cost $10.50"

In [None]:
print(nlp(sent1))
print(nlp(sent2))
print(nlp(sent3))

# 10. Stemming

In [12]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [13]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

# 11. Lemmatization

In [21]:
import nltk
nltk.download('punkt_tab', download_dir='/nltk_temp_data')

[nltk_data] Downloading package punkt_tab to /nltk_temp_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [19]:
import nltk
nltk.data.path.append('/nltk_temp_data')

from nltk.corpus import wordnet
print(wordnet.synsets('happy'))


[Synset('happy.a.01'), Synset('felicitous.s.02'), Synset('glad.s.02'), Synset('happy.s.04')]


In [22]:
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at the same time. He has bad habit of swimming after playing long hours in the Sun"
punctuations = "?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
the                 the                 
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 


# Assignment

1) Create dataset:
       * Fetch movie and genre data from 2 different apis.
       * Conctenate both dataset
2) Perform Text Preprocessing

In [1]:
import requests
import pandas as pd
from pandas import json_normalize
from tqdm import tqdm

In [2]:
movie_df = pd.DataFrame({})
for i in tqdm(range(0, 450), desc="Fetching movies data"):
    url = f"https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page={i+1}"
    data = requests.get(url)
        
    available_data = data.json()
    df = json_normalize(available_data['results'])
    movie_df = pd.concat([movie_df, df])

len(movie_df)

Fetching movies data: 100%|██████████| 450/450 [02:50<00:00,  2.63it/s]


9000

In [3]:
movie_df = movie_df.reset_index()

In [4]:
genre_url = "https://api.themoviedb.org/3/genre/movie/list?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US"
data = requests.get(genre_url)
available_data = data.json()
genre_df = json_normalize(available_data["genres"])
genre_df

Unnamed: 0,id,name
0,28,Action
1,12,Adventure
2,16,Animation
3,35,Comedy
4,80,Crime
5,99,Documentary
6,18,Drama
7,10751,Family
8,14,Fantasy
9,36,History


In [5]:
tqdm.pandas()
movie_df["genre"] = movie_df["genre_ids"].progress_apply(
    lambda value: " ".join([genre_df[genre_df["id"] == i]["name"].values[0] for i in value])
)

100%|██████████| 9000/9000 [00:16<00:00, 531.50it/s]


In [6]:
df = movie_df[["original_title", "overview", "genre"]].copy()

In [7]:
df.head()["overview"][0]

'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.'