- TEXT PTREPROCESSING = "Text preprocessing involves transforming text into a clean and consistent format that can then be fed into a model for further analysis and learning"

In [1]:
import numpy as np
import pandas as pd

In [2]:
doc1 = '''Betty Botter bought some butter
But she said the butter’s bitter
If I put it in my batter, it will make my batter bitter
But a bit of better butter will make my batter better
So ‘twas better Betty Botter bought a bit of better butter'''
''''''
doc2= '''Susie works in a shoeshine shop. Where she shines she sits, and where she sits she shines.'''

doc3 = '''I have got a date at a quarter to eight; I’ll see you at the gate, so don’t be late.'''
doc4 = '''You know New York, you need New York, you know you need unique New York.'''

In [3]:
dfdata = pd.DataFrame({"docs" : [doc1, doc2, doc3, doc4]})

In [4]:
dfdata

Unnamed: 0,docs
0,Betty Botter bought some butter\nBut she said ...
1,Susie works in a shoeshine shop. Where she shi...
2,I have got a date at a quarter to eight; I’ll ...
3,"You know New York, you need New York, you know..."


In [5]:
dfdata.copy()

Unnamed: 0,docs
0,Betty Botter bought some butter\nBut she said ...
1,Susie works in a shoeshine shop. Where she shi...
2,I have got a date at a quarter to eight; I’ll ...
3,"You know New York, you need New York, you know..."


In [6]:
dfdata["docs"].str.lower()

0    betty botter bought some butter\nbut she said ...
1    susie works in a shoeshine shop. where she shi...
2    i have got a date at a quarter to eight; i’ll ...
3    you know new york, you need new york, you know...
Name: docs, dtype: object

In [7]:
dfdata["docs"].str.upper()

0    BETTY BOTTER BOUGHT SOME BUTTER\nBUT SHE SAID ...
1    SUSIE WORKS IN A SHOESHINE SHOP. WHERE SHE SHI...
2    I HAVE GOT A DATE AT A QUARTER TO EIGHT; I’LL ...
3    YOU KNOW NEW YORK, YOU NEED NEW YORK, YOU KNOW...
Name: docs, dtype: object

In [8]:
dfdata["docs"].str.lower()

0    betty botter bought some butter\nbut she said ...
1    susie works in a shoeshine shop. where she shi...
2    i have got a date at a quarter to eight; i’ll ...
3    you know new york, you need new york, you know...
Name: docs, dtype: object

In [9]:
dfdata["docs"].str.title()

0    Betty Botter Bought Some Butter\nBut She Said ...
1    Susie Works In A Shoeshine Shop. Where She Shi...
2    I Have Got A Date At A Quarter To Eight; I’Ll ...
3    You Know New York, You Need New York, You Know...
Name: docs, dtype: object

In [10]:
dfdata["docs"].str.capitalize()

0    Betty botter bought some butter\nbut she said ...
1    Susie works in a shoeshine shop. where she shi...
2    I have got a date at a quarter to eight; i’ll ...
3    You know new york, you need new york, you know...
Name: docs, dtype: object

In [11]:
dfdata["docs"][3].lower()

'you know new york, you need new york, you know you need unique new york.'

In [12]:
dfdata["docs"][2].upper()

'I HAVE GOT A DATE AT A QUARTER TO EIGHT; I’LL SEE YOU AT THE GATE, SO DON’T BE LATE.'

## CONVERTING TO UNIFORM CASE

In [13]:
def upperconversion_case(x):
    return x.str.upper()

In [14]:
dfdata["docs"]= upperconversion_case(dfdata["docs"])

In [15]:
dfdata

Unnamed: 0,docs
0,BETTY BOTTER BOUGHT SOME BUTTER\nBUT SHE SAID ...
1,SUSIE WORKS IN A SHOESHINE SHOP. WHERE SHE SHI...
2,I HAVE GOT A DATE AT A QUARTER TO EIGHT; I’LL ...
3,"YOU KNOW NEW YORK, YOU NEED NEW YORK, YOU KNOW..."


In [16]:
def lowerconversion_case(x):
    return x.str.lower()

In [17]:
dfdata["docs"]= lowerconversion_case(dfdata["docs"])

In [18]:
dfdata

Unnamed: 0,docs
0,betty botter bought some butter\nbut she said ...
1,susie works in a shoeshine shop. where she shi...
2,i have got a date at a quarter to eight; i’ll ...
3,"you know new york, you need new york, you know..."


In [19]:
def titleconversion_case(x):
    return x.str.title()

In [20]:
dfdata["docs"] = titleconversion_case(dfdata["docs"])

In [21]:
dfdata

Unnamed: 0,docs
0,Betty Botter Bought Some Butter\nBut She Said ...
1,Susie Works In A Shoeshine Shop. Where She Shi...
2,I Have Got A Date At A Quarter To Eight; I’Ll ...
3,"You Know New York, You Need New York, You Know..."


In [22]:
def capitalizeconversion_case(x):
    return x.str.capitalize()

In [23]:
dfdata["docs"]= capitalizeconversion_case(dfdata["docs"])

In [24]:
dfdata

Unnamed: 0,docs
0,Betty botter bought some butter\nbut she said ...
1,Susie works in a shoeshine shop. where she shi...
2,I have got a date at a quarter to eight; i’ll ...
3,"You know new york, you need new york, you know..."


## HANDLING HTML TAGS

In [25]:
import regex as re 

In [26]:
x = "<b>HELLO WORLD</b>"

In [27]:
re.sub(r"<.*?>", " ", x)

' HELLO WORLD '

In [28]:
y = "<b>WELCOME TO DATA SCIENCE REALM</b>"

In [29]:
re.sub(r"<.*?>", " ", y)

' WELCOME TO DATA SCIENCE REALM '

In [30]:
def remove_tags(x):
    return re.sub(r"<.*?>", " ", x)

In [31]:
dfdata["docs"] = dfdata["docs"].apply(remove_tags)

In [32]:
dfdata

Unnamed: 0,docs
0,Betty botter bought some butter\nbut she said ...
1,Susie works in a shoeshine shop. where she shi...
2,I have got a date at a quarter to eight; i’ll ...
3,"You know new york, you need new york, you know..."


In [33]:
z = "Learn data science course well . http://wwww.usedu.com"

In [34]:
dfdata["docs"][2][0:10:1]

'I have got'

In [35]:
dfdata["docs"][3][1:25:1]

'ou know new york, you ne'

In [36]:
dfdata

Unnamed: 0,docs
0,Betty botter bought some butter\nbut she said ...
1,Susie works in a shoeshine shop. where she shi...
2,I have got a date at a quarter to eight; i’ll ...
3,"You know new york, you need new york, you know..."


In [37]:
dfdata["docs"][0]

'Betty botter bought some butter\nbut she said the butter’s bitter\nif i put it in my batter, it will make my batter bitter\nbut a bit of better butter will make my batter better\nso ‘twas better betty botter bought a bit of better butter'

In [38]:
def handlingnewline_char(x):
    return re.sub(r'\n', " ",x)

In [39]:
dfdata["docs"] = dfdata["docs"].apply(handlingnewline_char)

In [40]:
dfdata["docs"][0]

'Betty botter bought some butter but she said the butter’s bitter if i put it in my batter, it will make my batter bitter but a bit of better butter will make my batter better so ‘twas better betty botter bought a bit of better butter'

In [41]:
def handlingnewline_char(x):
    return re.sub(r'\n', ' ', x)

In [42]:
dfdata["docs"] = dfdata["docs"].apply(handlingnewline_char)

In [43]:
dfdata["docs"][0]

'Betty botter bought some butter but she said the butter’s bitter if i put it in my batter, it will make my batter bitter but a bit of better butter will make my batter better so ‘twas better betty botter bought a bit of better butter'

## HANDLING SPECIAL CHARACTER AND NUMBERS

In [44]:
U = "9372@HEY BUDDY_**9.77ILU"

In [45]:
re.sub(r'^[a-zA-z]', ' ', U)

'9372@HEY BUDDY_**9.77ILU'

In [46]:
re.sub(r'^[a-zA-Z0-9]', ' ', U)

' 372@HEY BUDDY_**9.77ILU'

In [47]:
re.sub(r"\b[a-zA-Z-0-9]+\b"," ", U)


' @  BUDDY_** . '

In [48]:
re.sub(r"\b[a-zA-Z]+\b"," ", U)

'9372@  BUDDY_**9.77ILU'

In [49]:
re.findall(r'\W+', U)

['@', ' ', '**', '.']

In [50]:
re.sub(r'[@**?]',' ', U)

'9372 HEY BUDDY_  9.77ILU'

In [51]:
re.findall(r'\W+\d+', U)

['**9', '.77']

In [52]:
re.sub(r'[@_**?]',' ', U)

'9372 HEY BUDDY   9.77ILU'

In [53]:
re.sub(r'[@_**\.?]',' ', U)

'9372 HEY BUDDY   9 77ILU'

In [54]:
re.sub(r'[@_**\d+\.?]',' ', U)

'     HEY BUDDY       ILU'

In [55]:
re.findall(r'\d+', U)

['9372', '9', '77']

In [56]:
re.findall(r"\d@\w+\s\w+",U)

['2@HEY BUDDY_']

In [57]:
re.findall(r'\b[^@_?]\w+\s\w+', U)

['HEY BUDDY_']

## REMOVING URLS

In [58]:
text1 = "check out this american univeristy link https://www.princeton.edu/admission-aid/international-students"

In [59]:
text2 = "check out this american univeristy link  https://www.princeton.edu/admission-aid/international-students"

In [60]:
def remove_url(text):
    pattern = re.compile(r'https?://S+|www\.\S+')
    return pattern.sub(r'' , text)

In [61]:
remove_url(text1)

'check out this american univeristy link https://'

In [62]:
def remove_url(text):
    pattern = re.compile(r'\b(https?://)\S+|www\.\S+\w')
    return pattern.sub(r'', text)

In [63]:
remove_url(text2)

'check out this american univeristy link  '

In [64]:
remove_url(text1)

'check out this american univeristy link '

## CHAT WORDS

In [65]:
chat_words = {
    "AFAIK" : "AS FAR AS I KNOW",
    "ASAP": "AS SSON AS POSSIBLE",
    "ATK" : "AT THE KEYBOARD",
    "ATM" : "AT THE MOMENT",
    "BAK": "BACK AT KEYBOARD",
    "BBL" :"BE BACK LATER",
    "BRT" : "BE RIGHT THERE",
    "BRB" : "BE RIGHT BACK",
    "BFN" : "BYE FOR NOW",
    "BBS" : "BE BACK SOON",
    "TTYL" : "TALK TO YOU LATER",
    "BTW" : "BY THE WAY", 
    "SUL" : "SEE YOU LATER",
    "FAQ" : "FREQUNTLY ASKED QUENTIONS",
    "FYI": "FOR YOUR INFORMATION",
    "GAL": "GET A LIFE"
}

In [66]:
chat_words

{'AFAIK': 'AS FAR AS I KNOW',
 'ASAP': 'AS SSON AS POSSIBLE',
 'ATK': 'AT THE KEYBOARD',
 'ATM': 'AT THE MOMENT',
 'BAK': 'BACK AT KEYBOARD',
 'BBL': 'BE BACK LATER',
 'BRT': 'BE RIGHT THERE',
 'BRB': 'BE RIGHT BACK',
 'BFN': 'BYE FOR NOW',
 'BBS': 'BE BACK SOON',
 'TTYL': 'TALK TO YOU LATER',
 'BTW': 'BY THE WAY',
 'SUL': 'SEE YOU LATER',
 'FAQ': 'FREQUNTLY ASKED QUENTIONS',
 'FYI': 'FOR YOUR INFORMATION',
 'GAL': 'GET A LIFE'}

In [67]:
def chat_conversation(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
            
    return " ".join(new_text)  
    

In [68]:
chat_conversation("FYI NEW YORK IS THE BEST CITY IN THE WORLD")

'FOR YOUR INFORMATION NEW YORK IS THE BEST CITY IN THE WORLD'

In [69]:
chat_conversation("BFN I'LL TALK TO YOU TOMORROW")

"BYE FOR NOW I'LL TALK TO YOU TOMORROW"

In [70]:
def chat_conversion(text):
    New_text = []
    for i in text.split():
        if i.upper() in chat_words:
            New_text.append(chat_words[i.upper()])
        else:
            New_text.append(i)

    return " ".join(New_text)        

In [71]:
chat_conversion("wait for me i'll BBS")

"wait for me i'll BE BACK SOON"

In [72]:
chat_conversion("TTYL")

'TALK TO YOU LATER'

In [73]:
dfdata["docs"] = dfdata["docs"].apply(chat_conversion)

In [74]:
dfdata["docs"][2]

'I have got a date at a quarter to eight; i’ll see you at the gate, so don’t be late.'

## TOKENIZATION

In [75]:
# TOKENIZATION IS THE PROCESS OF BREAKING OF TEXT INTO SMALLER PARTS, SMALLER PARTS MIGHT BE WORDS, SENTENCES

- USING SPLIT FUNCTION

In [76]:
sent = "I AM GOING TO AMERICA FOR JOB AND MASTER'S"

In [159]:
sent.split()

['I', 'AM', 'GOING', 'TO', 'AMERICA', 'FOR', 'JOB', 'AND', "MASTER'S"]

- SENTENCE TOKENIZATION

In [160]:
sent2 = "I AM GOING TO AMERICA FOR JOB AND MASTER'S. I WILL BE THERE FOR 5 YEARS. LETS\'S HOPE THE TRIP TO BE GREAT"

In [161]:
sent2.split(".")

["I AM GOING TO AMERICA FOR JOB AND MASTER'S",
 ' I WILL BE THERE FOR 5 YEARS',
 " LETS'S HOPE THE TRIP TO BE GREAT"]

In [162]:
sent3 = "AMERICA IS VERY BEAUTIFULL COUNTRY. IT IS MOST POWERFULL COUNTRY IN THE WORLD. NEW YORK CITY IS THE MOST BEAUTIFULL AND EXPENSIVE CITY TO LIVE IN THE WHOLE WORLD"

In [163]:
sent3.split(".")

['AMERICA IS VERY BEAUTIFULL COUNTRY',
 ' IT IS MOST POWERFULL COUNTRY IN THE WORLD',
 ' NEW YORK CITY IS THE MOST BEAUTIFULL AND EXPENSIVE CITY TO LIVE IN THE WHOLE WORLD']

In [164]:
# PROBLEMS WITH SPLIT FUNCTION
sent4= "I AM GOING TO USA"

In [165]:
sent4.split()

['I', 'AM', 'GOING', 'TO', 'USA']

In [166]:
sent5= "WHICH CITY DO YOU THINK SHOULD I VISIT IN USA?, I HAVE 3 DAYS HOLIDAY"

In [167]:
sent5.split(".")

['WHICH CITY DO YOU THINK SHOULD I VISIT IN USA?, I HAVE 3 DAYS HOLIDAY']

- REGULAR EXPRESSION


In [168]:
S = "I AM GOING TO USA FOR JOB AND MASTER'S!"

In [169]:
token = re.findall(r'[\w]+', S)

In [170]:
token

['I', 'AM', 'GOING', 'TO', 'USA', 'FOR', 'JOB', 'AND', 'MASTER', 'S']

In [171]:
s1 = " I AM GOING INDIA FOR SUMMER HOLYDAYS!"

In [172]:
token = re.findall(r'[\w+]+', s1)

In [173]:
token

['I', 'AM', 'GOING', 'INDIA', 'FOR', 'SUMMER', 'HOLYDAYS']

## NLTK

In [174]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [175]:
U = "I AM GOING TO VISIT LAS VEGAS TO SEE MY COUSIN BROTHER"

In [176]:
word_tokenize(U)

['I',
 'AM',
 'GOING',
 'TO',
 'VISIT',
 'LAS',
 'VEGAS',
 'TO',
 'SEE',
 'MY',
 'COUSIN',
 'BROTHER']

In [177]:
U1 = "I AM GOING TO WHITNEY POINT TO SEE MY FRIEND NIKHIL AND GONNA STAY AT HIS HOUSE TILL NIGHT "

In [178]:
word_tokenize(U1)

['I',
 'AM',
 'GOING',
 'TO',
 'WHITNEY',
 'POINT',
 'TO',
 'SEE',
 'MY',
 'FRIEND',
 'NIKHIL',
 'AND',
 'GON',
 'NA',
 'STAY',
 'AT',
 'HIS',
 'HOUSE',
 'TILL',
 'NIGHT']

In [179]:
U2 = "I WANT TO VISIT CHANDRAPUR TO MEET MY MOM AND A 220 KM RIDE COSTS $10"

In [180]:
word_tokenize(U2)

['I',
 'WANT',
 'TO',
 'VISIT',
 'CHANDRAPUR',
 'TO',
 'MEET',
 'MY',
 'MOM',
 'AND',
 'A',
 '220',
 'KM',
 'RIDE',
 'COSTS',
 '$',
 '10']

## STEMMING

- "the process of reducing a word to its word stem that affixes to suffixes and prefixes or the roots"
 
- Stemming is a technique used to reduce an inflected word down to its word stem. For example, the words “programming,” “programmer,” and “programs” can all be reduced down to the common word stem “program

In [181]:
from nltk.stem.porter import PorterStemmer

In [182]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

In [183]:
#or 

In [184]:
ps = PorterStemmer() 

In [185]:
stemmer.stem("programming")

'program'

In [186]:
stemmer.stem("playing or dancing")

'playing or danc'

In [187]:
def stem(text):
    new_text= []
    for w in text.split():
        new_text.append(stemmer.stem())
   
    return " ".join(new_text)    

In [188]:
stemmer.stem("programming")

'program'

In [189]:
stemmer.stem("walked")

'walk'

In [190]:
def stem_word(text):
    New_text = []
    for u in text.split():
        New_text.append(stemmer.stem())
    return " ".join(New_text)  

In [191]:
dfdata["docs"]= dfdata["docs"].apply(stemmer.stem)

In [192]:
dfdata["docs"][0]

"betty botter bought some butter but she said the butter's bitter if i put it in my batter, it will make my batter bitter but a bit of better butter will make my batter better so 'twas better betty botter bought a bit of better butt"

In [193]:
# OR

In [194]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [195]:
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [196]:
sample = "walks walked walking"

In [197]:
stem_words(sample)

'walk walk walk'

In [198]:
UK = "I LIKE TO PLAY CRICKET AND LIKE TO DRAW SKETCHES"

In [199]:
stem_words(UK)

'i like to play cricket and like to draw sketch'

In [200]:
UTK = "What is forecasting in Python, the Time series forecasting is a method in the statistics field to analyze historical data with a time component and create a prediction based on it. Some classic examples of time series forecasting methods are Moving Average, ARIMA, and Exponential Smoothing"

In [201]:
stem_words(UTK)

'what is forecast in python, the time seri forecast is a method in the statist field to analyz histor data with a time compon and creat a predict base on it. some classic exampl of time seri forecast method are move average, arima, and exponenti smooth'

## LEMMATIZATION

- "Lemmatization is a text pre-processing technique used in natural language processing (NLP) models to break a word down to its root meaning to identify similarities."

- NLTK (Natural Language Toolkit) is a Python library used for natural language processing. One of its modules is the WordNet Lemmatizer, which can be used to perform lemmatization on words.

- Lemmatization is the process of reducing a word to its base or dictionary form, known as the lemma.

In [202]:
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()

In [203]:
sentence = "He was running and eating at a same time. He has bad habit of swimming after eating and plaing long hours in the sun."

# TEXT VECTORIZATION OR TEXT REPRESENTATION OR FEATURE EXTRACTION

- " Vectorization or word embedding is the process of converting text data to numerical vectors".
- Word Embeddings or Word vectorization is a methodology in NLP to map words or phrases from vocabulary to a corresponding vector of real numbers which used to find word predictions, word similarities/semanticsmm.


- CORPUS :
- what is meant by corpus : the cobination of all words in a dataset is called as corpus,
- A corpus is a collection of documents
- A corpus is a collection of authentic text or audio organized into datasets.

- VOCABULARY :
- The set of unique words used in the text corpus is referred to as the vocabulary.
- A corpus is made up of all the unique words. and If you remove all the unique words from a corpus, you will have a vocabulary or it will be a vocabulary."

- DOCUMENTS:  If we have 100 sentences, each sentence is a document. Mathematical Representation of Documents is Vector.
- 


- What is the difference between a document and a corpus in NLP?: 
- Corpus is a collection of Documents. It is a unique text different from the corpus. If we have 100 sentences, each sentence is a document. Mathematical Representation of Documents is Vecto

In [204]:
# count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [205]:
cv = CountVectorizer()

In [206]:
BOW =cv.fit_transform(dfdata["docs"])

In [207]:
#VOCABULARY
cv.vocabulary_

{'betty': 5,
 'botter': 8,
 'bought': 9,
 'some': 40,
 'butter': 12,
 'but': 10,
 'she': 34,
 'said': 32,
 'the': 42,
 'bitter': 7,
 'if': 19,
 'put': 30,
 'it': 21,
 'in': 20,
 'my': 26,
 'batter': 2,
 'will': 47,
 'make': 25,
 'bit': 6,
 'of': 29,
 'better': 4,
 'so': 39,
 'twas': 44,
 'butt': 11,
 'susie': 41,
 'works': 48,
 'shoeshine': 36,
 'shop': 37,
 'where': 46,
 'shines': 35,
 'sits': 38,
 'and': 0,
 'have': 18,
 'got': 17,
 'date': 13,
 'at': 1,
 'quarter': 31,
 'to': 43,
 'eight': 15,
 'll': 24,
 'see': 33,
 'you': 50,
 'gate': 16,
 'don': 14,
 'be': 3,
 'late': 23,
 'know': 22,
 'new': 28,
 'york': 49,
 'need': 27,
 'unique': 45}

In [208]:
BOW[1].toarray()

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 1, 1, 2, 0, 0, 1, 0, 0,
        0, 0, 2, 0, 1, 0, 0]], dtype=int64)

In [209]:
BOW[2].toarray()

array([[0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [210]:
BOW[3].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 3, 4]], dtype=int64)

In [211]:
BOW[0].toarray()

array([[0, 0, 3, 0, 4, 2, 2, 2, 2, 2, 2, 1, 3, 0, 0, 0, 0, 0, 0, 1, 1, 2,
        0, 0, 0, 2, 3, 0, 0, 2, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 2, 0, 0, 0]], dtype=int64)

- BAG OF WORDS : "Bag of words is a Natural Language Processing technique of text modelling. In technical terms, we can say that it is a method of feature extraction with text data. This approach is a simple and flexible way of extracting features from documents."

- BI-GRAM : "A bigram language model is a type of statistical language model that predicts the probability of a word in a sequence based on the previous word."

- N-GRAM : "N-grams are defined as the contiguous sequence of n items that can be extracted from a given sample of text or speech. The items can be letters, words, or base pairs, according to the application. The N-grams typically are collected from a text or speech corpus"

- UNI-GRAMS: " A 1-gram (unigram) is a single word sequence of words like “please” or “ turn”. – A 2-gram (bigram) is a two-word sequence of words like “please turn”, “turn your”, or ”your homework”. – A 3-gram (trigram) is a three-word sequence of words like “please turn your”, or “turn your homework”."

In [212]:
utk = pd.DataFrame({"text": ["I AM READY TO GO TO AMERICA", "THE PLANE IS READY TO TAKE OFF", "THE PLANE IS ABOUT TO LAND AT THE NEW YORK AIRPORT IN JUST A FEW MINUTES "]})

In [213]:
utk

Unnamed: 0,text
0,I AM READY TO GO TO AMERICA
1,THE PLANE IS READY TO TAKE OFF
2,THE PLANE IS ABOUT TO LAND AT THE NEW YORK AIR...


In [214]:
cv = CountVectorizer()

In [215]:
unigram = cv.fit_transform(utk["text"])

In [216]:
unigram

<3x20 sparse matrix of type '<class 'numpy.int64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [217]:
unigram.toarray()

array([[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0],
       [1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 2, 1, 1]],
      dtype=int64)

In [218]:
cv.vocabulary_

{'am': 2,
 'ready': 15,
 'to': 18,
 'go': 6,
 'america': 3,
 'the': 17,
 'plane': 14,
 'is': 8,
 'take': 16,
 'off': 13,
 'about': 0,
 'land': 10,
 'at': 4,
 'new': 12,
 'york': 19,
 'airport': 1,
 'in': 7,
 'just': 9,
 'few': 5,
 'minutes': 11}

In [219]:
print(unigram)

  (0, 2)	1
  (0, 15)	1
  (0, 18)	2
  (0, 6)	1
  (0, 3)	1
  (1, 15)	1
  (1, 18)	1
  (1, 17)	1
  (1, 14)	1
  (1, 8)	1
  (1, 16)	1
  (1, 13)	1
  (2, 18)	1
  (2, 17)	2
  (2, 14)	1
  (2, 8)	1
  (2, 0)	1
  (2, 10)	1
  (2, 4)	1
  (2, 12)	1
  (2, 19)	1
  (2, 1)	1
  (2, 7)	1
  (2, 9)	1
  (2, 5)	1
  (2, 11)	1


In [220]:
cv = CountVectorizer(ngram_range=(2,2))

In [221]:
bigram=cv.fit_transform(utk["text"])

In [222]:
bigram

<3x22 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [223]:
bigram.toarray()

array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0],
       [1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1]],
      dtype=int64)

In [224]:
cv.vocabulary_

{'am ready': 2,
 'ready to': 13,
 'to go': 18,
 'go to': 5,
 'to america': 17,
 'the plane': 16,
 'plane is': 12,
 'is ready': 8,
 'to take': 20,
 'take off': 14,
 'is about': 7,
 'about to': 0,
 'to land': 19,
 'land at': 10,
 'at the': 3,
 'the new': 15,
 'new york': 11,
 'york airport': 21,
 'airport in': 1,
 'in just': 6,
 'just few': 9,
 'few minutes': 4}

In [225]:
print(bigram)

  (0, 2)	1
  (0, 13)	1
  (0, 18)	1
  (0, 5)	1
  (0, 17)	1
  (1, 13)	1
  (1, 16)	1
  (1, 12)	1
  (1, 8)	1
  (1, 20)	1
  (1, 14)	1
  (2, 16)	1
  (2, 12)	1
  (2, 7)	1
  (2, 0)	1
  (2, 19)	1
  (2, 10)	1
  (2, 3)	1
  (2, 15)	1
  (2, 11)	1
  (2, 21)	1
  (2, 1)	1
  (2, 6)	1
  (2, 9)	1
  (2, 4)	1


In [226]:
cv = CountVectorizer(ngram_range=(3,3))

In [227]:
trigram= cv.fit_transform(utk["text"])

In [228]:
trigram

<3x21 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [229]:
trigram.toarray()

array([[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0],
       [1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1]],
      dtype=int64)

In [230]:
cv.vocabulary_

{'am ready to': 2,
 'ready to go': 13,
 'to go to': 17,
 'go to america': 4,
 'the plane is': 16,
 'plane is ready': 12,
 'is ready to': 7,
 'ready to take': 14,
 'to take off': 19,
 'plane is about': 11,
 'is about to': 6,
 'about to land': 0,
 'to land at': 18,
 'land at the': 9,
 'at the new': 3,
 'the new york': 15,
 'new york airport': 10,
 'york airport in': 20,
 'airport in just': 1,
 'in just few': 5,
 'just few minutes': 8}

In [231]:
print(trigram)

  (0, 2)	1
  (0, 13)	1
  (0, 17)	1
  (0, 4)	1
  (1, 16)	1
  (1, 12)	1
  (1, 7)	1
  (1, 14)	1
  (1, 19)	1
  (2, 16)	1
  (2, 11)	1
  (2, 6)	1
  (2, 0)	1
  (2, 18)	1
  (2, 9)	1
  (2, 3)	1
  (2, 15)	1
  (2, 10)	1
  (2, 20)	1
  (2, 1)	1
  (2, 5)	1
  (2, 8)	1


## TF-IDF VECTORIZER

- "TF-IDF is technique in Natural Language Processing for converting words in Vectors and with some semantic information and it gives weighted to uncommon words"
- "TF-IDF is useful in many natural language processing applications. For example, Search Engines use TF-IDF to rank the relevance of a document for a query. TF-IDF is also employed in text classification, text summarization, and topic modeling."

In [232]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [233]:
tfidfvectorizer = TfidfVectorizer()

In [234]:
utk

Unnamed: 0,text
0,I AM READY TO GO TO AMERICA
1,THE PLANE IS READY TO TAKE OFF
2,THE PLANE IS ABOUT TO LAND AT THE NEW YORK AIR...


In [235]:
tfidfvectorizer.fit_transform(utk["text"]).toarray()

array([[0.        , 0.        , 0.44839402, 0.44839402, 0.        ,
        0.        , 0.44839402, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.34101521, 0.        , 0.        , 0.52965746, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.35221512, 0.        ,
        0.        , 0.        , 0.        , 0.46312056, 0.35221512,
        0.35221512, 0.46312056, 0.35221512, 0.27352646, 0.        ],
       [0.26900365, 0.26900365, 0.        , 0.        , 0.26900365,
        0.26900365, 0.        , 0.26900365, 0.20458421, 0.26900365,
        0.26900365, 0.26900365, 0.26900365, 0.        , 0.20458421,
        0.        , 0.        , 0.40916842, 0.15887789, 0.26900365]])

In [236]:
print(tfidfvectorizer.idf_)


[1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.28768207 1.28768207 1.69314718 1.28768207
 1.         1.69314718]


In [237]:
print(tfidfvectorizer_feature_names_out())

NameError: name 'tfidfvectorizer_feature_names_out' is not defined

## DEEP LEARNING APPROACHES

- "Deep learning is a machine learning technique that teaches computers to do what comes naturally to humans: learn by example. Deep learning is a key technology behind driverless cars, enabling them to recognize a stop sign, or to distinguish a pedestrian from a lamppost."

# WORD2VEC 

- Word2Vec can capture symantic meaning like ex:- Happy and Joy it can recognise that i have same meaning
- Low have dimension so computational can be fast 
- Dense Vectors

- CBOW:- 

In [238]:
from gensim.models import Word2Vec, KeyedVectors

In [241]:
wrd2vecembedding = Word2Vec(list(utk["text"].str.split()), vector_size = 10, min_count= 1)

In [242]:
wrd2vecembedding 

<gensim.models.word2vec.Word2Vec at 0x1f451491510>

In [243]:
type(wrd2vecembedding )

gensim.models.word2vec.Word2Vec

In [244]:
print(dir(wrd2vecembedding ))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_check_corpus_sanity', '_check_training_sanity', '_clear_post_train', '_do_train_epoch', '_do_train_job', '_get_next_alpha', '_get_thread_working_mem', '_job_producer', '_load_specials', '_log_epoch_end', '_log_epoch_progress', '_log_progress', '_log_train_end', '_raw_word_count', '_save_specials', '_scan_vocab', '_smart_save', '_train_epoch', '_train_epoch_corpusfile', '_worker_loop', '_worker_loop_corpusfile', 'add_lifecycle_event', 'add_null_word', 'alpha', 'batch_words', 'build_vocab', 'build_vocab_from_freq', 'cbow_mean', 'comment', 'compute_loss', 'corpus_count', 'corpus_total_words', 'create_binary_tree'

In [245]:
print(dir(wrd2vecembedding.wv))

['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_load_specials', '_log_evaluate_word_analogies', '_save_specials', '_smart_save', '_upconvert_old_d2vkv', '_upconvert_old_vocab', 'add_lifecycle_event', 'add_vector', 'add_vectors', 'allocate_vecattrs', 'closer_than', 'cosine_similarities', 'distance', 'distances', 'doesnt_match', 'evaluate_word_analogies', 'evaluate_word_pairs', 'expandos', 'fill_norms', 'get_index', 'get_mean_vector', 'get_normed_vectors', 'get_vecattr', 'get_vector', 'has_index_for', 'index2entity', 'index2word', 'index_to_key', 'init_sims', 'intersect_word2vec_format', 'key_to_index',

In [247]:
wrd2vecembedding.wv.__getitem__(wrd2vecembedding.wv.index_to_key) # every word's vector representation

array([[-0.00539552,  0.00232393,  0.05100326,  0.0899847 , -0.09300875,
        -0.07111891,  0.06463242,  0.08981007, -0.05018915, -0.03759257],
       [ 0.07381411, -0.01535229, -0.04539502,  0.06555288, -0.04860752,
        -0.01814488,  0.0287871 ,  0.00994161, -0.08287872, -0.0945075 ],
       [ 0.07311766,  0.05070262,  0.06757693,  0.00762866,  0.06350891,
        -0.03405366, -0.00946401,  0.05768573, -0.07521638, -0.03936104],
       [-0.07511376, -0.00934256,  0.09540984, -0.07337879, -0.02327469,
        -0.01933269,  0.08083545, -0.05925278,  0.00047569, -0.04742986],
       [-0.09604025,  0.05006899, -0.08761217, -0.0439361 , -0.0003551 ,
        -0.00295094, -0.07660891,  0.09616972,  0.04980387,  0.09233835],
       [-0.08155638,  0.04493742, -0.04138051,  0.00817817,  0.08500696,
        -0.04459886,  0.04518904, -0.06782202, -0.03547909,  0.09402988],
       [-0.01577862,  0.00308955, -0.04150177, -0.07691965, -0.01502094,
         0.02482391, -0.00877448,  0.05550457

In [248]:
len(wrd2vecembedding.wv.__getitem__(wrd2vecembedding.wv.index_to_key))

22

In [265]:
print(wrd2vecembedding.wv.key_to_index)

{'TO': 0, 'THE': 1, 'READY': 2, 'PLANE': 3, 'IS': 4, 'MINUTES': 5, 'FEW': 6, 'AM': 7, 'GO': 8, 'AMERICA': 9, 'TAKE': 10, 'OFF': 11, 'ABOUT': 12, 'LAND': 13, 'AT': 14, 'NEW': 15, 'YORK': 16, 'AIRPORT': 17, 'IN': 18, 'JUST': 19, 'A': 20, 'I': 21}


In [266]:
wrd2vecembedding.wv.__getitem__(wrd2vecembedding.wv.index_to_key)[0]

array([-0.00539552,  0.00232393,  0.05100326,  0.0899847 , -0.09300875,
       -0.07111891,  0.06463242,  0.08981007, -0.05018915, -0.03759257],
      dtype=float32)

In [270]:
wrd2vecembedding.wv.__getitem__(wrd2vecembedding.wv.index_to_key)[2]

array([ 0.07311766,  0.05070262,  0.06757693,  0.00762866,  0.06350891,
       -0.03405366, -0.00946401,  0.05768573, -0.07521638, -0.03936104],
      dtype=float32)

In [261]:
print(wrd2vecembedding.wv.index_to_key)

['TO', 'THE', 'READY', 'PLANE', 'IS', 'MINUTES', 'FEW', 'AM', 'GO', 'AMERICA', 'TAKE', 'OFF', 'ABOUT', 'LAND', 'AT', 'NEW', 'YORK', 'AIRPORT', 'IN', 'JUST', 'A', 'I']


In [276]:
len(utk["text"][2].split())

16

In [277]:
uk = list(utk["text"][2].split())

In [278]:
uk

['THE',
 'PLANE',
 'IS',
 'ABOUT',
 'TO',
 'LAND',
 'AT',
 'THE',
 'NEW',
 'YORK',
 'AIRPORT',
 'IN',
 'JUST',
 'A',
 'FEW',
 'MINUTES']

In [279]:
wrd2vecembedding.wv.__getitem__(uk)

array([[ 0.07381411, -0.01535229, -0.04539502,  0.06555288, -0.04860752,
        -0.01814488,  0.0287871 ,  0.00994161, -0.08287872, -0.0945075 ],
       [-0.07511376, -0.00934256,  0.09540984, -0.07337879, -0.02327469,
        -0.01933269,  0.08083545, -0.05925278,  0.00047569, -0.04742986],
       [-0.09604025,  0.05006899, -0.08761217, -0.0439361 , -0.0003551 ,
        -0.00295094, -0.07660891,  0.09616972,  0.04980387,  0.09233835],
       [ 0.02350009, -0.04529419,  0.08384062, -0.09869492,  0.0676968 ,
         0.02924688, -0.04921033,  0.04412061, -0.01740559,  0.06719152],
       [-0.00539552,  0.00232393,  0.05100326,  0.0899847 , -0.09300875,
        -0.07111891,  0.06463242,  0.08981007, -0.05018915, -0.03759257],
       [ 0.0996748 , -0.04366238, -0.00601846, -0.05696345,  0.03852775,
         0.02790027,  0.06895097,  0.06106038,  0.09539854,  0.09276399],
       [ 0.07896214, -0.06992096, -0.09156968, -0.00365155, -0.03097867,
         0.07897475,  0.05942691, -0.01540702

In [281]:
wrd2vecembedding.wv.__getitem__(uk).mean(axis= 0)

array([-0.00314053, -0.01223376, -0.00749892, -0.01451802,  0.00072714,
        0.00852797,  0.0222466 ,  0.00690646, -0.01051649,  0.01745043],
      dtype=float32)