In [1]:
import pandas as pd
import numpy as np
import re
import polars
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
import contractions
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Simple EDA

In [2]:
data = {
    "Review_ID": range(1, 21),
    "Review_Text": [
        "The PRODUCT quality is EXCELLENT!!! üòç Bought on 12-01-2024 at 10:30AM",
        "Worst experience EVER!!! üò° totally disappointed... visit http://badreviews.com",
        "The service was OKAY, nothing special #average",
        "I LOVE this product <b>Highly Recommended</b> üëçüëç",
        "Very poor customer support @supportteam, delayed delivery üòû",
        "<p>Average performance</p> could be BETTER!!!",
        "Absolutely FANTASTIC!!! üòçüî• #HappyCustomer",
        "Not worth the money I paid 5000‚Çπ üò†",
        "The app works fine BUT sometimes crashes @devteam",
        "Great VALUE for money üí∞ Fast delivery on 2024-05-10",
        "Terrible packaging!!! damaged product üò°üò°",
        "The product does WHAT it claims <xml>verified</xml>",
        "Amazing experience üòä Will BUY again!!!",
        "Completely USELESS and frustrating üò§üò§",
        "Neither good NOR bad, just okay... ü§∑",
        "Customer service was VERY helpful & polite @helpdesk",
        "Bad quality material!!! broke within 3 days üòû",
        "The design is decent BUT functionality is LIMITED #design",
        "Extremely satisfied with the purchase üòç #Satisfied",
        "Regret buying this üò° Very disappointing!!! OrderID: 12345"
    ],
    "Sentiment": [
        "Positive", "Negative", "Neutral", "Positive", "Negative",
        "Neutral", "Positive", "Negative", "Neutral", "Positive",
        "Negative", "Neutral", "Positive", "Negative", "Neutral",
        "Positive", "Negative", "Neutral", "Positive", "Negative"
    ],
    "Email_ID": [
        "vamshi.kumar@gmail.com",
        "rockstar_99@yahoo.com",
        "data.learner@gmail.com",
        "ml.engineer@outlook.com",
        "support.user@yahoo.com",
        "analytics.pro@gmail.com",
        "happy.customer@gmail.com",
        "budget.buyer@yahoo.com",
        "app.tester@outlook.com",
        "fast.delivery@gmail.com",
        "angry.user@yahoo.com",
        "verified.buyer@outlook.com",
        "repeat.customer@gmail.com",
        "frustrated.user@yahoo.com",
        "neutral.reviewer@gmail.com",
        "polite.customer@outlook.com",
        "quality.issue@yahoo.com",
        "design.reviewer@gmail.com",
        "satisfied.user@outlook.com",
        "disappointed.buyer@yahoo.com"
    ]
}

df = pd.DataFrame(data)
df


Unnamed: 0,Review_ID,Review_Text,Sentiment,Email_ID
0,1,The PRODUCT quality is EXCELLENT!!! üòç Bought o...,Positive,vamshi.kumar@gmail.com
1,2,Worst experience EVER!!! üò° totally disappointe...,Negative,rockstar_99@yahoo.com
2,3,"The service was OKAY, nothing special #average",Neutral,data.learner@gmail.com
3,4,I LOVE this product <b>Highly Recommended</b> üëçüëç,Positive,ml.engineer@outlook.com
4,5,"Very poor customer support @supportteam, delay...",Negative,support.user@yahoo.com
5,6,<p>Average performance</p> could be BETTER!!!,Neutral,analytics.pro@gmail.com
6,7,Absolutely FANTASTIC!!! üòçüî• #HappyCustomer,Positive,happy.customer@gmail.com
7,8,Not worth the money I paid 5000‚Çπ üò†,Negative,budget.buyer@yahoo.com
8,9,The app works fine BUT sometimes crashes @devteam,Neutral,app.tester@outlook.com
9,10,Great VALUE for money üí∞ Fast delivery on 2024-...,Positive,fast.delivery@gmail.com


# Check whether the text is in lower or upper or (lower and upper) 

In [3]:
# check  whether the data is in upper or lower
df['Review_Text'].apply(lambda x: True if x.islower() or x.isupper() else False).sum()

np.int64(0)

In [4]:
# check whether the data is a combination of upper and lower
df['Review_Text'].apply(lambda x: False if x.islower() or x.isupper() else True).sum()

np.int64(20)

# Check whether the text contains any xml or html tags

In [5]:
df['Review_Text'].apply(lambda x:True if re.search(r'<.*?>',x) else False).sum()

np.int64(3)

# Check whether the text contains urls

In [6]:
df['Review_Text'].apply(lambda x:True if re.search(r'http[s]?://\S+',x) else False).sum()

np.int64(1)

# Check whether the text data contains any email ids

In [7]:
df['Email_ID'].apply(lambda x: True if re.search(r'\S+@\S+',x) else False).sum()

np.int64(20)

# Check whether the text data contains any mentions/hastags

In [8]:
df['Review_Text'].apply(lambda x: True if re.search(r'\B[@#]\S+',x) else False).sum()

np.int64(7)

# Check whether the text data contains any emojis

In [9]:
import emoji

In [10]:
text="Absolutely FANTASTIC!!! üòçüî• #HappyCustomer"

In [11]:
emoji.emoji_count(text)

2

In [12]:
emoji.demojize(text)

'Absolutely FANTASTIC!!! :smiling_face_with_heart-eyes::fire: #HappyCustomer'

In [13]:
df['Review_Text'].apply(lambda x: True if emoji.emoji_count(x) else False).sum()

np.int64(14)

# Check whether the text data contains any numbers

In [14]:
df['Review_Text'].apply(lambda x: True if re.search('\d',x) else False).sum()

np.int64(5)

# Check whether the text data contains any punctuations

In [15]:
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
df['Review_Text'].apply(lambda x: True if re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]',x)else False).sum()

np.int64(18)

# Check whether the text data contains any date

In [17]:
df['Review_Text'].apply(lambda x: True if re.findall(r'\d{1,2}-\d{1,2}-\d{4}', x) else False).sum()

np.int64(1)

# Simple Eda function

In [18]:
def simple_eda(data,column):
    import emoji
    lower_upper=data[column].apply(lambda x: False if x.islower() or x.isupper() else True).sum()
    html_xml=data[column].apply(lambda x:True if re.search(r'<.*?>',x) else False).sum()
    urls=data[column].apply(lambda x:True if re.search(r'http[s]?://\S+',x) else False).sum()
    emails=data[column].apply(lambda x: True if re.search(r'\S+@\S+',x) else False).sum()
    mentions_hastags=data[column].apply(lambda x: True if re.search(r'\B[@#]\S+',x) else False).sum()
    emo=data[column].apply(lambda x: True if emoji.emoji_count(x) else False).sum()
    digits=data[column].apply(lambda x: True if re.search('\d',x) else False).sum()
    punctuations=data[column].apply(lambda x: True if re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]',x)else False).sum()
    date=data[column].apply(lambda x: True if re.findall(r'\d{1,2}-\d{1,2}-\d{4}', x) else False).sum()
    missing_values=data[column].isna().sum()
    duplicates=data[column].duplicated().sum()

    if lower_upper>0:
        print(f'The {column} contains the both lower and upper cases')
    if html_xml>0:
        print(f'The {column} contains the xml or html tags')
    if urls>0:
        print(f'The {column} contains urls')
    if emails>0:
        print(f'The {column} contains emails')
    if mentions_hastags>0:
        print(f'The {column} contains mentions and hastags')
    if emo>0:
        print(f'The {column} contains emojis')
    if digits>0:
        print(f'The {column} contains digits')
    if punctuations>0:
        print(f'The {column} contains Punctuations')
    if date>0:
        print(f'The {column} contains date')
    if missing_values>0:
        print(f'The {column} has {missing_values} missing values')
    if duplicates>0:
        print(f'The {column} has {duplicates} duplicate values')
    

In [19]:
data=pd.read_csv(r"C:\Users\masir\Downloads\Machine Learning\NLP\train (2).csv",on_bad_lines='skip',delimiter=';')
data

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,2,FAIL! The Trump Organization‚Äôs Credit Score W...,While the controversy over Trump s personal ta...,0
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0
...,...,...,...,...
24348,24348,Mexico Senate committee OK's air transport dea...,MEXICO CITY (Reuters) - A key committee in Mex...,1
24349,24349,BREAKING: HILLARY CLINTON‚ÄôS STATE DEPARTMENT G...,IF SHE S NOT TOAST NOW THEN WE RE IN BIGGER TR...,0
24350,24350,trump breaks from stump speech to admire beaut...,kremlin nato was created for agression \nruss...,0
24351,24351,NFL PLAYER Delivers Courageous Message: Stop B...,Dallas Cowboys star wide receiver Dez Bryant t...,0


In [None]:
simple_eda(data,'text')

# Data preprocessing

# Convert data into lower case or upper case (based on problem statment)

In [66]:
data=pd.DataFrame({'Reviews':['I LOVE BIRYANI','I Love Biryani','i love biryani']})

In [67]:
data

Unnamed: 0,Reviews
0,I LOVE BIRYANI
1,I Love Biryani
2,i love biryani


In [68]:
data['Reviews'].str.lower()

0    i love biryani
1    i love biryani
2    i love biryani
Name: Reviews, dtype: str

In [69]:
data['Reviews'].str.upper()

0    I LOVE BIRYANI
1    I LOVE BIRYANI
2    I LOVE BIRYANI
Name: Reviews, dtype: str

# remove the tags

In [73]:
data=pd.DataFrame({'Reviews':['<p>I LOVE BIRYANI</p>','<span><p>I Love Biryani</p></span>','<div>i love biryani</div>']})

In [74]:
data

Unnamed: 0,Reviews
0,<p>I LOVE BIRYANI</p>
1,<span><p>I Love Biryani</p></span>
2,<div>i love biryani</div>


In [75]:
data['Reviews'].apply(lambda x: re.sub(r'<.*?>',' ',x))

0       I LOVE BIRYANI 
1      I Love Biryani  
2       i love biryani 
Name: Reviews, dtype: str

# remove any urls

In [79]:
data = pd.DataFrame({
    'Reviews': [
        'I LOVE BIRYANI i have order from https://swiggy',
        'I Love Biryani i have orderd from https://Zomato',
        'i love biryani i have orderd from http://pizza'
    ]
})

data


Unnamed: 0,Reviews
0,I LOVE BIRYANI i have order from https://swiggy
1,I Love Biryani i have orderd from https://Zomato
2,i love biryani i have orderd from http://pizza


In [80]:
data['Reviews'].apply(lambda x:re.sub(r'https?://\S+','',x))

0     I LOVE BIRYANI i have order from 
1    I Love Biryani i have orderd from 
2    i love biryani i have orderd from 
Name: Reviews, dtype: str

In [90]:
data = pd.DataFrame({
    'Reviews': [
        'I LOVE BIRYANI i have order from https:\\swiggy contact me at user1@gmail.com',
        'I Love Biryani i have orderd from https:\\ mail id is foodie@yahoo.com',
        'i love biryani i have orderd from http:\\pizza reach me at test_user@outlook.com'
    ]
})

data


Unnamed: 0,Reviews
0,I LOVE BIRYANI i have order from https:\swiggy...
1,I Love Biryani i have orderd from https:\ mail...
2,i love biryani i have orderd from http:\pizza ...


In [91]:
data['Reviews']=data['Reviews'].apply(lambda x:re.sub(r'\S+@+\S+','',x))

In [92]:
data.loc[0,'Reviews']

'I LOVE BIRYANI i have order from https:\\swiggy contact me at '

# remove mentions and hashtags

In [93]:
data = pd.DataFrame({
    'Reviews': [
        'I love biryani  @swiggy_support #food',
        'I LOVE biryani  @zomato_care #biryani',
        'i love biryani  @pizza_help #hungry'
    ]
})
data

Unnamed: 0,Reviews
0,I love biryani @swiggy_support #food
1,I LOVE biryani @zomato_care #biryani
2,i love biryani @pizza_help #hungry


In [94]:
data['Reviews'].apply(lambda x: re.sub(r'\B[@#]\S+','',x))

0    I love biryani   
1    I LOVE biryani   
2    i love biryani   
Name: Reviews, dtype: str

# convert Images

In [96]:
import emoji

In [95]:

df = pd.DataFrame({
    "Review_Text": [
        "I love this product üòçüî• totally worth it!",
        "Worst experience ever üò°üëé very disappointed",
        "It is okay üôÇ nothing special but works fine",
        "Amazing quality and fast delivery üöÄüòä",
        "Not happy with the service üòûüò§ will not recommend"
    ]
})

df


Unnamed: 0,Review_Text
0,I love this product üòçüî• totally worth it!
1,Worst experience ever üò°üëé very disappointed
2,It is okay üôÇ nothing special but works fine
3,Amazing quality and fast delivery üöÄüòä
4,Not happy with the service üòûüò§ will not recommend


In [101]:
df['Review_Text'].apply(lambda x:emoji.demojize(x,delimiters=('','')))

0    I love this product smiling_face_with_heart-ey...
1    Worst experience ever enraged_facethumbs_down ...
2    It is okay slightly_smiling_face nothing speci...
3    Amazing quality and fast delivery rocketsmilin...
4    Not happy with the service disappointed_facefa...
Name: Review_Text, dtype: str

# Stopwords

In [2]:
from nltk.corpus import stopwords

In [3]:
stopwords.fileids()

['albanian',
 'arabic',
 'azerbaijani',
 'basque',
 'belarusian',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'tamil',
 'turkish',
 'uzbek']

In [4]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [5]:
len(stopwords.words('english'))

198

# Sentence Tokenization & Word 

In [8]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [7]:
text = "I love biryani. It is very tasty! Do you like it?"
sent_tokenize(text)

['I love biryani.', 'It is very tasty!', 'Do you like it?']

In [9]:
text = "I love biryani! It is very tasty üòä"
word_tokenize(text)

['I', 'love', 'biryani', '!', 'It', 'is', 'very', 'tasty', 'üòä']

# Stopwords Removal

In [11]:
data = pd.DataFrame({
    "Review": [
        "I love biryani and it was delicious",
        "The biryani was not very good",
        "I did not like the taste of the food",
        "This is the best biryani that I have eaten",
        "It was an average meal and it was okay"
    ]
})

data1

Unnamed: 0,Review
0,I love biryani and it was delicious
1,The biryani was not very good
2,I did not like the taste of the food
3,This is the best biryani that I have eaten
4,It was an average meal and it was okay


In [19]:
stopwordslist=stopwords.words('english')
stopwordslist.remove('not')

In [23]:
for i,doc in enumerate(data['Review']):
    doc1=[]
    for token in word_tokenize(doc):
        if token.lower() not in stopwordslist:
            doc1.append(token)
    data.loc[i,'Review']=' '.join(doc1)

In [24]:
data

Unnamed: 0,Review
0,love biryani delicious
1,biryani not good
2,not like taste food
3,best biryani eaten
4,average meal okay


# Contractions

In [26]:
import contractions

In [28]:
text = "I can't believe it's not good"
contractions.fix(text)

'I cannot believe it is not good'

In [29]:
df = pd.DataFrame({
    "Review": [
        "I can't believe the food isn't good",
        "I'm happy with the service but it hasn't arrived",
        "They've delivered late and I don't like it",
        "It's okay but I won't order again",
        "We didn't enjoy the meal and it's overpriced"
    ]
})

df

Unnamed: 0,Review
0,I can't believe the food isn't good
1,I'm happy with the service but it hasn't arrived
2,They've delivered late and I don't like it
3,It's okay but I won't order again
4,We didn't enjoy the meal and it's overpriced


In [30]:
df['Review'].apply(lambda x: contractions.fix(x))

0                I cannot believe the food is not good
1    I am happy with the service but it has not arr...
2        They have delivered late and I do not like it
3                It is okay but I will not order again
4       We did not enjoy the meal and it is overpriced
Name: Review, dtype: str

# acronyms of chats

In [31]:
df = pd.DataFrame({
    "Chat_Text": [
        "LOL that biryani was amazing üòÇ",
        "BRB ordering food rn",
        "IDK why the delivery is late",
        "OMG this app is so slow",
        "FYI the order was canceled smh"
    ]
})

df


Unnamed: 0,Chat_Text
0,LOL that biryani was amazing üòÇ
1,BRB ordering food rn
2,IDK why the delivery is late
3,OMG this app is so slow
4,FYI the order was canceled smh


In [32]:
acronyms = {
    "LOL": "laugh out loud",
    "BRB": "be right back",
    "RN": "right now",
    "IDK": "i do not know",
    "OMG": "oh my god",
    "FYI": "for your information",
    "SMH": "shaking my head"
}

In [36]:
for i,doc in enumerate(df['Chat_Text']):
    doc1=[]
    for token in word_tokenize(doc):
        if token in acronyms:
            doc1.append(acronyms[token])
        else:
            doc1.append(token)
    df.loc[i,'Chat_Text']=' '.join(doc1)

In [37]:
df

Unnamed: 0,Chat_Text
0,laugh out loud that biryani was amazing üòÇ
1,be right back ordering food rn
2,i do not know why the delivery is late
3,oh my god this app is so slow
4,for your information the order was canceled smh


# Stemming

In [46]:
data = {
    "Sentence": [
        "The children are playing in the gardens.",
        "She studies harder than her brothers.",
        "They were running towards the buses.",
        "He played better matches last year.",
        "The cats were chasing the mice."
    ]
}

df = pd.DataFrame(data)

print(df)


                                   Sentence
0  The children are playing in the gardens.
1     She studies harder than her brothers.
2      They were running towards the buses.
3       He played better matches last year.
4           The cats were chasing the mice.


# PorterStemmer

In [47]:
from nltk.stem import PorterStemmer

In [48]:
ps=PorterStemmer()

In [49]:
ps.stem('Programmer')

'programm'

In [50]:
ps.stem('running')

'run'

In [51]:
ps.stem('Programming')

'program'

In [52]:
ps.stem('Happily')

'happili'

In [53]:
for i,doc in enumerate(df['Sentence']):
    doc1=[]
    for token in word_tokenize(doc):
        doc1.append(ps.stem(token))
    df.loc[i,'Sentence']=' '.join(doc1)

In [54]:
df

Unnamed: 0,Sentence
0,the children are play in the garden .
1,she studi harder than her brother .
2,they were run toward the buse .
3,he play better match last year .
4,the cat were chase the mice .


# SnowballStemmer

In [11]:
from nltk.stem import SnowballStemmer

In [13]:
ss=SnowballStemmer(language='english')

In [14]:
ss.stem('Programmer')

'programm'

In [15]:
ss.stem('running')

'run'

In [16]:
ss.stem('Programming')

'program'

In [17]:
ss.stem('Happily')

'happili'

In [56]:
data = {
    "Sentence": [
        "The children are playing in the gardens.",
        "She studies harder than her brothers.",
        "They were running towards the buses.",
        "He played better matches last year.",
        "The cats were chasing the mice."
    ]
}

df = pd.DataFrame(data)
print(df)
for i,doc in enumerate(df['Sentence']):
    doc1=[]
    for token in word_tokenize(doc):
        doc1.append(ss.stem(token))
    df.loc[i,'Sentence']=' '.join(doc1)
print(df)

                                   Sentence
0  The children are playing in the gardens.
1     She studies harder than her brothers.
2      They were running towards the buses.
3       He played better matches last year.
4           The cats were chasing the mice.
                                Sentence
0  the children are play in the garden .
1    she studi harder than her brother .
2        they were run toward the buse .
3       he play better match last year .
4          the cat were chase the mice .


# LancasterStemmer

In [19]:
from nltk.stem import LancasterStemmer

In [20]:
ls=LancasterStemmer()

In [21]:
ls.stem('Programmer')

'program'

In [22]:
ls.stem('Running')

'run'

In [23]:
ls.stem('Programming')

'program'

In [24]:
ls.stem('Happily')

'happy'

In [57]:
data = {
    "Sentence": [
        "The children are playing in the gardens.",
        "She studies harder than her brothers.",
        "They were running towards the buses.",
        "He played better matches last year.",
        "The cats were chasing the mice."
    ]
}

df = pd.DataFrame(data)
print(df)
for i,doc in enumerate(df['Sentence']):
    doc1=[]
    for token in word_tokenize(doc):
        doc1.append(ls.stem(token))
    df.loc[i,'Sentence']=' '.join(doc1)
print(df) 

                                   Sentence
0  The children are playing in the gardens.
1     She studies harder than her brothers.
2      They were running towards the buses.
3       He played better matches last year.
4           The cats were chasing the mice.
                           Sentence
0  the childr ar play in the gard .
1   she study hard than her broth .
2     they wer run toward the bus .
3     he play bet match last year .
4        the cat wer chas the mic .


# Lematization

In [3]:
from nltk.stem import WordNetLemmatizer

In [4]:
wl=WordNetLemmatizer()

In [5]:
wl.lemmatize('cats') 

'cat'

In [6]:
wl.lemmatize('running',pos='n')

'running'

In [7]:
wl.lemmatize('better',pos='n')

'better'

In [35]:
wl.lemmatize('faithfully')

'faithfully'

In [58]:
data = {
    "Sentence": [
        "The children are playing in the gardens.",
        "She studies harder than her brothers.",
        "They were running towards the buses.",
        "He played better matches last year.",
        "The cats were chasing the mice."
    ]
}

df = pd.DataFrame(data)
print(df)
for i,doc in enumerate(df['Sentence']):
    doc1=[]
    for token in word_tokenize(doc):
        doc1.append(wl.lemmatize(token))
    df.loc[i,'Sentence']=' '.join(doc1)
print(df) 

                                   Sentence
0  The children are playing in the gardens.
1     She studies harder than her brothers.
2      They were running towards the buses.
3       He played better matches last year.
4           The cats were chasing the mice.
                                Sentence
0  The child are playing in the garden .
1    She study harder than her brother .
2    They were running towards the bus .
3     He played better match last year .
4       The cat were chasing the mouse .


# Data Preprocessing Function

In [48]:
def pre_processing(data,column,case='lower',tags=True,urls=True,mentions_hashtags=True,digits=True,dates=True,Contractions=True,stop_wordss=True,inflections='stem',stems='porter',emoji=True,emails=True,punctuations=True):
    import emoji
    import contractions
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    import pandas
    from nltk.stem import PorterStemmer,SnowballStemmer,LancasterStemmer,WordNetLemmatizer
    import re
    ps=PorterStemmer()
    ss=SnowballStemmer('english')
    ls=LancasterStemmer()
    wl=WordNetLemmatizer()
    
    stop_words=stopwords.words('english')
    stop_words.remove('not')

    

    if emoji==True:
        data[column]=data[column].apply(lambda x:emoji.demojize(x,delimiters=('','')))
    
    if case=='lower':
        data[column]=data[column].str.lower()
    elif case == 'upper':
        data[column]=data[column].str.upper()
    else:
        pass

    if tags==True:
        data[column]=data[column].apply(lambda x: re.sub(r'<.*?>',' ',x))

    if urls==True:
        data[column]=data[column].apply(lambda x:re.sub(r'https?://\S*','',x))

    if mentions_hashtags==True:
        data[column]=data[column].apply(lambda x: re.sub(r'\B[@#]\S+','',x))
        
    if dates==True:
        data[column]=data[column].apply(lambda x: re.sub(r'\d{1,2}-\d{1,2}-\d{4}',' ', x))
        data[column]=data[column].apply(lambda x: re.sub(r'\d{1,2}/\d{1,2}/\d{4}',' ', x))

    if digits==True:
        data[column]=data[column].apply(lambda x: re.sub(r'\d','',x))

    if Contractions==True:
        data[column]=data[column].apply(lambda x: contractions.fix(x))


    if emails==True:
        data[column]=data[column].apply(lambda x: re.sub(r'\S+@\S+','',x))

    def stop(data,column):
        l2=[]
        for doc in data[column]:
            l1=[]
            for token in word_tokenize(doc):
                if token.lower() not in stop_words:
                    l1.append(token)
            l2.append(' '.join(l1))
        return l2
    if stop_wordss==True:
        data[column]=stop(data,column)


    if inflections=='stem':
        for i,doc in enumerate(data[column]):
            l1=[]
            for token in word_tokenize(doc):
                if stems=='ps':
                    word=ps.stem(token)
                    l1.append(word)
                elif stems=='ss':
                    word=ss.stem(token)
                    l1.append(word)
                elif stems=='ls':
                    word=wl.stem(token)
                    l1.append(word) 
            data.loc[i,column]=' '.join(l1)
    elif inflections=='lemma':
        for i,doc in enumerate(data[column]):
            l1=[]
            for token in word_tokenize(doc):
                word=wl.lemmatize(token)
            l1.append(word)
            data.loc[i,column]=' '.join(l1)

    if punctuations==True:
        data[column]=data[column].apply(lambda x: re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]','',x))
    
    return data

In [242]:
data1=pd.read_csv(r"C:\Users\masir\Downloads\Machine Learning\NLP\train (2).csv",on_bad_lines='skip',delimiter=';')

In [243]:
data1.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,2,FAIL! The Trump Organization‚Äôs Credit Score W...,While the controversy over Trump s personal ta...,0
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


In [248]:
simple_eda(data1,'text')

The text contains the both lower and upper cases
The text contains the xml or html tags
The text contains urls
The text contains emails
The text contains mentions and hastags
The text contains emojis
The text contains digits
The text contains Punctuations
The text contains date
The text has 1 duplicate values


In [249]:
pre_processing(data1,'text')

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,,1
1,1,China says Trump call with Taiwan president wo...,,1
2,2,FAIL! The Trump Organization‚Äôs Credit Score W...,,0
3,3,Zimbabwe military chief's China trip was norma...,,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,,0
...,...,...,...,...
24348,24348,Mexico Senate committee OK's air transport dea...,,1
24349,24349,BREAKING: HILLARY CLINTON‚ÄôS STATE DEPARTMENT G...,,0
24350,24350,trump breaks from stump speech to admire beaut...,,0
24351,24351,NFL PLAYER Delivers Courageous Message: Stop B...,,0


# Categorical Encoding

# Nominal Encoding

In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
df=sns.load_dataset('tips')

In [20]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [21]:
df.sex.unique()

['Female', 'Male']
Categories (2, str): ['Male', 'Female']

In [22]:
ohe=OneHotEncoder()
ohe 

0,1,2
,"categories  categories: 'auto' or a list of array-like, default='auto' Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith  column. The passed categories should not mix strings and numeric  values within a single feature, and should be sorted in case of  numeric values. The used categories can be found in the ``categories_`` attribute. .. versionadded:: 0.20",'auto'
,"drop  drop: {'first', 'if_binary'} or an array-like of shape (n_features,), default=None Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into an unregularized linear regression model. However, dropping one category breaks the symmetry of the original representation and can therefore induce a bias in downstream models, for instance for penalized linear classification or regression models. - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one  category is present, the feature will be dropped entirely. - 'if_binary' : drop the first category in each feature with two  categories. Features with 1 or more than 2 categories are  left intact. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that  should be dropped. When `max_categories` or `min_frequency` is configured to group infrequent categories, the dropping behavior is handled after the grouping. .. versionadded:: 0.21  The parameter `drop` was added in 0.21. .. versionchanged:: 0.23  The option `drop='if_binary'` was added in 0.23. .. versionchanged:: 1.1  Support for dropping infrequent categories.",
,"sparse_output  sparse_output: bool, default=True When ``True``, it returns a :class:`scipy.sparse.csr_matrix`, i.e. a sparse matrix in ""Compressed Sparse Row"" (CSR) format. .. versionadded:: 1.2  `sparse` was renamed to `sparse_output`",True
,"dtype  dtype: number type, default=np.float64 Desired dtype of output.",<class 'numpy.float64'>
,"handle_unknown  handle_unknown: {'error', 'ignore', 'infrequent_if_exist', 'warn'}, default='error' Specifies the way unknown categories are handled during :meth:`transform`. - 'error' : Raise an error if an unknown category is present during transform. - 'ignore' : When an unknown category is encountered during  transform, the resulting one-hot encoded columns for this feature  will be all zeros. In the inverse transform, an unknown category  will be denoted as None. - 'infrequent_if_exist' : When an unknown category is encountered  during transform, the resulting one-hot encoded columns for this  feature will map to the infrequent category if it exists. The  infrequent category will be mapped to the last position in the  encoding. During inverse transform, an unknown category will be  mapped to the category denoted `'infrequent'` if it exists. If the  `'infrequent'` category does not exist, then :meth:`transform` and  :meth:`inverse_transform` will handle an unknown category as with  `handle_unknown='ignore'`. Infrequent categories exist based on  `min_frequency` and `max_categories`. Read more in the  :ref:`User Guide `. - 'warn' : When an unknown category is encountered during transform  a warning is issued, and the encoding then proceeds as described for  `handle_unknown=""infrequent_if_exist""`. .. versionchanged:: 1.1  `'infrequent_if_exist'` was added to automatically handle unknown  categories and infrequent categories. .. versionadded:: 1.6  The option `""warn""` was added in 1.6.",'error'
,"min_frequency  min_frequency: int or float, default=None Specifies the minimum frequency below which a category will be considered infrequent. - If `int`, categories with a smaller cardinality will be considered  infrequent. - If `float`, categories with a smaller cardinality than  `min_frequency * n_samples` will be considered infrequent. .. versionadded:: 1.1  Read more in the :ref:`User Guide `.",
,"max_categories  max_categories: int, default=None Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, `max_categories` includes the category representing the infrequent categories along with the frequent categories. If `None`, there is no limit to the number of output features. .. versionadded:: 1.1  Read more in the :ref:`User Guide `.",
,"feature_name_combiner  feature_name_combiner: ""concat"" or callable, default=""concat"" Callable with signature `def callable(input_feature, category)` that returns a string. This is used to create feature names to be returned by :meth:`get_feature_names_out`. `""concat""` concatenates encoded feature name and category with `feature + ""_"" + str(category)`.E.g. feature X with values 1, 6, 7 create feature names `X_1, X_6, X_7`. .. versionadded:: 1.3",'concat'


In [23]:
ohe.fit_transform(df.sex.to_frame())

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 244 stored elements and shape (244, 2)>

In [24]:
ohe1=OneHotEncoder(sparse_output=False)

In [25]:
ohe1.fit_transform(df[['sex']])

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.

In [26]:
ohe2=OneHotEncoder(sparse_output=False).set_output(transform='pandas')

In [27]:
df2=ohe2.fit_transform(df[['sex']])
df2

Unnamed: 0,sex_Female,sex_Male
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0
...,...,...
239,0.0,1.0
240,1.0,0.0
241,0.0,1.0
242,0.0,1.0


In [28]:
pd.concat([df,df2],axis=1) 

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,1.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1.0,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0


In [29]:
ohe3=OneHotEncoder(sparse_output=False,drop='first').set_output(transform='pandas')
ohe3

0,1,2
,"categories  categories: 'auto' or a list of array-like, default='auto' Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith  column. The passed categories should not mix strings and numeric  values within a single feature, and should be sorted in case of  numeric values. The used categories can be found in the ``categories_`` attribute. .. versionadded:: 0.20",'auto'
,"drop  drop: {'first', 'if_binary'} or an array-like of shape (n_features,), default=None Specifies a methodology to use to drop one of the categories per feature. This is useful in situations where perfectly collinear features cause problems, such as when feeding the resulting data into an unregularized linear regression model. However, dropping one category breaks the symmetry of the original representation and can therefore induce a bias in downstream models, for instance for penalized linear classification or regression models. - None : retain all features (the default). - 'first' : drop the first category in each feature. If only one  category is present, the feature will be dropped entirely. - 'if_binary' : drop the first category in each feature with two  categories. Features with 1 or more than 2 categories are  left intact. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that  should be dropped. When `max_categories` or `min_frequency` is configured to group infrequent categories, the dropping behavior is handled after the grouping. .. versionadded:: 0.21  The parameter `drop` was added in 0.21. .. versionchanged:: 0.23  The option `drop='if_binary'` was added in 0.23. .. versionchanged:: 1.1  Support for dropping infrequent categories.",'first'
,"sparse_output  sparse_output: bool, default=True When ``True``, it returns a :class:`scipy.sparse.csr_matrix`, i.e. a sparse matrix in ""Compressed Sparse Row"" (CSR) format. .. versionadded:: 1.2  `sparse` was renamed to `sparse_output`",False
,"dtype  dtype: number type, default=np.float64 Desired dtype of output.",<class 'numpy.float64'>
,"handle_unknown  handle_unknown: {'error', 'ignore', 'infrequent_if_exist', 'warn'}, default='error' Specifies the way unknown categories are handled during :meth:`transform`. - 'error' : Raise an error if an unknown category is present during transform. - 'ignore' : When an unknown category is encountered during  transform, the resulting one-hot encoded columns for this feature  will be all zeros. In the inverse transform, an unknown category  will be denoted as None. - 'infrequent_if_exist' : When an unknown category is encountered  during transform, the resulting one-hot encoded columns for this  feature will map to the infrequent category if it exists. The  infrequent category will be mapped to the last position in the  encoding. During inverse transform, an unknown category will be  mapped to the category denoted `'infrequent'` if it exists. If the  `'infrequent'` category does not exist, then :meth:`transform` and  :meth:`inverse_transform` will handle an unknown category as with  `handle_unknown='ignore'`. Infrequent categories exist based on  `min_frequency` and `max_categories`. Read more in the  :ref:`User Guide `. - 'warn' : When an unknown category is encountered during transform  a warning is issued, and the encoding then proceeds as described for  `handle_unknown=""infrequent_if_exist""`. .. versionchanged:: 1.1  `'infrequent_if_exist'` was added to automatically handle unknown  categories and infrequent categories. .. versionadded:: 1.6  The option `""warn""` was added in 1.6.",'error'
,"min_frequency  min_frequency: int or float, default=None Specifies the minimum frequency below which a category will be considered infrequent. - If `int`, categories with a smaller cardinality will be considered  infrequent. - If `float`, categories with a smaller cardinality than  `min_frequency * n_samples` will be considered infrequent. .. versionadded:: 1.1  Read more in the :ref:`User Guide `.",
,"max_categories  max_categories: int, default=None Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, `max_categories` includes the category representing the infrequent categories along with the frequent categories. If `None`, there is no limit to the number of output features. .. versionadded:: 1.1  Read more in the :ref:`User Guide `.",
,"feature_name_combiner  feature_name_combiner: ""concat"" or callable, default=""concat"" Callable with signature `def callable(input_feature, category)` that returns a string. This is used to create feature names to be returned by :meth:`get_feature_names_out`. `""concat""` concatenates encoded feature name and category with `feature + ""_"" + str(category)`.E.g. feature X with values 1, 6, 7 create feature names `X_1, X_6, X_7`. .. versionadded:: 1.3",'concat'


In [30]:
ohe3.fit_transform(df[['sex']])

Unnamed: 0,sex_Male
0,0.0
1,1.0
2,1.0
3,1.0
4,0.0
...,...
239,1.0
240,0.0
241,1.0
242,1.0


# Training Data and testing Data

In [31]:
x_train=pd.DataFrame({'Days':['Mon','Tue','Wed','Thur','Fri','Sat']})
x_train

Unnamed: 0,Days
0,Mon
1,Tue
2,Wed
3,Thur
4,Fri
5,Sat


In [32]:
ohe=OneHotEncoder(sparse_output=False,handle_unknown='ignore').set_output(transform='pandas')

In [33]:
ohe.fit_transform(x_train.Days.to_frame())

Unnamed: 0,Days_Fri,Days_Mon,Days_Sat,Days_Thur,Days_Tue,Days_Wed
0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0


In [34]:
test_data=pd.DataFrame({'Days':['Mon','Hyderabad','Tue','Delhi']})
test_data

Unnamed: 0,Days
0,Mon
1,Hyderabad
2,Tue
3,Delhi


In [35]:
ohe.transform(test_data[['Days']])

Unnamed: 0,Days_Fri,Days_Mon,Days_Sat,Days_Thur,Days_Tue,Days_Wed
0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0


# Ordinal Encoding

In [36]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [37]:
def size_cat(x):
    if x>4:
        return 'Large'
    elif x>3:
        return 'Medium'
    else:
        return 'Small'

In [38]:
df['size_category']=df['size'].apply(lambda x: size_cat(x))

In [39]:
df['size_category'].unique()

<StringArray>
['Small', 'Medium', 'Large']
Length: 3, dtype: str

In [40]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,size_category
0,16.99,1.01,Female,No,Sun,Dinner,2,Small
1,10.34,1.66,Male,No,Sun,Dinner,3,Small
2,21.01,3.5,Male,No,Sun,Dinner,3,Small
3,23.68,3.31,Male,No,Sun,Dinner,2,Small
4,24.59,3.61,Female,No,Sun,Dinner,4,Medium


In [41]:
from sklearn.preprocessing import OrdinalEncoder

In [42]:
OE=OrdinalEncoder(categories=[['Small','Medium','Large']]).set_output(transform='pandas')
OE

0,1,2
,"categories  categories: 'auto' or a list of array-like, default='auto' Categories (unique values) per feature: - 'auto' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith  column. The passed categories should not mix strings and numeric  values, and should be sorted in case of numeric values. The used categories can be found in the ``categories_`` attribute.","[['Small', 'Medium', ...]]"
,"dtype  dtype: number type, default=np.float64 Desired dtype of output.",<class 'numpy.float64'>
,"handle_unknown  handle_unknown: {'error', 'use_encoded_value'}, default='error' When set to 'error' an error will be raised in case an unknown categorical feature is present during transform. When set to 'use_encoded_value', the encoded value of unknown categories will be set to the value given for the parameter `unknown_value`. In :meth:`inverse_transform`, an unknown category will be denoted as None. .. versionadded:: 0.24",'error'
,"unknown_value  unknown_value: int or np.nan, default=None When the parameter handle_unknown is set to 'use_encoded_value', this parameter is required and will set the encoded value of unknown categories. It has to be distinct from the values used to encode any of the categories in `fit`. If set to np.nan, the `dtype` parameter must be a float dtype. .. versionadded:: 0.24",
,"encoded_missing_value  encoded_missing_value: int or np.nan, default=np.nan Encoded value of missing categories. If set to `np.nan`, then the `dtype` parameter must be a float dtype. .. versionadded:: 1.1",
,"min_frequency  min_frequency: int or float, default=None Specifies the minimum frequency below which a category will be considered infrequent. - If `int`, categories with a smaller cardinality will be considered  infrequent. - If `float`, categories with a smaller cardinality than  `min_frequency * n_samples` will be considered infrequent. .. versionadded:: 1.3  Read more in the :ref:`User Guide `.",
,"max_categories  max_categories: int, default=None Specifies an upper limit to the number of output categories for each input feature when considering infrequent categories. If there are infrequent categories, `max_categories` includes the category representing the infrequent categories along with the frequent categories. If `None`, there is no limit to the number of output features. `max_categories` do **not** take into account missing or unknown categories. Setting `unknown_value` or `encoded_missing_value` to an integer will increase the number of unique integer codes by one each. This can result in up to `max_categories + 2` integer codes. .. versionadded:: 1.3  Read more in the :ref:`User Guide `.",


In [47]:
OE.categories_

[array(['Small', 'Medium', 'Large'], dtype=object)]

In [44]:
df1=OE.fit_transform(df[['size_category']])
df1

Unnamed: 0,size_category
0,0.0
1,0.0
2,0.0
3,0.0
4,1.0
...,...
239,0.0
240,0.0
241,0.0
242,0.0


In [45]:
pd.concat([df,df1],axis=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,size_category,size_category.1
0,16.99,1.01,Female,No,Sun,Dinner,2,Small,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,Small,0.0
2,21.01,3.50,Male,No,Sun,Dinner,3,Small,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,Small,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,Medium,1.0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,Small,0.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,Small,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,Small,0.0
242,17.82,1.75,Male,No,Sat,Dinner,2,Small,0.0


# Training and Testing Data

In [118]:
x_Train=pd.DataFrame({'Level':['Low','Medium','High']})

In [120]:
categories=x_Train.Level.values

In [136]:
oe=OrdinalEncoder(categories=[categories],handle_unknown='use_encoded_value',unknown_value=-1).set_output(transform='pandas')

In [137]:
oe.fit_transform(x_Train[['Level']]) 

Unnamed: 0,Level
0,0.0
1,1.0
2,2.0


In [138]:
X_test=pd.DataFrame({'Level':['Low','Hyderabad','Medium']})

In [139]:
oe.fit_transform(X_test[['Level']])

Unnamed: 0,Level
0,0.0
1,-1.0
2,1.0


# LabelEncoder

In [83]:
from sklearn.preprocessing import LabelEncoder

In [84]:
df=pd.read_csv(r"C:\Users\masir\Downloads\Datasets\loan_data_set.csv")

In [85]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [86]:
le=LabelEncoder()

In [87]:
le

In [90]:
df1=le.fit_transform(df[['Loan_Status']])
df2=pd.DataFrame(df1) 
df2

  y = column_or_1d(y, warn=True)


Unnamed: 0,0
0,1
1,0
2,1
3,1
4,1
...,...
609,1
610,1
611,1
612,1


In [91]:
pd.concat([df,df2],axis=1)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,0
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y,1
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y,1
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y,1
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y,1


# Training and Testing Data

In [140]:
le=LabelEncoder()

In [141]:
x_Train=pd.DataFrame({'Gender':['Male','Female']})
x_Train

Unnamed: 0,Gender
0,Male
1,Female


In [144]:
le.fit_transform(x_Train[['Gender']])

  y = column_or_1d(y, warn=True)


array([1, 0])

In [143]:
x_test=pd.DataFrame({'Gender':['Male','Female','Unknown']})

In [145]:
X_test

Unnamed: 0,Level
0,Low
1,Hyderabad
2,Medium


In [146]:
le.transform(x_test[['Gender']])
# cant handle the unseen data

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


ValueError: y contains previously unseen labels: 'Unknown'

# Bag of Words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
cv=CountVectorizer()

In [15]:
cv

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (strip_accents and lowercase) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"stop_words  stop_words: {'english'}, list, default=None If 'english', a built-in stop word list for English is used. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str or None, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'
,"ngram_range  ngram_range: tuple (min_n, max_n), default=(1, 1) The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted. All values of n such such that min_n <= n <= max_n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means only bigrams. Only applies if ``analyzer`` is not callable.","(1, ...)"


In [16]:
df=pd.DataFrame({'Review':['biryani is good','I love Hyderabad']})
df

Unnamed: 0,Review
0,biryani is good
1,I love Hyderabad


In [17]:
cv.fit_transform(df['Review'])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 5 stored elements and shape (2, 5)>

# cv.get_feature_names_out()

In [18]:
cv.get_feature_names_out()

array(['biryani', 'good', 'hyderabad', 'is', 'love'], dtype=object)

# cv.vocabulary_

In [19]:
cv.vocabulary_

{'biryani': 0, 'is': 3, 'good': 1, 'love': 4, 'hyderabad': 2}

# TrainingData

In [44]:
df=pd.DataFrame({'Review':['I love biryani biryani','Biryani is Famous','Biryani is to Costly']})
df

Unnamed: 0,Review
0,I love biryani biryani
1,Biryani is Famous
2,Biryani is to Costly


In [45]:
x=cv.fit_transform(df['Review'])
x

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (3, 6)>

In [46]:
cv.get_feature_names_out()

array(['biryani', 'costly', 'famous', 'is', 'love', 'to'], dtype=object)

In [47]:
pd.DataFrame(x.toarray(),columns=cv.get_feature_names_out())

Unnamed: 0,biryani,costly,famous,is,love,to
0,2,0,0,0,1,0
1,1,0,1,1,0,0
2,1,1,0,1,0,1


In [48]:
df=pd.DataFrame({'Review':['I love biryani because biryani is famous','Biryani is Famous','Biryani is to Costly']})
df

Unnamed: 0,Review
0,I love biryani because biryani is famous
1,Biryani is Famous
2,Biryani is to Costly


In [49]:
cv1=CountVectorizer(lowercase=False)

In [50]:
x=cv1.fit_transform(df['Review'])
x

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 12 stored elements and shape (3, 9)>

In [51]:
cv1.get_feature_names_out()

array(['Biryani', 'Costly', 'Famous', 'because', 'biryani', 'famous',
       'is', 'love', 'to'], dtype=object)

In [52]:
pd.DataFrame(x.toarray(),columns=cv1.get_feature_names_out())

Unnamed: 0,Biryani,Costly,Famous,because,biryani,famous,is,love,to
0,0,0,0,1,2,1,1,1,0
1,1,0,1,0,0,0,1,0,0
2,1,1,0,0,0,0,1,0,1


# Testing Data

In [53]:
df=pd.DataFrame({'Review':['I love biryani','Biryani is Famous','Biryani is to Costly']})
df
x=cv.fit_transform(df['Review'])
x

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (3, 6)>

In [54]:
test_df = pd.DataFrame({
    'Review': [
        'I love chicken biryani',     # "chicken" ‚Üí OOV
        'Hyderabad biryani is tasty', # "hyderabad", "tasty" ‚Üí OOV
        'Biryani is very expensive'   # "very", "expensive" ‚Üí OOV
    ]
})
test_df

Unnamed: 0,Review
0,I love chicken biryani
1,Hyderabad biryani is tasty
2,Biryani is very expensive


In [55]:
y=cv.transform(test_df['Review'])
y

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 6 stored elements and shape (3, 6)>

In [56]:
pd.DataFrame(y.toarray())

Unnamed: 0,0,1,2,3,4,5
0,1,0,0,0,1,0
1,1,0,0,1,0,0
2,1,0,0,1,0,0


In [57]:
pd.DataFrame(y.toarray(),columns=cv.get_feature_names_out())

Unnamed: 0,biryani,costly,famous,is,love,to
0,1,0,0,0,1,0
1,1,0,0,1,0,0
2,1,0,0,1,0,0


# strip_accents

In [22]:
cv2=CountVectorizer(strip_accents=None)

In [23]:
df=pd.DataFrame({'Review':['please keep your resume ready','this adds weightage to your resum√©']})
df
x=cv2.fit_transform(df['Review'])
x

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (2, 10)>

In [24]:
cv2.vocabulary_

{'please': 2,
 'keep': 1,
 'your': 9,
 'resume': 4,
 'ready': 3,
 'this': 6,
 'adds': 0,
 'weightage': 8,
 'to': 7,
 'resum√©': 5}

In [25]:
# here resume and resum√© have same meaning but the algorithm has built the vocabulary with 2 different words

In [26]:
cv2=CountVectorizer(strip_accents='unicode')
# change the ascent to get orginal form of a word

In [27]:
x=cv2.fit_transform(df['Review'])

In [28]:
cv2.vocabulary_

{'please': 2,
 'keep': 1,
 'your': 8,
 'resume': 4,
 'ready': 3,
 'this': 5,
 'adds': 0,
 'weightage': 7,
 'to': 6}

# preprocessor

In [29]:
df 

Unnamed: 0,Review
0,please keep your resume ready
1,this adds weightage to your resum√©


In [30]:
cv3=CountVectorizer(strip_accents='unicode',preprocessor=None)

In [31]:
z=cv3.fit_transform(df['Review'])

In [32]:
pd.DataFrame(z.toarray(),columns=cv3.get_feature_names_out())

Unnamed: 0,adds,keep,please,ready,resume,this,to,weightage,your
0,0,1,1,1,1,0,0,0,1
1,1,0,0,0,1,1,1,1,1


In [33]:
def pre_processing(text):
    text = text.lower()
    text = text.replace('!', '')
    return text

In [34]:
# my own function prepre_processing
cv3=CountVectorizer(preprocessor=pre_processing)  

In [35]:
z=cv3.fit_transform(df['Review'])
pd.DataFrame(z.toarray(),columns=cv3.get_feature_names_out()) 

Unnamed: 0,adds,keep,please,ready,resume,resum√©,this,to,weightage,your
0,0,1,1,1,1,0,0,0,0,1
1,1,0,0,0,0,1,1,1,1,1


# tokenizer

In [36]:
def tokens(text):
    text=word_tokenize(text)
    return text

In [37]:
cv4=CountVectorizer(tokenizer=tokens)

In [38]:
z=cv4.fit_transform(df['Review'])
pd.DataFrame(z.toarray(),columns=cv4.get_feature_names_out()) 

Unnamed: 0,adds,keep,please,ready,resume,resum√©,this,to,weightage,your
0,0,1,1,1,1,0,0,0,0,1
1,1,0,0,0,0,1,1,1,1,1


# analyzer

In [65]:
cv5=CountVectorizer(analyzer='word') 
a=cv5.fit_transform(df['Review'])    

In [67]:
pd.DataFrame(a.toarray(),columns=cv5.get_feature_names_out())    

Unnamed: 0,adds,keep,please,ready,resume,resum√©,this,to,weightage,your
0,0,1,1,1,1,0,0,0,0,1
1,1,0,0,0,0,1,1,1,1,1


In [68]:
cv5=CountVectorizer(analyzer='char') 
a=cv5.fit_transform(df['Review'])  

In [69]:
pd.DataFrame(a.toarray(),columns=cv5.get_feature_names_out())   

Unnamed: 0,Unnamed: 1,a,d,e,g,h,i,k,l,m,o,p,r,s,t,u,w,y,√©
0,4,2,1,7,0,0,0,1,1,1,1,2,3,2,0,2,0,2,0
1,5,2,2,3,2,2,2,0,0,1,2,0,2,3,3,2,1,1,1


In [70]:
cv5=CountVectorizer(analyzer='char_wb') 
a=cv5.fit_transform(df['Review'])
pd.DataFrame(a.toarray(),columns=cv5.get_feature_names_out())   

Unnamed: 0,Unnamed: 1,a,d,e,g,h,i,k,l,m,o,p,r,s,t,u,w,y,√©
0,10,2,1,7,0,0,0,1,1,1,1,2,3,2,0,2,0,2,0
1,12,2,2,3,2,2,2,0,0,1,2,0,2,3,3,2,1,1,1


# stopwords

In [51]:
stp=stopwords.words('english')
stp

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [52]:
stp.remove('not')

In [53]:
cv6=CountVectorizer(stop_words=stp)

In [54]:
df=pd.DataFrame({'Review':['you can not do anything without help']})
df
x=cv6.fit_transform(df['Review'])
x

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 4 stored elements and shape (1, 4)>

In [55]:
cv6.vocabulary_

{'not': 2, 'anything': 0, 'without': 3, 'help': 1}

# toke_pattern

In [56]:
cv7=CountVectorizer()

In [59]:
df=pd.DataFrame({'col1':['i love my country']})
df

Unnamed: 0,col1
0,i love my country


In [60]:
cv7.fit_transform(df['col1'])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3 stored elements and shape (1, 3)>

In [61]:
cv7.vocabulary_
# there was no 'i' in vocabulary beacuse it has default token_pattern that the word should have greater than 2 characters

{'love': 1, 'my': 2, 'country': 0}

In [62]:
cv8=CountVectorizer(token_pattern=r'(?u)\b\w+\b')
# changing the pattern to caputre the words with one character

In [63]:
cv8.fit_transform(df['col1'])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 4 stored elements and shape (1, 4)>

In [65]:
cv8.vocabulary_

{'i': 1, 'love': 2, 'my': 3, 'country': 0}

# ngran_range(min_n,max_n)

In [75]:
df

Unnamed: 0,col1
0,i love my country


In [84]:
cv9=CountVectorizer(ngram_range=(1,1),token_pattern=r'(?u)\b\w+\b')

In [85]:
pd.DataFrame(cv9.fit_transform(df['col1']).toarray(),columns=cv9.get_feature_names_out())

Unnamed: 0,country,i,love,my
0,1,1,1,1


In [86]:
cv9.vocabulary_

{'i': 1, 'love': 2, 'my': 3, 'country': 0}

In [87]:
cv1=CountVectorizer(ngram_range=(2,2),token_pattern=r'(?u)\b\w+\b')

In [88]:
pd.DataFrame(cv1.fit_transform(df['col1']).toarray(),columns=cv1.get_feature_names_out())

Unnamed: 0,i love,love my,my country
0,1,1,1


In [89]:
cv1.vocabulary_

{'i love': 0, 'love my': 1, 'my country': 2}

In [92]:
cv2=CountVectorizer(ngram_range=(1,2),token_pattern=r'(?u)\b\w+\b')

In [93]:
pd.DataFrame(cv2.fit_transform(df['col1']).toarray(),columns=cv2.get_feature_names_out())

Unnamed: 0,country,i,i love,love,love my,my,my country
0,1,1,1,1,1,1,1


In [94]:
cv2.vocabulary_

{'i': 1,
 'love': 3,
 'my': 5,
 'country': 0,
 'i love': 2,
 'love my': 4,
 'my country': 6}

In [96]:
cv3=CountVectorizer(ngram_range=(4,4),token_pattern=r'(?u)\b\w+\b')

In [97]:
pd.DataFrame(cv3.fit_transform(df['col1']).toarray(),columns=cv3.get_feature_names_out())

Unnamed: 0,i love my country
0,1


In [98]:
cv3.vocabulary_

{'i love my country': 0}

# min_df & max_df

In [123]:
df=pd.DataFrame({'col':['i love hyderabad biryani','in hyderabad biryani is famous','shagouse is famous for biryani','biryani is to costly','biryani is love','my freind like biryani']})
df

Unnamed: 0,col
0,i love hyderabad biryani
1,in hyderabad biryani is famous
2,shagouse is famous for biryani
3,biryani is to costly
4,biryani is love
5,my freind like biryani


In [124]:
cv4=CountVectorizer(min_df=2,max_df=5,token_pattern=r'(?u)\b\w+\b')

In [125]:
pd.DataFrame(cv4.fit_transform(df['col']).toarray(),columns=cv4.get_feature_names_out())

Unnamed: 0,famous,hyderabad,is,love
0,0,1,0,1
1,1,1,1,0
2,1,0,1,0
3,0,0,1,0
4,0,0,1,1
5,0,0,0,0


In [126]:
cv4.vocabulary_

{'love': 3, 'hyderabad': 1, 'is': 2, 'famous': 0}

# max_features

In [127]:
df=pd.DataFrame({'col':['i love hyderabad biryani','in hyderabad biryani is famous','shagouse is famous for biryani','biryani is to costly','biryani is love','my freind like biryani']})
df

Unnamed: 0,col
0,i love hyderabad biryani
1,in hyderabad biryani is famous
2,shagouse is famous for biryani
3,biryani is to costly
4,biryani is love
5,my freind like biryani


In [128]:
cv5=CountVectorizer(max_features=3)

In [129]:
cv5.fit_transform(df['col'])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 12 stored elements and shape (6, 3)>

In [130]:
cv5.vocabulary_

{'hyderabad': np.int64(1), 'biryani': np.int64(0), 'is': np.int64(2)}

# binary bag of words

In [134]:
df=pd.DataFrame({'col':['i love hyderabad biryani biryani is is ','biryani biryani hello love love is is is']})
df

Unnamed: 0,col
0,i love hyderabad biryani biryani is is
1,biryani biryani hello love love is is is


In [135]:
cv6=CountVectorizer()
pd.DataFrame(cv6.fit_transform(df['col']).toarray(),columns=cv6.get_feature_names_out())

Unnamed: 0,biryani,hello,hyderabad,is,love
0,2,0,1,2,1
1,2,1,0,3,2


In [136]:
cv6=CountVectorizer(binary=True)
pd.DataFrame(cv6.fit_transform(df['col']).toarray(),columns=cv6.get_feature_names_out())

Unnamed: 0,biryani,hello,hyderabad,is,love
0,1,0,1,1,1
1,1,1,0,1,1


# TF-IDF

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
train_df=pd.DataFrame({'col':['Biryani is good','Biryani is not tasty']})
test_df=pd.DataFrame({'col':['Pizza is good','Biryani is tasty']})

In [64]:
train_df

Unnamed: 0,col
0,Biryani is good
1,Biryani is not tasty


In [65]:
test_df

Unnamed: 0,col
0,Pizza is good
1,Biryani is tasty


In [66]:
tfidf=TfidfVectorizer()

In [67]:
tfidf.fit_transform(train_df['col'])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7 stored elements and shape (2, 5)>

In [68]:
pd.DataFrame(tfidf.fit_transform(train_df['col']).toarray(),columns=tfidf.get_feature_names_out())

Unnamed: 0,biryani,good,is,not,tasty
0,0.501549,0.704909,0.501549,0.0,0.0
1,0.409937,0.0,0.409937,0.576152,0.576152


In [69]:
tfidf.idf_

array([1.        , 1.40546511, 1.        , 1.40546511, 1.40546511])

In [70]:
pd.DataFrame(tfidf.transform(test_df['col']).toarray()) 

Unnamed: 0,0,1,2,3,4
0,0.0,0.814802,0.579739,0.0,0.0
1,0.501549,0.0,0.501549,0.0,0.704909


In [71]:
pd.DataFrame(tfidf.transform(test_df['col']).toarray(),columns=tfidf.get_feature_names_out()) 

Unnamed: 0,biryani,good,is,not,tasty
0,0.0,0.814802,0.579739,0.0,0.0
1,0.501549,0.0,0.501549,0.0,0.704909


# use_idf

In [97]:
train_df=pd.DataFrame({'col':['Biryani is good','Biryani is not tasty']})

In [98]:
tfidf=TfidfVectorizer(use_idf=True)

In [99]:
pd.DataFrame(tfidf.fit_transform(train_df['col']).toarray(),columns=tfidf.get_feature_names_out())

Unnamed: 0,biryani,good,is,not,tasty
0,0.501549,0.704909,0.501549,0.0,0.0
1,0.409937,0.0,0.409937,0.576152,0.576152


In [100]:
tfidf.idf_

array([1.        , 1.40546511, 1.        , 1.40546511, 1.40546511])

In [109]:
tfidf=TfidfVectorizer(use_idf=False)

In [110]:
pd.DataFrame(tfidf.fit_transform(train_df['col']).toarray(),columns=tfidf.get_feature_names_out())

Unnamed: 0,biryani,good,is,not,tasty
0,0.57735,0.57735,0.57735,0.0,0.0
1,0.5,0.0,0.5,0.5,0.5


In [111]:
tfidf.idf_

AttributeError: 'TfidfTransformer' object has no attribute 'idf_'

# smooth_idf

In [112]:
train_df=pd.DataFrame({'col':['Biryani is good','Biryani is not tasty']})
train_df

Unnamed: 0,col
0,Biryani is good
1,Biryani is not tasty


In [113]:
tfidf=TfidfVectorizer(use_idf=True,smooth_idf=True)

In [114]:
pd.DataFrame(tfidf.fit_transform(train_df['col']).toarray(),columns=tfidf.get_feature_names_out())

Unnamed: 0,biryani,good,is,not,tasty
0,0.501549,0.704909,0.501549,0.0,0.0
1,0.409937,0.0,0.409937,0.576152,0.576152
