In [18]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    """
    Performs text preprocessing steps.

    Args:
        text: The input text.

    Returns:
        Preprocessed text.
    """

    # Lowercase
    text = text.lower()

    

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in stemmed_tokens]

    return ' '.join(lemmatized_tokens)

# Load your dataset
data = pd.read_csv(r"C:\Users\marat\OneDrive\Desktop\2nd year\project\IMDB Dataset.csv")
data=data.iloc[0:100]
data['review']=data['review'].apply(remove_tags)
# Apply preprocessing to the 'review' column
data['preprocessed_review'] = data['review'].apply(preprocess_text)

print(data.head(10))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. The filming tec...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   
5  Probably my all-time favorite movie, a story o...  positive   
6  I sure would like to see a resurrection of a u...  positive   
7  This show was an amazing, fresh & innovative i...  negative   
8  Encouraged by the positive comments about this...  negative   
9  If you like original gut wrenching laughter yo...  positive   

                                 preprocessed_review  
0  one review mention watch 1 oz episod youll hoo...  
1  wonder littl product film techniqu unassum old...  
2  thought wonder way spend time hot summer weeke...  
3  basic there famili littl boy jake think there ... 

In [12]:
data['review'][2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [13]:
data['preprocessed_review'][2]

'thought wonder way spend time hot summer weekend sit air condit theater watch lightheart comedi plot simplist dialogu witti charact likabl even well bread suspect serial killer may disappoint realiz match point 2 risk addict thought proof woodi allen still fulli control style mani u grown lovethi id laugh one woodi comedi year dare say decad ive never impress scarlet johanson manag tone sexi imag jump right averag spirit young womanthi may crown jewel career wittier devil wear prada interest superman great comedi go see friend'

## Lowercasing

In [9]:
sen1="""Text preprocessing is a crucial step in natural language processing (NLP), where raw text—often noisy and unstructured—is transformed into a format that can be effectively utilized by machine learning algorithms. For example, consider a sentence like: 'The quick brown foxes, were jumping over the lazy dogs!' During preprocessing, one might tokenize the sentence, remove stopwords such as 'the', 'and', or 'were', and normalize text by converting everything to lowercase. Additionally, stemming or lemmatization could be applied to words like 'jumping', 'foxes', and 'dogs' to reduce them to their base forms: 'jump', 'fox', and 'dog'. By applying these techniques, the text becomes easier for models to analyze and process."""
sen1.lower()
import pandas as pd
data=pd.read_csv(r"C:\Users\marat\OneDrive\Desktop\2nd year\project\IMDB Dataset.csv")
df=data.iloc[:5000]
df['review']=df['review'].apply(lambda x: x.lower())
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review']=df['review'].apply(lambda x: x.lower())


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## Stop Words Removal

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Remove Pantuations


In [13]:

import string
string.punctuation

exclude=string.punctuation

def rem_pun(text):
    for char in exclude:
        text=text.replace(char,'')
    return text

import re 
def remove_tags(raw_text):
    cle_text=re.sub(re.compile('<.*?>'),'',raw_text)
    return cle_text

df['review']=df['review'].apply(remove_tags)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review']=df['review'].apply(remove_tags)


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [9]:
rem_pun(df)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
4995,an interesting slasher film with multiple susp...,negative
4996,i watched this series when it first came out i...,positive
4997,once again jet li brings his charismatic prese...,positive
4998,"i rented this movie, after hearing chris gore ...",negative


## chatword removal

In [10]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [8]:
def chat_conversion(text):
    new_text = []
    for i in text.split():
        if i.upper() in chat_words:
            new_text.append(chat_words[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)

In [13]:
text = 'IMO he is the best'
text1 = 'FYI Mumbai is the capital of Maharashtra'
# Calling function
print(chat_conversion(text))
print(chat_conversion(text1))

In My Opinion he is the best
For Your Information Mumbai is the capital of Maharashtra


## Tokenization


1. using basic split function

In [14]:
sen1="""Text preprocessing is a crucial step in natural language processing (NLP), where raw text—often noisy and unstructured—is transformed into a format that can be effectively utilized by machine learning algorithms. For example, consider a sentence like: 'The quick brown foxes, were jumping over the lazy dogs!' During preprocessing, one might tokenize the sentence, remove stopwords such as 'the', 'and', or 'were', and normalize text by converting everything to lowercase. Additionally, stemming or lemmatization could be applied to words like 'jumping', 'foxes', and 'dogs' to reduce them to their base forms: 'jump', 'fox', and 'dog'. By applying these techniques, the text becomes easier for models to analyze and process."""
sen1.split(".")

['Text preprocessing is a crucial step in natural language processing (NLP), where raw text—often noisy and unstructured—is transformed into a format that can be effectively utilized by machine learning algorithms',
 " For example, consider a sentence like: 'The quick brown foxes, were jumping over the lazy dogs!' During preprocessing, one might tokenize the sentence, remove stopwords such as 'the', 'and', or 'were', and normalize text by converting everything to lowercase",
 " Additionally, stemming or lemmatization could be applied to words like 'jumping', 'foxes', and 'dogs' to reduce them to their base forms: 'jump', 'fox', and 'dog'",
 ' By applying these techniques, the text becomes easier for models to analyze and process',
 '']

2. Regular Expression

In [15]:
import re
tokens=re.findall("[\w']+",sen1)
tokens

  tokens=re.findall("[\w']+",sen1)


['Text',
 'preprocessing',
 'is',
 'a',
 'crucial',
 'step',
 'in',
 'natural',
 'language',
 'processing',
 'NLP',
 'where',
 'raw',
 'text',
 'often',
 'noisy',
 'and',
 'unstructured',
 'is',
 'transformed',
 'into',
 'a',
 'format',
 'that',
 'can',
 'be',
 'effectively',
 'utilized',
 'by',
 'machine',
 'learning',
 'algorithms',
 'For',
 'example',
 'consider',
 'a',
 'sentence',
 'like',
 "'The",
 'quick',
 'brown',
 'foxes',
 'were',
 'jumping',
 'over',
 'the',
 'lazy',
 'dogs',
 "'",
 'During',
 'preprocessing',
 'one',
 'might',
 'tokenize',
 'the',
 'sentence',
 'remove',
 'stopwords',
 'such',
 'as',
 "'the'",
 "'and'",
 'or',
 "'were'",
 'and',
 'normalize',
 'text',
 'by',
 'converting',
 'everything',
 'to',
 'lowercase',
 'Additionally',
 'stemming',
 'or',
 'lemmatization',
 'could',
 'be',
 'applied',
 'to',
 'words',
 'like',
 "'jumping'",
 "'foxes'",
 'and',
 "'dogs'",
 'to',
 'reduce',
 'them',
 'to',
 'their',
 'base',
 'forms',
 "'jump'",
 "'fox'",
 'and',
 "'do

# NLTK Tokenizer

In [14]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [15]:
def tokenize_df(df):
  df['tokenized_reviews'] = df['review'].apply(word_tokenize)
  return df

# Tokenize the reviews
df = tokenize_df(df)
print(df.head())

                                              review sentiment  \
0  one of the other reviewers has mentioned that ...  positive   
1  a wonderful little production. the filming tec...  positive   
2  i thought this was a wonderful way to spend ti...  positive   
3  basically there's a family where a little boy ...  negative   
4  petter mattei's "love in the time of money" is...  positive   

                                   tokenized_reviews  
0  [one, of, the, other, reviewers, has, mentione...  
1  [a, wonderful, little, production, ., the, fil...  
2  [i, thought, this, was, a, wonderful, way, to,...  
3  [basically, there, 's, a, family, where, a, li...  
4  [petter, mattei, 's, ``, love, in, the, time, ...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokenized_reviews'] = df['review'].apply(word_tokenize)


In [17]:
# using spacy

In [19]:
import spacy
tok=spacy.load('en_core_web_sm')

In [14]:
doc1=tok(sen1)
doc2=tok(sen2)

In [15]:
for token in doc2:
    print(token)

my
email
I.D
is
maratheharshal005@gmail.com


# 4. Stemming

In [16]:
from nltk.stem.porter import PorterStemmer

In [18]:


def stem_tokens(tokens):
  stemmer = PorterStemmer()
  stemmed_tokens = [stemmer.stem(token) for token in tokens]
  return stemmed_tokens
    
def stem_dataframe(df):
  df['stemmed_reviews'] = df['tokenized_reviews'].apply(stem_tokens)
  return df

# Stem the tokens
df = stem_dataframe(df)
df.iloc[0:30,3]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stemmed_reviews'] = df['tokenized_reviews'].apply(stem_tokens)


0     [one, of, the, other, review, ha, mention, tha...
1     [a, wonder, littl, product, ., the, film, tech...
2     [i, thought, thi, wa, a, wonder, way, to, spen...
3     [basic, there, 's, a, famili, where, a, littl,...
4     [petter, mattei, 's, ``, love, in, the, time, ...
5     [probabl, my, all-tim, favorit, movi, ,, a, st...
6     [i, sure, would, like, to, see, a, resurrect, ...
7     [thi, show, wa, an, amaz, ,, fresh, &, innov, ...
8     [encourag, by, the, posit, comment, about, thi...
9     [if, you, like, origin, gut, wrench, laughter,...
10    [phil, the, alien, is, one, of, those, quirki,...
11    [i, saw, thi, movi, when, i, wa, about, 12, wh...
12    [so, im, not, a, big, fan, of, boll, 's, work,...
13    [the, cast, play, shakespeare.shakespear, lost...
14    [thi, a, fantast, movi, of, three, prison, who...
15    [kind, of, drawn, in, by, the, erot, scene, ,,...
16    [some, film, just, simpli, should, not, be, re...
17    [thi, movi, made, it, into, one, of, my, t

# 5. lemmitization

In [15]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\marat\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [20]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def get_wordnet_pos(word):
  tag = nltk.pos_tag([word])[0][1][0].upper()
  tag_dict = {"J": wordnet.ADJ,
              "N": wordnet.NOUN, 
              "V": wordnet.VERB,
              "R": wordnet.ADV}
  return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_tokens(tokens):
  lemmatizer = WordNetLemmatizer()
  lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
  return lemmatized_tokens

def lemmatize_dataframe(df):
  df['lemmatized_reviews'] = df['tokenized_reviews'].apply(lemmatize_tokens)
  return df

# Lemmatize the tokens
df = lemmatize_dataframe(df[:100])
df.iloc[1:30,4]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemmatized_reviews'] = df['tokenized_reviews'].apply(lemmatize_tokens)


1     [a, wonderful, little, production, ., the, fil...
2     [i, thought, this, be, a, wonderful, way, to, ...
3     [basically, there, 's, a, family, where, a, li...
4     [petter, mattei, 's, ``, love, in, the, time, ...
5     [probably, my, all-time, favorite, movie, ,, a...
6     [i, sure, would, like, to, see, a, resurrectio...
7     [this, show, be, an, amaze, ,, fresh, &, innov...
8     [encourage, by, the, positive, comment, about,...
9     [if, you, like, original, gut, wrench, laughte...
10    [phil, the, alien, be, one, of, those, quirky,...
11    [i, saw, this, movie, when, i, be, about, 12, ...
12    [so, im, not, a, big, fan, of, boll, 's, work,...
13    [the, cast, played, shakespeare.shakespeare, l...
14    [this, a, fantastic, movie, of, three, prisone...
15    [kind, of, drawn, in, by, the, erotic, scene, ...
16    [some, film, just, simply, should, not, be, re...
17    [this, movie, make, it, into, one, of, my, top...
18    [i, remember, this, film, ,, it, be, the, 

In [15]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running thought and eating laughter at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations=["?:!.,';"]
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words :
    if word in punctuations:
        sentence_words.remove(word)

sentence_words

print("{0:20}{1:20}".format("Word","Lemma"))
for word in  sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word,pos='v')))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
thought             think               
and                 and                 
eating              eat                 
laughter            laughter            
at                  at                  
same                same                
time                time                
.                   .                   
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun             

# feature extraction
 

## OneHot Encoding

In [18]:
from sklearn.preprocessing import OneHotEncoder
import itertools

document = ["The","boy","sat","on","the","floor"]
#we have to convert these tokens to a dictionary with the key as word and value as position

tokens = [doc.split(" ") for doc in document]
token_chain = itertools.chain.from_iterable(tokens)
word_to_id = {token: idx for idx, token in enumerate(set(token_chain))}
#word_to_id is our required dictionary
#Get the corresponding values for each word
token_ids = [[word_to_id[token] for token in toke] for toke in tokens]

vec = OneHotEncoder(categories="auto")
V = vec.fit_transform(token_ids)
print(V.toarray())

[[0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]]


## Bag of words

In [21]:
import numpy as np
import pandas as pd

In [22]:
df=pd.DataFrame({'text':['student learn NLP','NLP learn NLP','student use BOW','NLP use BOW'],'output':[1,1,0,0]})
df

Unnamed: 0,text,output
0,student learn NLP,1
1,NLP learn NLP,1
2,student use BOW,0
3,NLP use BOW,0


In [23]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [24]:
#bag of words
bow=cv.fit_transform(df['text'])
#vocab
print("Vocabulory ")
print(cv.vocabulary_)
#for each document 
print("\nFor each document")
print(bow[0].toarray())
print(bow[1].toarray())
print(bow[2].toarray())
print(bow[3].toarray())

#for whole corpurs
print("\nFor corpus")
cv.transform(["campusx watch and write comment of campusx"]).toarray()

Vocabulory 
{'student': 3, 'learn': 1, 'nlp': 2, 'use': 4, 'bow': 0}

For each document
[[0 1 1 1 0]]
[[0 1 2 0 0]]
[[1 0 0 1 1]]
[[1 0 1 0 1]]

For corpus


array([[0, 0, 0, 0, 0]], dtype=int64)

## Term Frequency and Inverse Document Frequency (TF-IDF)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
text = ['student learn NLP','NLP learn NLP','student use BOW','NLP use BOW']
tf = TfidfVectorizer()
txt_fit = tf.fit(text)
txt_transform = txt_fit.transform(text)
idf = tf.idf_
print(dict(zip(txt_fit.get_feature_names_out(), idf)))


{'bow': 1.5108256237659907, 'learn': 1.5108256237659907, 'nlp': 1.2231435513142097, 'student': 1.5108256237659907, 'use': 1.5108256237659907}


## N-gram

In [26]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter


In [30]:
text = "This is a sample text for n-gram modeling. It is used to demonstrate the concept. It is a text classification technique in NLP"
words = word_tokenize(text)

import nltk
def generate_ngrams(words, n):
  ngrams = list(nltk.ngrams(words, n))
  return ngrams

bigrams = generate_ngrams(words, 2)
bigram_counts = Counter(bigrams)

total_bigrams = sum(bigram_counts.values())

def calculate_probability(bigram):
  return bigram_counts[bigram] / total_bigrams

# Example usage
probability_of_is_a = calculate_probability(('is', 'a'))
print(probability_of_is_a)

0.08333333333333333


In [31]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Your DataFrame
df = pd.DataFrame({'text':['student learn NLP','NLP learn NLP','student use BOW','NLP use BOW'],'output':[1,1,0,0]})

def generate_ngrams(text, n):
  words = word_tokenize(text)
  return list(ngrams(words, n))

# Example: Generate bigrams
df['bigrams'] = df['text'].apply(lambda x: generate_ngrams(x, 2))

print(df.head())


                text  output                           bigrams
0  student learn NLP       1  [(student, learn), (learn, NLP)]
1      NLP learn NLP       1      [(NLP, learn), (learn, NLP)]
2    student use BOW       0      [(student, use), (use, BOW)]
3        NLP use BOW       0          [(NLP, use), (use, BOW)]


# text classification 

# Naive Bayes classifier

In [32]:
data = pd.read_csv(r"C:\Users\marat\OneDrive\Desktop\2nd year\project\IMDB Dataset.csv")
data=data.iloc[0:100]
X = data.iloc[:,0:1]
y = data['sentiment']

In [57]:
data = pd.read_csv(r"C:\Users\marat\OneDrive\Desktop\2nd year\project\IMDB Dataset.csv")
data=data.iloc[0:100]
X = data.iloc[:,0:1]
y = data['sentiment']
from sklearn.preprocessing import LabelEncoder


encoder = LabelEncoder()
y = encoder.fit_transform(y)



from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

# Applying BoW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_bow,y_train)

y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))


# Define a function to preprocess and classify new text
def classify_text(text):
    # Transform new text to Bag-of-Words representation using the same CountVectorizer
    text_bow = cv.transform([text]).toarray()
    
    # Predict sentiment
    prediction = gnb.predict(text_bow)
    
    # Convert numeric prediction back to original label
    sentiment = encoder.inverse_transform(prediction)
    
    return sentiment[0]

# Example usage
new_text = """This a fantastic movie of three prisoners who become famous. One of the actors is george clooney and I'm not a fan but this roll is not bad. Another good thing about the movie is the soundtrack (The man of constant sorrow). I recommend this movie to everybody. Greetings Bart"""
sentiment = classify_text(new_text)
print(f"The sentiment of the new text is: {sentiment}")

0.8
[[10  1]
 [ 3  6]]
The sentiment of the new text is: positive


In [34]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.8
[[10  1]
 [ 3  6]]


In [61]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

# Define a function to preprocess and classify new text
def classify_text(text):
    # Transform new text to Bag-of-Words representation using the same CountVectorizer
    text_bow = cv.transform([text]).toarray()
    
    # Predict sentiment
    prediction = rf.predict(text_bow)
    
    # Convert numeric prediction back to original label
    sentiment = encoder.inverse_transform(prediction)
    
    return sentiment[0]

# Given text
new_text = """This a fantastic movie of three prisoners who become famous. One of the actors is george clooney and I'm not a fan but this roll is not bad. Another good thing about the movie is the soundtrack (The man of constant sorrow). I recommend this movie to everybody. Greetings Bart"""

# Get sentiment for the new text
sentiment = classify_text(new_text)
print(f"The sentiment of the new text is: {sentiment}")


0.75
[[11  0]
 [ 5  4]]
The sentiment of the new text is: positive


In [36]:
# maximum occuring fearures
cv = CountVectorizer(max_features=3000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.7
[[11  0]
 [ 6  3]]


In [37]:
#usinf TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review']).toarray()

#random forest
rf = RandomForestClassifier()
rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)
print("Random Forest")
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_tfidf,y_train)

y_pred = gnb.predict(X_test_tfidf)
print("Naive Bayes")
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

Random Forest
0.7
[[11  0]
 [ 6  3]]
Naive Bayes
0.8
[[9 2]
 [2 7]]


## logistic regression

In [59]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Load and preprocess the data
data = pd.read_csv(r"C:\Users\marat\OneDrive\Desktop\2nd year\project\IMDB Dataset.csv")
data = data.iloc[0:100]
X = data.iloc[:, 0:1]
y = data['sentiment']

# Encode labels
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Apply BoW
cv = CountVectorizer()
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_bow, y_train)

# Predict
y_pred = log_reg.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


# Define a function to preprocess and classify new text
def classify_text(text):
    # Transform new text to Bag-of-Words representation using the same CountVectorizer
    text_bow = cv.transform([text]).toarray()
    
    # Predict sentiment
    prediction = log_reg.predict(text_bow)
    
    # Convert numeric prediction back to original label
    sentiment = encoder.inverse_transform(prediction)
    
    return sentiment[0]

# Given text
new_text = """Kind of drawn in by the erotic scenes, only to realize this was one of the most amateurish and unbelievable bits of film I've ever seen. Sort of like a high school film project. What was Rosanna Arquette thinking?? And what was with all those stock characters in that bizarre supposed Midwest town? Pretty hard to get involved with this one. No lessons to be learned from it, no brilliant insights, just stilted and quite ridiculous (but lots of skin, if that intrigues you) videotaped nonsense....What was with the bisexual relationship, out of nowhere, after all the heterosexual encounters. And what was with that absurd dance, with everybody playing their stereotyped roles? Give this one a pass, it's like a million other miles of bad, wasted film, money that could have been spent on starving children or Aids in Africa....."""

# Get sentiment for the new text
sentiment = classify_text(new_text)
print(f"The sentiment of the new text is: {sentiment}")


Accuracy: 0.9
Confusion Matrix:
 [[10  1]
 [ 1  8]]
The sentiment of the new text is: negative


# entropy

In [62]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load and preprocess the data
data = pd.read_csv(r"C:\Users\marat\OneDrive\Desktop\2nd year\project\IMDB Dataset.csv")
data = data.iloc[0:100]
X = data.iloc[:, 0:1]
y = data['sentiment']

# Encode labels
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Apply BoW
cv = CountVectorizer()
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_bow, y_train)
y_pred_nb = nb.predict(X_test_bow)

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_bow, y_train)
y_pred_log_reg = log_reg.predict(X_test_bow)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(X_train_bow, y_train)
y_pred_rf = rf.predict(X_test_bow)

# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    print(f"{model_name} Accuracy:", accuracy_score(y_true, y_pred))
    print(f"{model_name} Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# Print results
evaluate_model(y_test, y_pred_nb, "Naive Bayes")
evaluate_model(y_test, y_pred_log_reg, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")


Naive Bayes Accuracy: 0.8
Naive Bayes Confusion Matrix:
 [[11  0]
 [ 4  5]]
Logistic Regression Accuracy: 0.9
Logistic Regression Confusion Matrix:
 [[10  1]
 [ 1  8]]
Random Forest Accuracy: 0.8
Random Forest Confusion Matrix:
 [[11  0]
 [ 4  5]]


In [39]:
from sklearn.metrics import log_loss
# Get predicted probabilities for the test set
y_prob = model.predict_proba(X_test_bow)
# Calculate the cross-entropy loss (entropy)
entropy = log_loss(y_test, y_prob)
print(f"Cross-Entropy Loss (Entropy): {entropy:.4f}")


Cross-Entropy Loss (Entropy): 0.2376


In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Tokenize the dataset
train_encodings = tokenizer(X_train_bow.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test_bow.tolist(), truncation=True, padding=True, max_length=128)

# Create PyTorch datasets
import torch
class MovieReviewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MovieReviewsDataset(train_encodings, y_train.tolist())
test_dataset = MovieReviewsDataset(test_encodings, y_test.tolist())

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()


OSError: [WinError 126] The specified module could not be found. Error loading "C:\python123\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.

In [6]:
import torch

print("PyTorch version:", torch.__version__)
x = torch.rand(5, 3)
print(x)


OSError: [WinError 126] The specified module could not be found. Error loading "C:\python123\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.