NLP Machine Learning Pipeline for Twitter Sentiment Analysis

In [None]:
import os
print(os.getcwd())

In [None]:
import nltk
print(nltk.__file__)

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

In [6]:
df=pd.read_csv('twitter_training.csv',header=None,names=['id','topic','label','review'])

In [10]:
df=pd.DataFrame(df)
df.isnull().sum()

id        0
topic     0
label     0
review    0
dtype: int64

In [11]:
df.head()
df.shape

(73996, 4)

In [8]:
df.dropna(inplace=True)

In [12]:
X=df['review']
y=df['label']

Cleaning text:

Lowercase

Remove punctuation

break sentence into words

remove stopwords

convert words into their base form

join text into a clean sentence

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

In [20]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokens=word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]
    # tokens = [ for word in tokens]
    return " ".join(tokens)


In [21]:
df['clean_review']=df['review'].apply(clean_text)

In [None]:
# remove empty rows
# df = df[df["clean_text"].str.strip() != ""]


In [22]:
print(df['clean_review'].head())

0    im getting borderland murder
1              coming border kill
2      im getting borderland kill
3     im coming borderland murder
4    im getting borderland murder
Name: clean_review, dtype: object


In [23]:
X_train,X_test,y_train,y_test=train_test_split(df['clean_review'],df['label'],test_size=0.25,random_state=42)

TF- Term Frequency: how many time a word repeated itself?

IDF- Inverse Document Frequency: how rare a word is?

Higher score to the words that are frequent but overall rare.

In [24]:
vectorizer=TfidfVectorizer()
X_train_vec=vectorizer.fit_transform(X_train)
X_test_vec=vectorizer.transform(X_test)

In [25]:
model=MultinomialNB()
model.fit(X_train_vec,y_train)
y_pred=model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test,y_pred))
print("Classification Report:\n", classification_report(y_test,y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test,y_pred))
print('F1 Score:', f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.7170657873398562
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.95      0.42      0.58      3348
    Negative       0.64      0.90      0.75      5477
     Neutral       0.84      0.62      0.71      4528
    Positive       0.69      0.80      0.74      5146

    accuracy                           0.72     18499
   macro avg       0.78      0.69      0.70     18499
weighted avg       0.76      0.72      0.71     18499

Confusion Matrix:
 [[1394  952  211  791]
 [  23 4909  165  380]
 [  27 1008 2822  671]
 [  18  808  180 4140]]
F1 Score: 0.7076071217200118


In [26]:
print("train_accuracy",accuracy_score(y_train,model.predict(X_train_vec)))
print("test_accuracy",accuracy_score(y_test,y_pred))

train_accuracy 0.7751590176045552
test_accuracy 0.7170657873398562


NLP Learning Tasks

In [None]:
!pip install nltk


In [17]:
corpus="""Hello Welcome, myself Mehnoor Fayyaz, a student of software engineering.
Currently in my final year and working on my Final Year Project. I'm
also learning Natural Language Processing and its applications."""
print(corpus)

Hello Welcome, myself Mehnoor Fayyaz, a student of software engineering.
Currently in my final year and working on my Final Year Project. I'm
also learning Natural Language Processing and its applications.


In [13]:
nltk.data.path.append("C:/Users/mehno/AppData/Roaming/nltk_data")


In [14]:
import nltk
print(nltk.data.path)

['C:\\Users\\mehno/nltk_data', 'c:\\Users\\mehno\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data', 'c:\\Users\\mehno\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data', 'c:\\Users\\mehno\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data', 'C:\\Users\\mehno\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data', 'C:/Users/mehno/AppData/Roaming/nltk_data', 'C:\\Users\\mehno\\AppData\\Roaming\\nltk_data', 'C:\\\\Users\\\\mehno\\\\AppData\\\\Roaming\\\\nltk_data', 'C:/Users/mehno/AppData/Roaming/nltk_data']


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehno\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [15]:
import nltk
nltk.data.path = []

# Add ONLY the correct NLTK folder
nltk.data.path.append(r"C:\Users\mehno\AppData\Roaming\nltk_data")

print("New paths:", nltk.data.path)

New paths: ['C:\\Users\\mehno\\AppData\\Roaming\\nltk_data']


In [19]:
import nltk

nltk.download('punkt', force=True, download_dir=r"C:\Users\mehno\AppData\Roaming\nltk_data")
nltk.download('punkt_tab', force=True, download_dir=r"C:\Users\mehno\AppData\Roaming\nltk_data")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehno\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mehno\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

Tokenization

In [21]:
#Tokenization
#sent tokenize: paragraphs --> sentences
from nltk.tokenize import sent_tokenize

def sent_tokenize_corpus(corpus,language='english'):
    return sent_tokenize(corpus, language=language) 

documents=sent_tokenize_corpus(corpus)
print(documents)


['Hello Welcome, myself Mehnoor Fayyaz, a student of software engineering.', 'Currently in my final year and working on my Final Year Project.', "I'm\nalso learning Natural Language Processing and its applications."]


In [23]:
from nltk.tokenize import word_tokenize
tokens=word_tokenize(corpus)

print(tokens)

['Hello', 'Welcome', ',', 'myself', 'Mehnoor', 'Fayyaz', ',', 'a', 'student', 'of', 'software', 'engineering', '.', 'Currently', 'in', 'my', 'final', 'year', 'and', 'working', 'on', 'my', 'Final', 'Year', 'Project', '.', 'I', "'m", 'also', 'learning', 'Natural', 'Language', 'Processing', 'and', 'its', 'applications', '.']


Stemming

In [30]:
words=['eating','eaten','eats','programming','programmed','programmer','program','finally','finalized','final']

In [33]:
from nltk.stem import PorterStemmer
stemming=PorterStemmer()

In [34]:
for word in words:
    print(word,'-->',stemming.stem(word))

eating --> eat
eaten --> eaten
eats --> eat
programming --> program
programmed --> program
programmer --> programm
program --> program
finally --> final
finalized --> final
final --> final


In [35]:
stemming.stem('congratulations')


'congratul'

Regular Expression can take a singular expression and remove any prefix and suffix that matches the expression 

In [37]:
from nltk.stem import RegexpStemmer
reg_stemmer=RegexpStemmer('ing$|ed$|s$|er$|ly$|al$', min=4)

reg_stemmer.stem('congratulations')


'congratulation'

Snowball Stemmer

In [38]:
from nltk.stem import SnowballStemmer
stemmer=SnowballStemmer('english')

for w in words:
    print(w,'-->',stemmer.stem(w))

eating --> eat
eaten --> eaten
eats --> eat
programming --> program
programmed --> program
programmer --> programm
program --> program
finally --> final
finalized --> final
final --> final


In [39]:
stemmer.stem('fairly')

'fair'

Lemmatization 

word into its root word instead of stem; a valid word is returned

In [44]:
from nltk.stem import WordNetLemmatizer
lemma=WordNetLemmatizer()

# n-Noun
#a-Adjective
#verb-v
#adverb-r


print(lemma.lemmatize('congratulations',pos='a'))
print(lemma.lemmatize('congratulations',pos='n'))
print(lemma.lemmatize('congratulations',pos='v'))




congratulations
congratulation
congratulations


Stopwords

In [46]:
paragraph=""" Pakistan,[f] officially the Islamic Republic of Pakistan,[g] is a country in South Asia. It is the fifth-most populous country, with a population of over 241.5 million,[d] having the second-largest Muslim population as of 2023. Islamabad is the nation's capital, while Karachi is its largest city and financial centre. Pakistan is the 33rd-largest country by area. Bounded by the Arabian Sea on the south, the Gulf of Oman on the southwest, and the Sir Creek on the southeast, it shares land borders with India to the east; Afghanistan to the west; Iran to the southwest; and China to the northeast. It shares a maritime border with Oman in the Gulf of Oman, and is separated from Tajikistan in the northwest by Afghanistan's narrow Wakhan Corridor.

Pakistan is the site of several ancient cultures, including the 8,500-year-old Neolithic site of Mehrgarh in Balochistan, the Indus Valley Civilisation of the Bronze Age,[8] and the ancient Gandhara civilisation.[9] The regions that compose the modern state of Pakistan were the realm of multiple empires and dynasties, including the Achaemenid, the Maurya, the Kushan, the Gupta;[10] the Umayyad Caliphate in its southern regions, the Hindu Shahis, the Ghaznavids, the Delhi Sultanate, the Samma, the Shah Miris, the Mughals,[11] and finally, the British Raj from 1858 to 1947.

Spurred by the Pakistan Movement, which sought a homeland for the Muslims of British India, and election victories in 1946 by the All-India Muslim League, Pakistan gained independence in 1947 after the partition of British India, which awarded separate statehood to its Muslim-majority regions and was accompanied by an unparalleled mass migration and loss of life.[12][13] Initially a Dominion of the British Commonwealth, Pakistan officially drafted its constitution in 1956, and emerged as a declared Islamic republic. In 1971, the exclave of East Pakistan seceded as the new country of Bangladesh after a nine-month-long civil war. In the following four decades, Pakistan has been ruled by governments that alternated between civilian and military, democratic and authoritarian, relatively secular and Islamist.
"""

In [47]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

stemmer=PorterStemmer()
doc=nltk.sent_tokenize(paragraph)


In [50]:
for i in range(len(doc)):
    words=nltk.word_tokenize(doc[i])
    words=[stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    doc[i]=" ".join(words)

print(doc)    

['pakistan , [ f ] offici islam republ pakistan , [ g ] countri south asia .', 'it fifth-most popul countri , popul 241.5 million , [ ] second-largest muslim popul 2023 .', "islamabad nation 's capit , karachi largest citi financi centr .", 'pakistan 33rd-largest countri area .', 'bound arabian sea south , gulf oman southwest , sir creek southeast , share land border india east ; afghanistan west ; iran southwest ; china northeast .', "it share maritim border oman gulf oman , separ tajikistan northwest afghanistan 's narrow wakhan corridor .", 'pakistan site sever ancient cultur , includ 8,500-year-old neolith site mehrgarh balochistan , indu valley civilis bronz age , [ 8 ] ancient gandhara civilis .', '[ 9 ] the region compos modern state pakistan realm multipl empir dynasti , includ achaemenid , maurya , kushan , gupta ; [ 10 ] umayyad caliph southern region , hindu shahi , ghaznavid , delhi sultan , samma , shah miri , mughal , [ 11 ] final , british raj 1858 1947 .', 'spur pakista

Parts of Speech Tagging

In [52]:
import nltk
nltk.download('averaged_perceptron_tagger_eng',force=True, download_dir=r"C:\Users\mehno\AppData\Roaming\nltk_data")


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\mehno\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [53]:
for i in range(len(doc)):
    words=nltk.word_tokenize(doc[i])
    words=[word for word in words if word not in set(stopwords.words('english'))]
    # doc[i]=" ".join(words)

pos=nltk.pos_tag(words)
print(pos)

[('follow', 'VB'), ('four', 'CD'), ('decad', 'NN'), (',', ','), ('pakistan', 'JJ'), ('rule', 'NN'), ('govern', 'JJ'), ('altern', 'JJ'), ('civilian', 'JJ'), ('militari', 'NN'), (',', ','), ('democrat', 'NN'), ('authoritarian', 'JJ'), (',', ','), ('rel', 'JJ'), ('secular', 'JJ'), ('islamist', 'NN'), ('.', '.')]


In [71]:
sentence="""Pakistan,[f] officially the Islamic Republic of Pakistan,[g] is a country in South Asia. It is the fifth-most populous country, with a population of over 241.5 million,[d] having the second-largest Muslim population as of 2023. Islamabad is the nation's capital, while Karachi is its largest city and financial centre. Pakistan is the 33rd-largest country by area. Bounded by the Arabian Sea on the south, the Gulf of Oman on the southwest, and the Sir Creek on the southeast, it shares land borders with India to the east; Afghanistan to the west; Iran to the southwest; and China to the northeast. 
"""

POS tagging assigns a part of speech to each word in a sentence.

Basically, it labels whether a word is a noun, verb, adjective, adverb, etc.

In [68]:
for i in word:
    tag_elements=nltk.pos_tag([i])
    print(tag_elements)

[('Pakistan,[f]', 'NN')]
[('officially', 'RB')]
[('the', 'DT')]
[('Islamic', 'NNP')]
[('Republic', 'JJ')]
[('of', 'IN')]
[('Pakistan,[g]', 'NN')]
[('is', 'VBZ')]
[('a', 'DT')]
[('country', 'NN')]
[('in', 'IN')]
[('South', 'NNP')]
[('Asia.', 'NN')]
[('It', 'PRP')]
[('is', 'VBZ')]
[('the', 'DT')]
[('fifth-most', 'NN')]
[('populous', 'JJ')]
[('country,', 'NN')]
[('with', 'IN')]
[('a', 'DT')]
[('population', 'NN')]
[('of', 'IN')]
[('over', 'IN')]
[('241.5', 'CD')]
[('million,[d]', 'NN')]
[('having', 'VBG')]
[('the', 'DT')]
[('second-largest', 'NN')]
[('Muslim', 'NN')]
[('population', 'NN')]
[('as', 'IN')]
[('of', 'IN')]
[('2023.', 'CD')]
[('Islamabad', 'NN')]
[('is', 'VBZ')]
[('the', 'DT')]
[("nation's", 'NN')]
[('capital,', 'NN')]
[('while', 'IN')]
[('Karachi', 'NNP')]
[('is', 'VBZ')]
[('its', 'PRP$')]
[('largest', 'JJS')]
[('city', 'NN')]
[('and', 'CC')]
[('financial', 'JJ')]
[('centre.', 'NN')]
[('Pakistan', 'NN')]
[('is', 'VBZ')]
[('the', 'DT')]
[('33rd-largest', 'JJ')]
[('country', 'N

In [75]:
word=nltk.word_tokenize(sentence)

In [76]:
tagged=nltk.pos_tag(words)

In [65]:
import nltk
nltk.download('maxent_ne_chunker_tab',force=True, download_dir=r"C:\Users\mehno\AppData\Roaming\nltk_data")

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\mehno\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker_tab.zip.


True

In [79]:
nltk.download('words', download_dir=r"C:\Users\mehno\AppData\Roaming\nltk_data")


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\mehno\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

NER finds important “named” things in text, like:

Person names → "Elon Musk"

Organizations → "NASA"

Locations → "Paris"

Dates → "January 1, 2025"

Named Entity Recognition

In [80]:
nltk.ne_chunk(tagged).draw()

One Hot Encoding

In [2]:
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()

labels=['positive','negative','neutral','positive','negative']
x=label.fit_transform(labels)
print(x)

[2 0 1 2 0]


Embeddings

In [3]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.5.0-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.4.0-cp311-cp311-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   --- ------------------------------------ 2.4/24.4 MB 9.6 MB/s eta 0:00:03
   ----- ---------------------------------- 3.1/24.4 MB 4.4 MB/s eta 0:00:05
   ------ --------------------------------- 4.2/24.4 MB 2.0 MB/s eta 0:00:11
   --------- ------------------------------ 5.8/24.4 MB 2.3 MB/s eta 0:00:08
   ----------- ---------------------------- 7.1/24.4 MB 2.5 MB/s eta 0:00:07
   --------------- ------------------------ 9.2/24.4 MB 3.0 MB/s eta 0:00:06
   ------------------ --------------------- 11.5/24.4 MB 3.3 MB/s eta 0:00:04
   ------------------- -------------------- 12.1/24.4 MB 3.2 MB/s eta 0:00:04
   -------------------- ------------------- 12.6/24.4 


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [2]:
import gensim.downloader as api
wv=api.load('word2vec-google-news-300')
vec=wv['cricket']
print(vec)

[-3.67187500e-01 -1.21582031e-01  2.85156250e-01  8.15429688e-02
  3.19824219e-02 -3.19824219e-02  1.34765625e-01 -2.73437500e-01
  9.46044922e-03 -1.07421875e-01  2.48046875e-01 -6.05468750e-01
  5.02929688e-02  2.98828125e-01  9.57031250e-02  1.39648438e-01
 -5.41992188e-02  2.91015625e-01  2.85156250e-01  1.51367188e-01
 -2.89062500e-01 -3.46679688e-02  1.81884766e-02 -3.92578125e-01
  2.46093750e-01  2.51953125e-01 -9.86328125e-02  3.22265625e-01
  4.49218750e-01 -1.36718750e-01 -2.34375000e-01  4.12597656e-02
 -2.15820312e-01  1.69921875e-01  2.56347656e-02  1.50146484e-02
 -3.75976562e-02  6.95800781e-03  4.00390625e-01  2.09960938e-01
  1.17675781e-01 -4.19921875e-02  2.34375000e-01  2.03125000e-01
 -1.86523438e-01 -2.46093750e-01  3.12500000e-01 -2.59765625e-01
 -1.06933594e-01  1.04003906e-01 -1.79687500e-01  5.71289062e-02
 -7.41577148e-03 -5.59082031e-02  7.61718750e-02 -4.14062500e-01
 -3.65234375e-01 -3.35937500e-01 -1.54296875e-01 -2.39257812e-01
 -3.73046875e-01  2.27355