# Feature Extraction from texts

In [1]:
# count vectorizor

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd


vectorizer = CountVectorizer()

document = [
    'I love natural language processing.',
    'Count vectorization is useful to nlp',
    'nlp  allows machines to understand human language.',
]

X = vectorizer.fit_transform(document)

columns = vectorizer.get_feature_names_out()

df = pd.DataFrame(X.toarray(), columns=columns)

print("Vocabulary : ", vectorizer.vocabulary_)
print()
df

Vocabulary :  {'love': 5, 'natural': 7, 'language': 4, 'processing': 9, 'count': 1, 'vectorization': 13, 'is': 3, 'useful': 12, 'to': 10, 'nlp': 8, 'allows': 0, 'machines': 6, 'understand': 11, 'human': 2}



Unnamed: 0,allows,count,human,is,language,love,machines,natural,nlp,processing,to,understand,useful,vectorization
0,0,0,0,0,1,1,0,1,0,1,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,1,0,1,1
2,1,0,1,0,1,0,1,0,1,0,1,1,0,0


In [2]:
import urllib3
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Geoffrey_Hinton"

http = urllib3.PoolManager()

response = http.request('GET', url)

soup = BeautifulSoup(response.data, 'html.parser')

text = soup.get_text()

with open('Geoffrey_Hinton_article.txt', 'w', encoding='utf-8') as f:
    f.write(text)

print("Article saved to Geoffrey_Hinton_article.txt")

Article saved to Geoffrey_Hinton_article.txt


In [3]:
import spacy
import re

nlp = spacy.load("en_core_web_sm")

with open('Geoffrey_Hinton_article.txt', encoding='utf-8') as f:
    text = f.read()

# text = text.replace("\n", " ")

# text = re.sub(r'[^a-zA-Z\s]', ' ', text)


text = text.lower()

doc = nlp(text)

sentencizer = nlp.add_pipe("sentencizer")

doc = nlp(text)

document = [sent.text for sent in doc.sents ]

document[5]

'after having difficulty getting funding in britain,[40] he worked in the us at the university of california, san diego and carnegie mellon university.[37] he was the founding director of the gatsby charitable foundation computational neuroscience unit at university college london.[37]'

In [4]:
X = vectorizer.fit_transform(document)

columns = vectorizer.get_feature_names_out()

df = pd.DataFrame(X.toarray(), columns=columns)

print("Vocabulary : ", vectorizer.vocabulary_)
print()
df

Vocabulary :  {'geoffrey': 1087, 'hinton': 1202, 'wikipedia': 2409, 'jump': 1353, 'to': 2254, 'content': 733, 'main': 1502, 'menu': 1559, 'move': 1605, 'sidebar': 2068, 'hide': 1195, 'navigation': 1628, 'pagecontentscurrent': 1713, 'eventsrandom': 964, 'articleabout': 442, 'wikipediacontact': 2410, 'us': 2309, 'contribute': 739, 'helplearn': 1180, 'editcommunity': 896, 'portalrecent': 1788, 'changesupload': 652, 'filespecial': 1014, 'pages': 1715, 'search': 2037, 'appearance': 421, 'donate': 869, 'create': 766, 'account': 332, 'log': 1475, 'in': 1251, 'personal': 1752, 'tools': 2262, 'for': 1026, 'logged': 1476, 'out': 1702, 'editors': 898, 'learn': 1435, 'more': 1595, 'contributionstalk': 741, 'contents': 735, 'top': 2263, 'education': 904, 'career': 620, 'and': 408, 'research': 1921, 'toggle': 2258, 'subsection': 2157, 'honours': 1211, 'awards': 477, 'views': 2338, 'risks': 1947, 'of': 1676, 'artificial': 445, 'intelligence': 1281, 'existential': 972, 'risk': 1946, 'from': 1055, 'agi

Unnamed: 0,0006,003,0080,01,013,0362,06,09,09829,10,...,zoubin,álvarez,özlem,şahin,беларускаябългарскиcatalàčeštinadanskdeutschespañolesperantoeuskaraفارسیfrançaisgaeilgegalego한국어հայերենbahasa,رlatinalietuviųمصرىnederlands日本語norsk,سرائیکیshqipsimple,तम,ನಡქართულიکٲش,ไทยtürkçeукраїнськаاردوئۇيغۇرچە
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,1,1,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
552,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
553,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

document = [
    'I love natural language processing.',
    'Count vectorization is useful to nlp',
    'nlp  allows machines to understand human language.',
]

vectorizer = TfidfVectorizer(stop_words='english',
                             lowercase = True,
                             ngram_range=(1,1),
                             norm = 'l2')

X = vectorizer.fit_transform(document)

df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

print("\nTF-IDF Matrix : \n")
df



TF-IDF Matrix : 



Unnamed: 0,allows,count,human,language,love,machines,natural,nlp,processing,understand,useful,vectorization
0,0.0,0.0,0.0,0.40204,0.528635,0.0,0.528635,0.0,0.528635,0.0,0.0,0.0
1,0.0,0.528635,0.0,0.0,0.0,0.0,0.0,0.40204,0.0,0.0,0.528635,0.528635
2,0.440362,0.0,0.440362,0.334907,0.0,0.440362,0.0,0.334907,0.0,0.440362,0.0,0.0


In [6]:
# Sentiment Analysis

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [7]:
texts = [
    "I love this product",
    "This is amazing",
    "Best thing I have ever used",
    "I hate it",
    "Worst experience ever",
    "this is bad",
    "Not great",
    "Quite good overall",
    "Terrible and disappointing",
    "Absolutely fantastic",
    "I love this",
    "I hate this",
    "Amazing work",
    "Terrible mistake",
    "I am so happy",
    "I am very sad",
    "Not good at all",
    "Absolutely fantastic",
    "Worst thing ever",
    "best day ever",
    "Awful experience",
    "Superb performance",
    "I don't think it is nice"
]

In [8]:
labels = [1,1,1,0,0,0,0,1,0,1,1,0,1,0,1,0,0,1,0,1,0,1,0]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(texts, labels,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    stratify=labels)

In [10]:
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [11]:
nb = MultinomialNB()
nb.fit(X_train_cv, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [12]:
y_pred = nb.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      1.00      0.73         4
           1       0.00      0.00      0.00         3

    accuracy                           0.57         7
   macro avg       0.29      0.50      0.36         7
weighted avg       0.33      0.57      0.42         7



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [13]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm

array([[4, 0],
       [3, 0]])

In [14]:
new_sentences = ["This is awesome!","I don't like this","Could be better"]

new_vectors = cv.transform(new_sentences)

predictions = nb.predict(new_vectors)

for sent, label in zip(new_sentences, predictions):
    print(f"Sentence: {sent} -> Sentiment -> {'Positive' if label == 1 else 'Negative'}")

Sentence: This is awesome! -> Sentiment -> Positive
Sentence: I don't like this -> Sentiment -> Negative
Sentence: Could be better -> Sentiment -> Negative
