
# Лабораторна робота 12
## Основи обробки природної мови (NLP)
**Мета:** Познайомитися з базовими техніками NLP, такими як токенізація, лемматизація, векторизація тексту, а також навчитися застосовувати прості моделі для класифікації текстів.


In [1]:
import nltk
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexeipavlenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexeipavlenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexeipavlenko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from sklearn.datasets import fetch_20newsgroups

categories = ['sci.med', 'sci.space']
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)
data = pd.DataFrame({'text': dataset.data, 'target': dataset.target})

data.head()


Unnamed: 0,text,target
0,From: menon@boulder.Colorado.EDU (Ravi or Dean...,0
1,From: pjc@jet.uk (Peter J Card)\nSubject: Re: ...,1
2,From: arthurc@sfsuvax1.sfsu.edu (Arthur Chandl...,1
3,From: mrf4276@egbsun12.NoSubdomain.NoDomain (M...,1
4,From: sysmgr@king.eng.umd.edu (Doug Mohney)\nS...,1


In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    try:
        text = text.lower()
        tokens = word_tokenize(text) 
        tokens = [word for word in tokens if word not in string.punctuation and word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens] 
        return ' '.join(tokens)
    except LookupError:
        return text 

data['clean_text'] = data['text'].apply(preprocess_text)

if 'clean_text' in data.columns:
    print(data[['text', 'clean_text']].head())
else:
    print("Error: Column 'clean_text' was not created.")


                                                text  \
0  From: menon@boulder.Colorado.EDU (Ravi or Dean...   
1  From: pjc@jet.uk (Peter J Card)\nSubject: Re: ...   
2  From: arthurc@sfsuvax1.sfsu.edu (Arthur Chandl...   
3  From: mrf4276@egbsun12.NoSubdomain.NoDomain (M...   
4  From: sysmgr@king.eng.umd.edu (Doug Mohney)\nS...   

                                          clean_text  
0  from: menon@boulder.colorado.edu (ravi or dean...  
1  from: pjc@jet.uk (peter j card)\nsubject: re: ...  
2  from: arthurc@sfsuvax1.sfsu.edu (arthur chandl...  
3  from: mrf4276@egbsun12.nosubdomain.nodomain (m...  
4  from: sysmgr@king.eng.umd.edu (doug mohney)\ns...  


In [6]:

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['clean_text'])
y = data['target']

X.shape


(1977, 32977)

In [7]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

nb_accuracy = accuracy_score(y_test, nb_pred)
print(f"Naive Bayes Accuracy: {nb_accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, nb_pred))
print("Classification Report:")
print(classification_report(y_test, nb_pred))


Naive Bayes Accuracy: 0.9772727272727273
Confusion Matrix:
[[189   4]
 [  5 198]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       193
           1       0.98      0.98      0.98       203

    accuracy                           0.98       396
   macro avg       0.98      0.98      0.98       396
weighted avg       0.98      0.98      0.98       396



In [9]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, lr_pred))
print("Classification Report:")
print(classification_report(y_test, lr_pred))


Logistic Regression Accuracy: 0.9646464646464646
Confusion Matrix:
[[191   2]
 [ 12 191]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       193
           1       0.99      0.94      0.96       203

    accuracy                           0.96       396
   macro avg       0.97      0.97      0.96       396
weighted avg       0.97      0.96      0.96       396

