<a href="https://colab.research.google.com/github/JennyFrost/LLMs/blob/main/LogReg_%26_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def read_infile(infile):
    """Read a labelled corpus stored as one ``<label> <sentence>`` entry per line.

    The file is decoded as Windows-1251. Lines that contain only whitespace
    are skipped. The first whitespace-separated token of a line is its label;
    the remaining tokens, re-joined with single spaces, form the sentence.

    Returns:
        A ``(sents, labels)`` pair of parallel lists of strings.
    """
    sents, labels = [], []
    with open(infile, "r", encoding="Windows-1251") as fin:
        for raw_line in fin:
            tokens = raw_line.split()
            if not tokens:
                # Blank (or whitespace-only) line.
                continue
            label, *words = tokens
            labels.append(label)
            sents.append(' '.join(words))
    return sents, labels

In [None]:
# Read the train split first, then the test split; each call yields (sentences, labels).
(train_data, train_labels), (test_data, test_labels) = (
    read_infile(path) for path in ("train.txt", "test.txt")
)
# Quick sanity check: split sizes, the set of distinct labels, one sample sentence.
print(len(train_labels), len(test_labels))
print(set(train_labels))
print(train_data[2])

5452 500
{'ENTY:lang', 'ENTY:symbol', 'LOC:state', 'ENTY:body', 'ENTY:cremat', 'ENTY:dismed', 'NUM:money', 'LOC:mount', 'NUM:ord', 'NUM:date', 'LOC:other', 'ENTY:product', 'NUM:weight', 'ABBR:abb', 'LOC:country', 'ENTY:techmeth', 'ENTY:religion', 'ENTY:color', 'ENTY:sport', 'ABBR:exp', 'ENTY:other', 'NUM:count', 'NUM:code', 'ENTY:plant', 'ENTY:veh', 'DESC:manner', 'HUM:desc', 'HUM:ind', 'NUM:period', 'HUM:gr', 'HUM:title', 'LOC:city', 'NUM:perc', 'ENTY:event', 'DESC:def', 'ENTY:animal', 'NUM:speed', 'ENTY:instru', 'DESC:reason', 'ENTY:word', 'NUM:dist', 'ENTY:letter', 'DESC:desc', 'ENTY:substance', 'ENTY:termeq', 'ENTY:currency', 'NUM:temp', 'ENTY:food', 'NUM:volsize', 'NUM:other'}
How can I find a list of celebrities ' real names ?


In [None]:
import spacy

# Load the spaCy pipeline "en_core_web_trf" with its "parser" and "ner" components disabled.
nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])

In [None]:
def normalize_sent(data):
    """Turn a sentence, or a list of sentences, into lists of lemmas.

    * ``list`` input: the items are run through ``nlp.pipe`` and each result
      is normalized in turn; a list of lemma lists is returned.
    * ``str`` input: the string is processed with ``nlp`` first.
    * anything else is treated as an already processed sentence, i.e. an
      iterable of tokens exposing ``lemma_`` and ``text``.

    A token whose lemma is the placeholder ``"-PRON-"`` contributes its
    lower-cased surface text instead of the lemma.
    """
    if isinstance(data, list):
        return [normalize_sent(doc) for doc in list(nlp.pipe(data))]

    processed_sent = nlp(data) if isinstance(data, str) else data

    lemmas = []
    for token in processed_sent:
        if token.lemma_ == "-PRON-":
            lemmas.append(token.text.lower())
        else:
            lemmas.append(token.lemma_)
    return lemmas

In [None]:
# Replace the raw sentences with their lemma lists (train first, then test).
# NOTE: this rebinds train_data/test_data, so the cell is not meant to be run twice in a row.
train_data, test_data = map(normalize_sent, (train_data, test_data))
print(test_data[13])

['who', 'be', 'the', 'first', 'man', 'to', 'fly', 'across', 'the', 'Pacific', 'Ocean', '?']


In [None]:
from collections import defaultdict

def remove_punct(sent):
    """Drop punctuation tokens and empty strings from a tokenized sentence.

    Args:
        sent: iterable of string tokens.

    Returns:
        A new list with the tokens that are kept, in their original order.
        A token is dropped when it occurs as a substring of the punctuation
        string below, which covers every single listed character and the
        empty string. Characters that are not listed (e.g. ``?`` or ``'``)
        are kept.
    """
    # Same characters as the previous literal, but with the backslashes
    # written as "\\": the old spelling relied on the invalid escape
    # sequences "\(" and "\`", which Python reports as a DeprecationWarning
    # (a SyntaxWarning since 3.12).
    punctuation = ",.\\()„\"«»;:\\`"
    # "" is a substring of every string, so empty tokens are removed by the
    # same test and no second filtering pass is needed.
    return [word for word in sent if word not in punctuation]

def get_ngrams(sent, ngram_length=3, to_lower=False):
    """Count the word n-grams of orders 1..``ngram_length`` in one sentence.

    Args:
        sent: the sentence, normally a list of string tokens. A plain string
            is also accepted and is then handled character by character, as
            before.
        ngram_length: largest n-gram order to produce; it is capped by the
            number of tokens left after punctuation removal.
        to_lower: lower-case the sentence before extracting n-grams.

    Returns:
        A ``defaultdict(int)`` mapping each n-gram (tokens joined by a single
        space) to the number of times it occurs in the sentence.
    """
    if to_lower:
        # A token list has no .lower(); lower-case it token by token.
        # Strings keep the previous whole-string behaviour.
        if isinstance(sent, str):
            sent = sent.lower()
        else:
            sent = [word.lower() for word in sent]
    sent = remove_punct(sent)
    ngrams = defaultdict(int)
    for curr_ngram_length in range(1, min(ngram_length, len(sent)) + 1):
        # `end` is the exclusive right boundary of the current window.
        for end in range(curr_ngram_length, len(sent) + 1):
            start = end - curr_ngram_length
            ngrams[' '.join(sent[start:end])] += 1
    return ngrams

In [None]:
import tqdm

class DataProcessor:
    """Bag-of-n-grams vectorizer that produces dense count vectors.

    ``fit`` builds a sorted vocabulary of the n-grams that occur in at least
    ``min_count`` sentences; ``transform`` maps sentences to per-n-gram
    count vectors over that vocabulary.
    """

    def __init__(self, ngram_length=3, min_count=1):
        """Store the maximal n-gram order and the minimal sentence frequency."""
        self.ngram_length = ngram_length
        self.min_count = min_count

    def fit(self, data):
        """Build the n-gram vocabulary from an iterable of tokenized sentences.

        Sets ``self.ngrams`` (sorted list of kept n-grams) and
        ``self.ngram_codes`` (n-gram -> column index), prints the vocabulary
        size and returns ``self``.
        """
        # Import the submodule explicitly: a bare `import tqdm` does not load
        # `tqdm.notebook`, so `tqdm.notebook.tqdm` only worked when some other
        # library had already imported it.
        from tqdm.notebook import tqdm as notebook_tqdm

        ngram_counts = defaultdict(int)
        for sent in notebook_tqdm(data):
            sent_ngram_counts = get_ngrams(sent, ngram_length=self.ngram_length)
            # Each n-gram is counted once per sentence, whatever its
            # multiplicity inside that sentence.
            for ngram in sent_ngram_counts:
                ngram_counts[ngram] += 1
        self.ngrams = sorted(ngram for ngram, count in ngram_counts.items() if count >= self.min_count)
        self.ngram_codes = {ngram: i for i, ngram in enumerate(self.ngrams)}
        print("{} энграмм в словаре.".format(len(self.ngrams)))
        return self

    def transform(self, data):
        """Return one dense count vector (a list) per sentence in ``data``."""
        return [self.transform_sent(sent) for sent in data]

    def transform_sent(self, sent):
        """Return the list of n-gram counts of ``sent`` over the fitted vocabulary.

        N-grams that are not in the vocabulary are ignored.
        """
        ngrams = get_ngrams(sent, ngram_length=self.ngram_length)
        answer = [0] * len(self.ngrams)
        for ngram, count in ngrams.items():
            code = self.ngram_codes.get(ngram)
            if code is not None:
                answer[code] = count
        return answer

In [None]:
from scipy.sparse import csr_matrix

class SparseDataProcessor(DataProcessor):
    """``DataProcessor`` variant whose ``transform`` returns a sparse matrix."""

    def transform(self, data):
        """Vectorize ``data`` into a ``csr_matrix`` of n-gram counts.

        The result has one row per sentence and one column per vocabulary
        n-gram (``len(self.ngram_codes)`` columns). N-grams outside the
        vocabulary are ignored.
        """
        # Import the submodule explicitly: a bare `import tqdm` does not load
        # `tqdm.notebook`, so `tqdm.notebook.tqdm` only worked when some other
        # library had already imported it.
        from tqdm.notebook import tqdm as notebook_tqdm

        values, rows, columns = [], [], []
        for i, sent in enumerate(notebook_tqdm(data)):
            ngrams = get_ngrams(sent, ngram_length=self.ngram_length)
            for ngram, count in ngrams.items():
                code = self.ngram_codes.get(ngram)
                if code is not None:
                    # COO-style triplets: value at (row i, column code).
                    values.append(count)
                    rows.append(i)
                    columns.append(code)
        answer = csr_matrix((values, (rows, columns)), shape=(len(data), len(self.ngram_codes)))
        return answer

In [None]:
# fit() returns the processor itself, so construction and fitting are chained.
data_processor = SparseDataProcessor(ngram_length=3, min_count=3).fit(train_data)
# Vectorize the train split first, then the test split, with the fitted vocabulary.
X_train, X_test = (data_processor.transform(split) for split in (train_data, test_data))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5452.0), HTML(value='')))


5590 энграмм в словаре.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5452.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=500.0), HTML(value='')))




## *Logistic regression*

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the sparse n-gram count matrix, with the iteration cap set to 500.
cls = LogisticRegression(max_iter=500)
# Last expression of the cell: fit() is called for its effect and its result is displayed.
cls.fit(X_train, train_labels)

LogisticRegression(max_iter=500)

In [None]:
pred_labels = cls.predict(X_test)
# First 20 gold labels followed by the first 20 predictions, one sequence per line.
print(test_labels[:20], pred_labels[:20], sep="\n")

['NUM:dist', 'LOC:city', 'HUM:desc', 'DESC:def', 'NUM:date', 'NUM:dist', 'HUM:gr', 'ENTY:plant', 'DESC:reason', 'DESC:def', 'LOC:city', 'HUM:ind', 'NUM:weight', 'HUM:ind', 'NUM:date', 'NUM:other', 'ENTY:substance', 'HUM:ind', 'DESC:def', 'NUM:date']
['NUM:dist' 'LOC:other' 'HUM:desc' 'DESC:def' 'NUM:date' 'NUM:dist'
 'HUM:gr' 'DESC:def' 'DESC:reason' 'DESC:def' 'LOC:city' 'HUM:ind'
 'NUM:weight' 'HUM:ind' 'NUM:date' 'NUM:period' 'ENTY:other' 'HUM:ind'
 'DESC:def' 'NUM:date']


In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

accuracy = accuracy_score(test_labels, pred_labels)
print("Корректность: {:.2f}".format(100 * accuracy))
# Ask for the scores of exactly cls.classes_, in that order. Without `labels`
# the arrays only cover the labels that occur in test_labels/pred_labels, so
# zipping them with cls.classes_ attached scores to the wrong class names.
# zero_division=0 reports 0 for classes that have no test samples/predictions.
prec, rec, f1, sup = precision_recall_fscore_support(
    test_labels, pred_labels, labels=cls.classes_, zero_division=0
)
# Precision, recall and F1 are printed in the same per-class layout.
for title, scores in (("Точность:", prec), ("Полнота:", rec), ("F1-мера:", f1)):
    print(title)
    for label, x in zip(cls.classes_, scores):
        print("{}: {:.2f}".format(label, 100*x), end="\n")
    print("")

Корректность: 80.00
Точность:
ABBR:abb: 100.00
ABBR:exp: 100.00
DESC:def: 77.85
DESC:desc: 50.00
DESC:manner: 40.00
DESC:reason: 100.00
ENTY:animal: 100.00
ENTY:body: 100.00
ENTY:color: 100.00
ENTY:cremat: 0.00
ENTY:currency: 100.00
ENTY:dismed: 0.00
ENTY:event: 0.00
ENTY:food: 100.00
ENTY:instru: 100.00
ENTY:lang: 100.00
ENTY:letter: 29.41
ENTY:other: 100.00
ENTY:plant: 0.00
ENTY:product: 100.00
ENTY:religion: 71.43
ENTY:sport: 100.00
ENTY:substance: 53.85
ENTY:symbol: 100.00
ENTY:techmeth: 75.00
ENTY:termeq: 60.00
ENTY:veh: 83.08
ENTY:word: 0.00
HUM:desc: 100.00
HUM:gr: 100.00
HUM:ind: 100.00
HUM:title: 79.25
LOC:city: 71.43
LOC:country: 81.82
LOC:mount: 100.00
LOC:other: 100.00
LOC:state: 0.00
NUM:code: 100.00
NUM:count: 100.00
NUM:date: 70.00
NUM:dist: 100.00
NUM:money: 100.00
NUM:ord: 100.00

Полнота:
ABBR:abb: 100.00
ABBR:exp: 75.00
DESC:def: 100.00
DESC:desc: 71.43
DESC:manner: 100.00
DESC:reason: 100.00
ENTY:animal: 50.00
ENTY:body: 100.00
ENTY:color: 100.00
ENTY:cremat: 0.00
E

## *Naive Bayes*

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default parameters, fitted on the same n-gram count matrix.
cls_nb = MultinomialNB()
# Last expression of the cell: fit() is called for its effect and its result is displayed.
cls_nb.fit(X_train, train_labels)

MultinomialNB()

In [None]:
pred_labels_nb = cls_nb.predict(X_test)
# First 20 gold labels followed by the first 20 Naive Bayes predictions, one sequence per line.
print(test_labels[:20], pred_labels_nb[:20], sep="\n")

['NUM:dist', 'LOC:city', 'HUM:desc', 'DESC:def', 'NUM:date', 'NUM:dist', 'HUM:gr', 'ENTY:plant', 'DESC:reason', 'DESC:def', 'LOC:city', 'HUM:ind', 'NUM:weight', 'HUM:ind', 'NUM:date', 'NUM:other', 'ENTY:substance', 'HUM:ind', 'DESC:def', 'NUM:date']
['NUM:dist' 'LOC:other' 'HUM:ind' 'DESC:def' 'NUM:date' 'NUM:dist'
 'HUM:gr' 'DESC:def' 'DESC:reason' 'DESC:def' 'LOC:city' 'HUM:ind'
 'DESC:def' 'HUM:ind' 'NUM:date' 'DESC:def' 'LOC:country' 'HUM:ind'
 'DESC:def' 'NUM:date']


In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

accuracy = accuracy_score(test_labels, pred_labels_nb)
print("Корректность: {:.2f}".format(100 * accuracy))
# Score the Naive Bayes predictions (pred_labels_nb); the previous code passed
# pred_labels, i.e. the logistic-regression output, so the per-class tables
# did not describe this model.
# `labels=cls_nb.classes_` keeps the score arrays aligned with the class names
# they are zipped with; zero_division=0 reports 0 for classes without test
# samples/predictions.
prec, rec, f1, sup = precision_recall_fscore_support(
    test_labels, pred_labels_nb, labels=cls_nb.classes_, zero_division=0
)
# Precision, recall and F1 are printed in the same per-class layout.
for title, scores in (("Точность:", prec), ("Полнота:", rec), ("F1-мера:", f1)):
    print(title)
    for label, x in zip(cls_nb.classes_, scores):
        print("{}: {:.2f}".format(label, 100*x), end="\n")
    print("")

Корректность: 67.40
Точность:
ABBR:abb: 100.00
ABBR:exp: 100.00
DESC:def: 77.85
DESC:desc: 50.00
DESC:manner: 40.00
DESC:reason: 100.00
ENTY:animal: 100.00
ENTY:body: 100.00
ENTY:color: 100.00
ENTY:cremat: 0.00
ENTY:currency: 100.00
ENTY:dismed: 0.00
ENTY:event: 0.00
ENTY:food: 100.00
ENTY:instru: 100.00
ENTY:lang: 100.00
ENTY:letter: 29.41
ENTY:other: 100.00
ENTY:plant: 0.00
ENTY:product: 100.00
ENTY:religion: 71.43
ENTY:sport: 100.00
ENTY:substance: 53.85
ENTY:symbol: 100.00
ENTY:techmeth: 75.00
ENTY:termeq: 60.00
ENTY:veh: 83.08
ENTY:word: 0.00
HUM:desc: 100.00
HUM:gr: 100.00
HUM:ind: 100.00
HUM:title: 79.25
LOC:city: 71.43
LOC:country: 81.82
LOC:mount: 100.00
LOC:other: 100.00
LOC:state: 0.00
NUM:code: 100.00
NUM:count: 100.00
NUM:date: 70.00
NUM:dist: 100.00
NUM:money: 100.00
NUM:ord: 100.00

Полнота:
ABBR:abb: 100.00
ABBR:exp: 75.00
DESC:def: 100.00
DESC:desc: 71.43
DESC:manner: 100.00
DESC:reason: 100.00
ENTY:animal: 50.00
ENTY:body: 100.00
ENTY:color: 100.00
ENTY:cremat: 0.00
E

## Определение значимости признаков

In [None]:
import numpy as np
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import label_binarize

# One-hot encode the training labels (sparse). `classes` is passed by keyword:
# the positional form triggers a FutureWarning and is announced to become an error.
Y_train = label_binarize(train_labels, classes=cls.classes_, sparse_output=True)
# Entry (k, j): number of training sentences of class k in which feature j occurs at least once.
feature_counts_by_classes = safe_sparse_dot(Y_train.T, (X_train >= 1), dense_output=True)

def get_count_feature_importance(X, Y, classes):
    """Score how strongly each feature is associated with each class.

    For feature ``j`` and class ``k`` the weight is
    ``log2(1 + n_j) * (n_kj / n_j - 1 / K)`` where ``n_kj`` is the number of
    rows of class ``k`` in which feature ``j`` is at least 1, ``n_j`` is the
    sum of ``n_kj`` over the classes and ``K`` is the number of columns of
    ``Y``.

    Args:
        X: sample-by-feature matrix.
        Y: sample-by-class indicator matrix.
        classes: class labels, matched to the columns of ``Y`` by position.

    Returns:
        ``defaultdict(dict)``: label -> {feature index: weight}, containing
        only the strictly positive weights.
    """
    counts = safe_sparse_dot(Y.T, (X >= 1), dense_output=True)
    totals = counts.sum(axis=0)
    n_classes = Y.shape[1]
    # Share of each class among the occurrences of a feature, relative to the
    # uniform share 1/K and scaled by the log of the feature's total count.
    weights = np.log2(1.0 + totals) * (counts / totals - 1 / n_classes)

    answer = defaultdict(dict)
    for feat_idx, column in enumerate(weights.T):
        positive = ((label, weight) for label, weight in zip(classes, column) if weight > 0)
        for label, weight in positive:
            answer[label][feat_idx] = weight
    return answer

 'ENTY:animal' 'ENTY:body' 'ENTY:color' 'ENTY:cremat' 'ENTY:currency'
 'ENTY:dismed' 'ENTY:event' 'ENTY:food' 'ENTY:instru' 'ENTY:lang'
 'ENTY:letter' 'ENTY:other' 'ENTY:plant' 'ENTY:product' 'ENTY:religion'
 'ENTY:sport' 'ENTY:substance' 'ENTY:symbol' 'ENTY:techmeth' 'ENTY:termeq'
 'ENTY:veh' 'ENTY:word' 'HUM:desc' 'HUM:gr' 'HUM:ind' 'HUM:title'
 'LOC:city' 'LOC:country' 'LOC:mount' 'LOC:other' 'LOC:state' 'NUM:code'
 'NUM:count' 'NUM:date' 'NUM:dist' 'NUM:money' 'NUM:ord' 'NUM:other'
 'NUM:perc' 'NUM:period' 'NUM:speed' 'NUM:temp' 'NUM:volsize' 'NUM:weight'] as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [None]:
Y_train = label_binarize(train_labels, classes=cls.classes_)

importances_by_classes = get_count_feature_importance(X_train, Y_train, classes=cls.classes_)
# Union, over all classes, of each class's 400 highest-weighted feature indices.
useful_features = set()
for label in cls.classes_:
    ranked = sorted(importances_by_classes[label].items(), key=lambda item: item[1], reverse=True)
    for feat, importance in ranked[:400]:
        useful_features.add(feat)
# Sorted column indices, used to slice both matrices identically.
useful_features = sorted(useful_features)
X_train_new = X_train[:, useful_features]
X_test_new = X_test[:, useful_features]

In [None]:
from sklearn.metrics import f1_score

# Logistic regression retrained on the selected feature columns only.
cls_small = LogisticRegression(max_iter=500).fit(X_train_new, train_labels)
# NOTE: this rebinds pred_labels, which previously held the full-feature model's predictions.
pred_labels = cls_small.predict(X_test_new)

accuracy = accuracy_score(test_labels, pred_labels)
print("Корректность: {:.2f}".format(100 * accuracy))
# `labels=cls_small.classes_` keeps the F1 array aligned with the class names
# it is zipped with; without it the array only covers the labels present in
# test_labels/pred_labels and the printed names are shifted.
# zero_division=0 reports 0 for classes without test samples/predictions.
f1 = f1_score(test_labels, pred_labels, average=None, labels=cls_small.classes_, zero_division=0)
for label, x in zip(cls_small.classes_, f1):
    print("{}: {:.2f}".format(label, 100*x), end="\t")
print("")

Корректность: 79.80
ABBR:abb: 100.00	ABBR:exp: 85.71	DESC:def: 87.23	DESC:desc: 58.82	DESC:manner: 57.14	DESC:reason: 100.00	ENTY:animal: 66.67	ENTY:body: 100.00	ENTY:color: 100.00	ENTY:cremat: 0.00	ENTY:currency: 50.00	ENTY:dismed: 0.00	ENTY:event: 0.00	ENTY:food: 66.67	ENTY:instru: 100.00	ENTY:lang: 66.67	ENTY:letter: 29.63	ENTY:other: 33.33	ENTY:plant: 0.00	ENTY:product: 100.00	ENTY:religion: 45.45	ENTY:sport: 100.00	ENTY:substance: 70.00	ENTY:symbol: 40.00	ENTY:techmeth: 85.71	ENTY:termeq: 54.55	ENTY:veh: 88.52	ENTY:word: 0.00	HUM:desc: 90.91	HUM:gr: 100.00	HUM:ind: 80.00	HUM:title: 82.35	LOC:city: 71.43	LOC:country: 90.00	LOC:mount: 97.83	LOC:other: 66.67	LOC:state: 0.00	NUM:code: 58.82	NUM:count: 80.00	NUM:date: 77.78	NUM:dist: 66.67	NUM:money: 75.00	NUM:ord: 40.00	


In [None]:
importances_by_classes = get_count_feature_importance(X_train, Y_train, classes=cls.classes_)
# Retrain and evaluate for several per-class feature budgets.
for n_feat_for_class in [100, 200, 400, 1000, 2000]:
    # Union, over all classes, of each class's top-n_feat_for_class feature indices.
    useful_features = set()
    for label in cls.classes_:
        for feat, importance in sorted(importances_by_classes[label].items(), key=lambda x: -x[1])[:n_feat_for_class]:
            useful_features.add(feat)
    useful_features = sorted(useful_features)
    X_train_new = X_train[:,useful_features]
    X_test_new = X_test[:,useful_features]
    cls_small = LogisticRegression(max_iter=500).fit(X_train_new, train_labels)
    pred_labels = cls_small.predict(X_test_new)

    accuracy = accuracy_score(test_labels, pred_labels)
    print("{} признаков, корректность: {:.2f}".format(len(useful_features), 100 * accuracy))
    # `labels=cls_small.classes_` keeps the F1 array aligned with the class
    # names it is zipped with; without it the array only covers the labels
    # present in test_labels/pred_labels and the printed names are shifted.
    # zero_division=0 reports 0 for classes without test samples/predictions.
    f1 = f1_score(test_labels, pred_labels, average=None, labels=cls_small.classes_, zero_division=0)
    for label, x in zip(cls_small.classes_, f1):
        print("{}: {:.2f}".format(label, 100*x), end="\t")
    print("")

3473 признаков, корректность: 79.40
ABBR:abb: 100.00	ABBR:exp: 85.71	DESC:def: 90.11	DESC:desc: 45.45	DESC:manner: 57.14	DESC:reason: 100.00	ENTY:animal: 64.00	ENTY:body: 100.00	ENTY:color: 100.00	ENTY:cremat: 0.00	ENTY:currency: 50.00	ENTY:dismed: 0.00	ENTY:event: 0.00	ENTY:food: 66.67	ENTY:instru: 100.00	ENTY:lang: 66.67	ENTY:letter: 26.09	ENTY:other: 33.33	ENTY:plant: 0.00	ENTY:product: 100.00	ENTY:religion: 57.14	ENTY:sport: 66.67	ENTY:substance: 70.00	ENTY:symbol: 40.00	ENTY:techmeth: 100.00	ENTY:termeq: 54.55	ENTY:veh: 83.33	ENTY:word: 0.00	HUM:desc: 90.91	HUM:gr: 100.00	HUM:ind: 80.00	HUM:title: 79.21	LOC:city: 71.43	LOC:country: 90.00	LOC:mount: 98.92	LOC:other: 60.87	LOC:state: 0.00	NUM:code: 58.82	NUM:count: 80.00	NUM:date: 73.68	NUM:dist: 66.67	NUM:money: 57.14	NUM:ord: 40.00	
4671 признаков, корректность: 79.80
ABBR:abb: 100.00	ABBR:exp: 85.71	DESC:def: 87.54	DESC:desc: 58.82	DESC:manner: 57.14	DESC:reason: 100.00	ENTY:animal: 66.67	ENTY:body: 66.67	ENTY:color: 100.00	ENTY: