<a href="https://colab.research.google.com/github/JuliaVin23/Projects/blob/main/%D0%A4%D1%83%D0%BD%D0%BA%D1%86%D0%B8%D0%B8_%D0%B4%D0%BB%D1%8F_%D0%BE%D0%B1%D1%80%D0%B0%D0%B1%D0%BE%D1%82%D0%BA%D0%B8_%D1%82%D0%B5%D0%BA%D1%81%D1%82%D0%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Создание функций для обработки текста при помощи ООП
## Извлечение именованных сущностей, лемматизация и разметка частей речи

# NERStats

In [None]:
!python -m spacy download en_core_web_sm

2023-12-19 15:53:26.322135: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-19 15:53:26.322202: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-19 15:53:26.324382: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-19 15:53:26.337702: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading https:

In [None]:
import spacy

In [None]:
class NERExtractor:
    """Named-entity extractor backed by the spaCy ``en_core_web_sm`` pipeline.

    ``extract_named_entities`` returns the entities it finds but does not
    store them; ``display_named_entities`` prints whatever the caller has
    assigned to ``self.named_entities``.
    """

    def __init__(self):
        """Load the spaCy pipeline and start with an empty entity list."""
        self.nlp = spacy.load("en_core_web_sm")
        # (entity text, entity label) pairs printed by display_named_entities().
        self.named_entities = []

    def extract_named_entities(self, text):
        """Run the pipeline on ``text`` and collect its entities.

        Args:
            text: The string to analyse.

        Returns:
            A list of ``(entity text, entity label)`` tuples, one per item in
            the processed document's ``ents``, in document order.
        """
        found = []
        for entity in self.nlp(text).ents:
            found.append((entity.text, entity.label_))
        return found

    def display_named_entities(self):
        """Print one ``Named entities: <text> - <label>`` line per stored pair."""
        for text, label in self.named_entities:
            print(f"Named entities: {text} - {label}")


# Demo: extract the entities of one sentence, store them on the extractor
# (extract_named_entities only returns them), then print the stored pairs.
ner_extractor = NERExtractor()
ner_extractor.named_entities = ner_extractor.extract_named_entities(
    "Apple Inc. is a technology company."
)
ner_extractor.display_named_entities()


Named entities: Apple Inc. - ORG


In [None]:
from collections import Counter
import re # regex - библиотека для использования регулярных выражений

def remove_punctuation(input_string):
    """Return ``input_string`` with all punctuation characters removed.

    Letters, digits and whitespace (spaces, tabs, newlines) are kept; every
    other character, including the underscore, is deleted. Nothing is
    inserted in place of a removed character, so neighbouring spaces are
    left as they were.

    Args:
        input_string: The text to clean.

    Returns:
        The cleaned string.
    """
    # `\w` treats "_" as a word character, so `[^\w\s]` alone would leave
    # underscores in the text; the extra `|_` alternative removes them too.
    return re.sub(r'[^\w\s]|_', '', input_string)

class NERStats:
    """Frequency statistics for the named entities found in a text.

    The entities are extracted once, in the constructor, and counted into
    ``self.ner_frequencies`` (a ``Counter`` keyed by the items returned by the
    extractor's ``extract_named_entities``).
    """

    def __init__(self, result_string, ner_extractor=None):
        """Extract and count the entities of ``result_string``.

        Args:
            result_string: The text to analyse.
            ner_extractor: Optional object exposing
                ``extract_named_entities(text)``. Passing one lets several
                ``NERStats`` instances share a single extractor instead of
                each constructing its own ``NERExtractor`` (whose constructor
                loads the spaCy pipeline). When omitted, a new
                ``NERExtractor`` is created, as before.
        """
        self.result_string = result_string
        if ner_extractor is None:
            ner_extractor = NERExtractor()
        self.ner_extractor = ner_extractor
        self.ner_frequencies = self._analyze_entities()

    def _analyze_entities(self):
        """Return a ``Counter`` of the entities extracted from the stored text."""
        entities = self.ner_extractor.extract_named_entities(self.result_string)
        return Counter(entities)

    def display_most_common_entities(self, n):
        """Print the ``n`` most frequent entities with their counts.

        Args:
            n: How many entries to show; forwarded to ``Counter.most_common``.
        """
        most_common_entities = self.ner_frequencies.most_common(n)
        print(f"Top {n} most common entities:")
        # Each key is printed with its default string form, followed by its count.
        for entity, frequency in most_common_entities:
            print(f"{entity}: {frequency}")

# Demo 1: a passage about Apple's founding; print its two most frequent entities.
text_analyzer = NERStats(
    "Apple was founded as Apple Computer Company on April 1, 1976, by Steve Wozniak, Steve Jobs and Ronald Wayne to develop and sell Wozniak's Apple I personal computer. It was incorporated by Jobs and Wozniak as Apple Computer, Inc. in 1977."
)
text_analyzer.display_most_common_entities(2)

# Demo 2: a shorter text with repeated mentions; `text_analyzer` is rebound
# to a new NERStats instance.
text_analyzer = NERStats(
    "Apple is an American company. The US made Apple. Apple phones are very popular in the US."
)
text_analyzer.display_most_common_entities(2)

Top 2 most common entities:
('Apple', 'ORG'): 2
('Apple Computer Company', 'ORG'): 1
Top 2 most common entities:
('Apple', 'ORG'): 3
('US', 'GPE'): 2


# LemmaStats

In [None]:
from collections import Counter
import re # regex - библиотека для использования регулярных выражений

def remove_punctuation(input_string):
    """Return ``input_string`` with all punctuation characters removed.

    Letters, digits and whitespace (spaces, tabs, newlines) are kept; every
    other character, including the underscore, is deleted. Nothing is
    inserted in place of a removed character, so neighbouring spaces are
    left as they were.

    Args:
        input_string: The text to clean.

    Returns:
        The cleaned string.
    """
    # `\w` treats "_" as a word character, so `[^\w\s]` alone would leave
    # underscores in the text; the extra `|_` alternative removes them too.
    return re.sub(r'[^\w\s]|_', '', input_string)

class LemmaStats:
    """Lemma frequency statistics for a text, computed with a spaCy pipeline.

    The text is lemmatized once, in the constructor, and the counts are kept
    in ``self.lemma_frequencies`` (a ``Counter`` keyed by lemma string).
    """

    def __init__(self, result_string, nlp=None):
        """Lemmatize ``result_string`` and count its lemmas.

        Args:
            result_string: The text to analyse.
            nlp: Optional callable that turns a string into an iterable of
                tokens exposing ``lemma_`` and ``is_space`` (for example an
                already loaded spaCy pipeline). Passing one avoids loading
                the model again for every instance. When omitted, the
                ``en_core_web_sm`` pipeline is loaded, as before.
        """
        if nlp is None:
            # Imported lazily so that spaCy is only needed when no pipeline
            # is supplied by the caller.
            import spacy
            nlp = spacy.load("en_core_web_sm")
        self.nlp = nlp
        self.result_string = result_string
        self.lemma_frequencies = self.lemmatize()

    def lemmatize(self):
        """Return a ``Counter`` of the lemmas of the stored text.

        Whitespace-only tokens are skipped: text whose punctuation has been
        stripped can contain double spaces or newlines, and counting those
        tokens would report whitespace as a "lemma".
        """
        doc = self.nlp(self.result_string)
        return Counter(token.lemma_ for token in doc if not token.is_space)

    def display_most_common_lemmas(self, n):
        """Print the ``n`` most frequent lemmas with their counts.

        Args:
            n: How many entries to show; forwarded to ``Counter.most_common``.
        """
        most_common_lemmas = self.lemma_frequencies.most_common(n)
        print(f"Top {n} most common lemmas:")
        for word, frequency in most_common_lemmas:
            print(f"{word}: {frequency}")

# Demo text for the lemma statistics.
input_string = (
    'Python is a very common programming language. I am a user of Python language. Many people who speak different languages are using Python right now.'
)
# Strip punctuation before handing the text to LemmaStats.
result_string = remove_punctuation(input_string)

# Count the lemmas and print the three most frequent ones.
text_analyzer = LemmaStats(result_string)
text_analyzer.display_most_common_lemmas(3)

Top 3 most common lemmas:
Python: 3
be: 3
language: 3


# PoSStats

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter


In [None]:
import re # regex - библиотека для использования регулярных выражений

def remove_punctuation(input_string):
    """Return ``input_string`` with all punctuation characters removed.

    Letters, digits and whitespace (spaces, tabs, newlines) are kept; every
    other character, including the underscore, is deleted. Nothing is
    inserted in place of a removed character, so neighbouring spaces are
    left as they were.

    Args:
        input_string: The text to clean.

    Returns:
        The cleaned string.
    """
    # `\w` treats "_" as a word character, so `[^\w\s]` alone would leave
    # underscores in the text; the extra `|_` alternative removes them too.
    return re.sub(r'[^\w\s]|_', '', input_string)


class PoSStats:
    """Part-of-speech tag statistics for a text, computed with NLTK.

    The text is tokenized and tagged once, in the constructor; the resulting
    ``(token, tag)`` pairs are kept in ``self.tagged_text``.
    """

    # NLTK resources this class downloads when missing, as
    # (lookup path for nltk.data.find, package id for nltk.download).
    # NOTE(review): newer NLTK releases may look for differently named
    # resources (e.g. "punkt_tab") - confirm against the installed version.
    _REQUIRED_RESOURCES = (
        ('tokenizers/punkt', 'punkt'),
        ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    )

    def __init__(self, result_string):
        """Make sure the NLTK data is available, then tag ``result_string``.

        Args:
            result_string: The text to analyse.
        """
        self._ensure_resources()
        self.result_string = result_string
        self.tagged_text = self.tokenize_and_tag()

    @classmethod
    def _ensure_resources(cls):
        """Download each required NLTK resource only if it is not installed.

        Checking first avoids a network round-trip and download log output
        on every instantiation.
        """
        for lookup_path, package_id in cls._REQUIRED_RESOURCES:
            try:
                nltk.data.find(lookup_path)
            except LookupError:
                nltk.download(package_id)

    def tokenize_and_tag(self):
        """Tokenize the stored text and return NLTK's tagging of the tokens."""
        return nltk.pos_tag(word_tokenize(self.result_string))

    def display_most_common_pos(self, n):
        """Print the ``n`` most frequent part-of-speech tags with their counts.

        Args:
            n: How many entries to show; forwarded to ``Counter.most_common``.
        """
        # The tag is the second element of every entry in self.tagged_text.
        pos_frequencies = Counter(tagged[1] for tagged in self.tagged_text)
        most_common_pos = pos_frequencies.most_common(n)
        print(f"Top {n} most common pos:")
        for pos, frequency in most_common_pos:
            print(f"{pos}: {frequency}")

# Demo text for the part-of-speech statistics.
input_string = (
    'Python is a very common programming language. I am a user of Python language. Many people who speak different languages are using Python right now.'
)
# Strip punctuation before handing the text to PoSStats.
result_string = remove_punctuation(input_string)

# Tag the text and print the three most frequent part-of-speech tags.
text_analyzer = PoSStats(result_string)
text_analyzer.display_most_common_pos(3)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Top 3 most common pos:
NN: 5
NNP: 3
JJ: 3
