In [1]:
import re

txt = "Genome Biology covers all areas of biology and biomedicine studied from a genomic and post-genomic perspective. Content includes research, new methods and software tools, and reviews, opinions and commentaries. Areas covered include, but are not limited to: sequence analysis; bioinformatics; insights into molecular, cellular and organismal biology; functional genomics; epigenomics; population genomics; proteomics; comparative biology and evolution; systems and network biology; genome editing and engineering; genomics of disease; and clinical genomics."


In [2]:
from nltk.tokenize import word_tokenize

# Tokenizing the text
tokens = word_tokenize(txt)
print(tokens)


['Genome', 'Biology', 'covers', 'all', 'areas', 'of', 'biology', 'and', 'biomedicine', 'studied', 'from', 'a', 'genomic', 'and', 'post-genomic', 'perspective', '.', 'Content', 'includes', 'research', ',', 'new', 'methods', 'and', 'software', 'tools', ',', 'and', 'reviews', ',', 'opinions', 'and', 'commentaries', '.', 'Areas', 'covered', 'include', ',', 'but', 'are', 'not', 'limited', 'to', ':', 'sequence', 'analysis', ';', 'bioinformatics', ';', 'insights', 'into', 'molecular', ',', 'cellular', 'and', 'organismal', 'biology', ';', 'functional', 'genomics', ';', 'epigenomics', ';', 'population', 'genomics', ';', 'proteomics', ';', 'comparative', 'biology', 'and', 'evolution', ';', 'systems', 'and', 'network', 'biology', ';', 'genome', 'editing', 'and', 'engineering', ';', 'genomics', 'of', 'disease', ';', 'and', 'clinical', 'genomics', '.']


In [3]:
from nltk import pos_tag, ne_chunk

# Perform part-of-speech tagging
pos_tags = pos_tag(tokens)

# Perform named entity recognition
named_entities = ne_chunk(pos_tags)
print(named_entities)

(S
  (PERSON Genome/NNP)
  (ORGANIZATION Biology/NNP)
  covers/VBZ
  all/DT
  areas/NNS
  of/IN
  biology/NN
  and/CC
  biomedicine/NN
  studied/VBN
  from/IN
  a/DT
  genomic/JJ
  and/CC
  post-genomic/JJ
  perspective/NN
  ./.
  Content/NNP
  includes/VBZ
  research/NN
  ,/,
  new/JJ
  methods/NNS
  and/CC
  software/NN
  tools/NNS
  ,/,
  and/CC
  reviews/NNS
  ,/,
  opinions/NNS
  and/CC
  commentaries/NNS
  ./.
  (PERSON Areas/NNP)
  covered/VBD
  include/NN
  ,/,
  but/CC
  are/VBP
  not/RB
  limited/VBN
  to/TO
  :/:
  sequence/NN
  analysis/NN
  ;/:
  bioinformatics/NNS
  ;/:
  insights/NNS
  into/IN
  molecular/JJ
  ,/,
  cellular/JJ
  and/CC
  organismal/JJ
  biology/NN
  ;/:
  functional/JJ
  genomics/NNS
  ;/:
  epigenomics/NNS
  ;/:
  population/NN
  genomics/NNS
  ;/:
  proteomics/NNS
  ;/:
  comparative/JJ
  biology/NN
  and/CC
  evolution/NN
  ;/:
  systems/NNS
  and/CC
  network/NN
  biology/NN
  ;/:
  genome/CC
  editing/NN
  and/CC
  engineering/NN
  ;/:
  genomics/N

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Extracting keywords using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([txt])
keywords = vectorizer.get_feature_names_out()
print(keywords)


['all' 'analysis' 'and' 'are' 'areas' 'bioinformatics' 'biology'
 'biomedicine' 'but' 'cellular' 'clinical' 'commentaries' 'comparative'
 'content' 'covered' 'covers' 'disease' 'editing' 'engineering'
 'epigenomics' 'evolution' 'from' 'functional' 'genome' 'genomic'
 'genomics' 'include' 'includes' 'insights' 'into' 'limited' 'methods'
 'molecular' 'network' 'new' 'not' 'of' 'opinions' 'organismal'
 'perspective' 'population' 'post' 'proteomics' 'research' 'reviews'
 'sequence' 'software' 'studied' 'systems' 'to' 'tools']


In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Analyzing sentiment
sia = SentimentIntensityAnalyzer()
sentiment_score = sia.polarity_scores(txt)
print(sentiment_score)


{'neg': 0.0, 'neu': 0.972, 'pos': 0.028, 'compound': 0.2498}


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Topic modeling using Latent Dirichlet Allocation (LDA)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([txt])

lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
topics = lda_model.fit_transform(tfidf_matrix)
print(topics)


[[0.03353158 0.86587369 0.03353158 0.03353158 0.03353158]]
