In [1]:
# Sample text that the following cells lower-case, strip of punctuation,
# tokenize and vectorize. Adjacent literals are concatenated by Python, so
# the value is one single-line string.
review = (
    "An associated ClaimReview, related by specific common content, topic or claim. "
    "The expectation is that this property would be most typically used in cases "
    "where a single activity is conducting both claim reviews and media reviews, "
    "in which case relatedMediaReview would commonly be used on a ClaimReview, "
    "while relatedClaimReview would be used on MediaReview"
)

In [3]:
# Fold the whole text to lower case; the bare name on the last line makes
# the notebook echo the resulting string as the cell's value.
review = str.lower(review)

review

'an associated claimreview, related by specific common content, topic or claim. the expectation is that this property would be most typically used in cases where a single activity is conducting both claim reviews and media reviews, in which case relatedmediareview would commonly be used on a claimreview, while relatedclaimreview would be used on mediareview'

In [4]:
import string

# Delete every character listed in string.punctuation (ASCII punctuation
# only) by mapping its code point to None in the translation table.
review = review.translate({ord(mark): None for mark in string.punctuation})

print(review)

an associated claimreview related by specific common content topic or claim the expectation is that this property would be most typically used in cases where a single activity is conducting both claim reviews and media reviews in which case relatedmediareview would commonly be used on a claimreview while relatedclaimreview would be used on mediareview


In [6]:
from nltk.tokenize import word_tokenize

# Split the cleaned text into word tokens; "english" is the tokenizer's
# default language and is spelled out here only for clarity.
token = word_tokenize(review, language="english")

print(token)

['an', 'associated', 'claimreview', 'related', 'by', 'specific', 'common', 'content', 'topic', 'or', 'claim', 'the', 'expectation', 'is', 'that', 'this', 'property', 'would', 'be', 'most', 'typically', 'used', 'in', 'cases', 'where', 'a', 'single', 'activity', 'is', 'conducting', 'both', 'claim', 'reviews', 'and', 'media', 'reviews', 'in', 'which', 'case', 'relatedmediareview', 'would', 'commonly', 'be', 'used', 'on', 'a', 'claimreview', 'while', 'relatedclaimreview', 'would', 'be', 'used', 'on', 'mediareview']


In [7]:
from nltk.corpus import stopwords

# Hold the English stop-word list in a set so the membership checks made
# while filtering are cheap.
remove_stopword = set(stopwords.words('english'))

# Keep, in their original order, only the tokens that are not stop words.
clean = list(filter(lambda word: word not in remove_stopword, token))

print(clean)

['associated', 'claimreview', 'related', 'specific', 'common', 'content', 'topic', 'claim', 'expectation', 'property', 'would', 'typically', 'used', 'cases', 'single', 'activity', 'conducting', 'claim', 'reviews', 'media', 'reviews', 'case', 'relatedmediareview', 'would', 'commonly', 'used', 'claimreview', 'relatedclaimreview', 'would', 'used', 'mediareview']


In [9]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()

lemmatizer = WordNetLemmatizer()

# text1: Porter stems of the filtered tokens (e.g. 'associated' -> 'associ').
text1 = list(map(stemmer.stem, clean))

# text2: WordNet lemmas. lemmatize() is called without a part-of-speech
# argument, so it falls back to its default (noun); that is why plural nouns
# are reduced ('reviews' -> 'review') while verb forms such as 'used' and
# 'conducting' come back unchanged.
text2 = list(map(lemmatizer.lemmatize, clean))

# One list per line: stems first, lemmas second.
print(text1, text2, sep="\n")

['associ', 'claimreview', 'relat', 'specif', 'common', 'content', 'topic', 'claim', 'expect', 'properti', 'would', 'typic', 'use', 'case', 'singl', 'activ', 'conduct', 'claim', 'review', 'media', 'review', 'case', 'relatedmediareview', 'would', 'commonli', 'use', 'claimreview', 'relatedclaimreview', 'would', 'use', 'mediareview']
['associated', 'claimreview', 'related', 'specific', 'common', 'content', 'topic', 'claim', 'expectation', 'property', 'would', 'typically', 'used', 'case', 'single', 'activity', 'conducting', 'claim', 'review', 'medium', 'review', 'case', 'relatedmediareview', 'would', 'commonly', 'used', 'claimreview', 'relatedclaimreview', 'would', 'used', 'mediareview']


In [10]:
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
)

# Both vectorizers are created with library defaults here and are fitted
# in the cells that follow.
vectorizer, tfidf = CountVectorizer(), TfidfVectorizer()

In [12]:
# The vectorizer is fed a list of documents, so the stemmed tokens are
# glued back into one space-separated string and wrapped in a list.
text1_join = " ".join(text1)

# Bag-of-words counts for the single stemmed document (one row).
bow = vectorizer.fit_transform([text1_join])

# Vocabulary first, then the matching row of counts.
print(vectorizer.get_feature_names_out(), bow.toarray(), sep="\n")

['activ' 'associ' 'case' 'claim' 'claimreview' 'common' 'commonli'
 'conduct' 'content' 'expect' 'media' 'mediareview' 'properti' 'relat'
 'relatedclaimreview' 'relatedmediareview' 'review' 'singl' 'specif'
 'topic' 'typic' 'use' 'would']
[[1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 3 3]]


In [13]:
# Re-join the lemmatized tokens into one space-separated document string.
text2_join = " ".join(text2)

# Only one document is fitted, so with the default settings every term gets
# the same IDF weight and the row below is just the L2-normalised term
# counts (terms occurring twice or three times score 2x / 3x the base value).
matrix = tfidf.fit_transform([text2_join])

# Vocabulary first, then the matching row of TF-IDF weights.
print(tfidf.get_feature_names_out(), matrix.toarray(), sep="\n")

['activity' 'associated' 'case' 'claim' 'claimreview' 'common' 'commonly'
 'conducting' 'content' 'expectation' 'mediareview' 'medium' 'property'
 'related' 'relatedclaimreview' 'relatedmediareview' 'review' 'single'
 'specific' 'topic' 'typically' 'used' 'would']
[[0.14002801 0.14002801 0.28005602 0.28005602 0.28005602 0.14002801
  0.14002801 0.14002801 0.14002801 0.14002801 0.14002801 0.14002801
  0.14002801 0.14002801 0.14002801 0.14002801 0.28005602 0.14002801
  0.14002801 0.14002801 0.14002801 0.42008403 0.42008403]]
