In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import nltk

In [2]:
# Toy corpus of seven short sentences, normalised to lower case in one pass so
# that later token comparisons are case-insensitive.
# NOTE(review): the first sentence spells "polution" with a single "l", so it is
# a different token from "pollution" in the last sentence -- confirm intent.
dataset = list(map(str.lower, (
    "The amount of polution is increasing day by day",
    "The concert was just great",
    "I love to see Gordon Ramsay cook",
    "Google is introducing a new technology",
    "AI Robots are examples of great technology present today",
    "All of us were singing in the concert",
    "We have launch campaigns to stop pollution and global warming",
)))

In [3]:
# Inspect the lower-cased corpus; the bare last expression renders the list.
dataset

['the amount of polution is increasing day by day',
 'the concert was just great',
 'i love to see gordon ramsay cook',
 'google is introducing a new technology',
 'ai robots are examples of great technology present today',
 'all of us were singing in the concert',
 'we have launch campaigns to stop pollution and global warming']

In [4]:
# Creating Tfidf Model
# A TfidfVectorizer with default settings is fitted on the corpus and the corpus
# is transformed in the same call; X is the resulting sparse matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

# Visualizing the Tfidf Model
# Printing one row lists its non-zero entries as "(row, column)  weight" pairs.
print(X[0])

  (0, 5)	0.3211483974289089
  (0, 9)	0.6422967948578178
  (0, 17)	0.3211483974289089
  (0, 19)	0.2665807498646048
  (0, 26)	0.3211483974289089
  (0, 24)	0.2278643877752444
  (0, 2)	0.3211483974289089
  (0, 34)	0.2278643877752444


In [6]:
# Creating the SVD
# Decompose the TF-IDF matrix into 4 latent components ("concepts").
# TruncatedSVD uses a randomized solver by default, so the seed is pinned to
# make a Restart-and-Run-All reproduce the same components.
lsa = TruncatedSVD(n_components = 4, n_iter = 100, random_state = 42)
lsa.fit(X)


# Fourth row of components_ (index 3): the per-term weights of the last of the
# four concepts. components_ has one row per component and one column per
# vocabulary term, so this is a row, not the first column of V.
# The name `row1` is kept so that any later reference still resolves.
row1 = lsa.components_[3]
row1

array([-2.39508717e-01,  5.64466475e-02,  2.32713384e-01, -2.33390782e-16,
       -2.39508717e-01,  2.32713384e-01, -2.75805729e-16, -4.43102940e-02,
       -2.47452842e-16,  4.65426768e-01, -2.39508717e-01, -2.75805729e-16,
        2.34583657e-02, -2.47452842e-16, -2.89978663e-01, -2.75805729e-16,
        5.64466475e-02,  2.32713384e-01,  2.34583657e-02,  2.12644552e-01,
       -1.09827021e-01, -2.75805729e-16, -2.47452842e-16,  2.34583657e-02,
        3.52290920e-02, -2.75805729e-16,  2.32713384e-01, -2.39508717e-01,
       -2.47452842e-16, -2.39508717e-01, -2.47452842e-16,  5.64466475e-02,
       -2.75805729e-16, -1.79340346e-01,  1.27242132e-01, -4.36622216e-16,
       -2.39508717e-01,  5.64466475e-02, -2.75805729e-16, -1.09827021e-01,
       -2.75805729e-16,  5.64466475e-02])

In [7]:
# Word Concept Dictionary Creation
# Starts empty; keys are concept labels, values are lists of (term, weight) pairs.
concept_words = dict()

In [8]:
# Visualizing the concepts
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# The result is converted to a list so `terms` keeps the type it had before.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# For each SVD component, pair every vocabulary term with its weight, sort by
# weight (largest first) and keep the 10 highest-weighted (term, weight) pairs.
for i, comp in enumerate(lsa.components_):
    componentTerms = zip(terms, comp)
    sortedTerms = sorted(componentTerms, key=lambda x: x[1], reverse=True)
    concept_words["Concept " + str(i)] = sortedTerms[:10]
    

In [9]:
# Display the concept dictionary: each "Concept N" key maps to a list of
# (term, weight) pairs sorted from the largest weight down.
concept_words  # words in each concept and its value

{'Concept 0': [('the', 0.3760982952926378),
  ('concert', 0.34498873923306633),
  ('great', 0.300124025894874),
  ('of', 0.2957980609526666),
  ('just', 0.2373658292979127),
  ('was', 0.2373658292979127),
  ('day', 0.2289215954150454),
  ('technology', 0.18383834567413385),
  ('all', 0.1782402517562896),
  ('in', 0.1782402517562896)],
 'Concept 1': [('to', 0.41578844396700704),
  ('cook', 0.28359165793510804),
  ('gordon', 0.28359165793510804),
  ('love', 0.28359165793510804),
  ('ramsay', 0.28359165793510804),
  ('see', 0.28359165793510804),
  ('and', 0.2173064471129242),
  ('campaigns', 0.2173064471129242),
  ('global', 0.2173064471129242),
  ('have', 0.2173064471129242)],
 'Concept 2': [('technology', 0.37791806767144004),
  ('is', 0.3419614380631987),
  ('google', 0.341396944190975),
  ('introducing', 0.341396944190975),
  ('new', 0.341396944190975),
  ('day', 0.14112432680994696),
  ('are', 0.11387892195372991),
  ('examples', 0.11387892195372991),
  ('present', 0.1138789219537299

In [10]:
# Sentence Concepts -- for every concept, print one score per sentence.
# A sentence's score is the sum of the concept weights of its tokens; a token
# contributes once per occurrence, and a sentence sharing no term with the
# concept keeps the integer score 0.
# NOTE(review): nltk.word_tokenize presumably needs the NLTK "punkt" tokenizer
# data to be installed -- confirm it is available on a fresh environment.
for concept_name, top_terms in concept_words.items():
    sentence_scores = [
        sum(
            weight
            for token in nltk.word_tokenize(text)
            for term, weight in top_terms
            if token == term
        )
        for text in dataset
    ]
    print("\n" + concept_name + ":")
    for value in sentence_scores:
        print(value)


Concept 0:
1.1297395470753953
1.4959427190164036
0
0.18383834567413385
0.7797604325216744
1.37336559899095
0

Concept 1:
0
0
1.8337467336425475
0
0
0
1.285014232418704

Concept 2:
0.6242100916830926
0
0
1.7440703383075635
0.8334337554863598
0
0

Concept 3:
2.201593755447889
0.1272421318069433
0
0.21264455202450017
0
0.2965820743887383
0


In [None]:
# For Concept 0, the 2nd sentence ("the concert was just great") has the highest
# score (about 1.50), so it is the sentence most representative of that concept.