# Latent Semantic Analysis

Latent Semantic Analysis (LSA) is a technique for analysing the relationships between a set of documents and the terms they contain,
by producing a set of concepts related to both the documents and the terms.

LSA relies on singular value decomposition (SVD):
the input matrix $A$ (here, the TF-IDF document–term matrix) is decomposed into three matrices, $A = U \Sigma V^T$.
![alt text](svd.png "Title")


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
# Toy corpus: seven short sentences used as the "documents" for LSA.
# NOTE: "polution" in the first sentence is spelled this way on purpose —
# it is kept verbatim because the vectorizer treats it as a vocabulary term
# distinct from "pollution" in the last sentence.
dataset = [
    "The amount of polution is increasing day by day",
    "The concert was just great",
    "I love to see Gordon Ramsay cook",
    "Google is introducing a new technology",
    "AI Robots are examples of great technology present today",
    "All of us were singing in the concert",
    "We have launch campaigns to stop pollution and global warming",
]

In [4]:
# Normalise case: replace the corpus with a lower-cased copy of every sentence.
dataset = list(map(str.lower, dataset))

In [5]:
# Display the corpus to confirm every sentence is now lower case.
dataset

['the amount of polution is increasing day by day',
 'the concert was just great',
 'i love to see gordon ramsay cook',
 'google is introducing a new technology',
 'ai robots are examples of great technology present today',
 'all of us were singing in the concert',
 'we have launch campaigns to stop pollution and global warming']

In [6]:
# Build the TF-IDF model. fit_transform() learns the vocabulary from the corpus
# and returns X, the TF-IDF weights of the corpus in sparse form.
# The fitted vectorizer also records which column corresponds to which word.

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)


In [7]:
# Inspect the TF-IDF row of the first sentence. Each printed entry is
# "(row, column)  weight"; only the terms that occur in the sentence are listed.
print(X[0])

  (0, 5)	0.3211483974289089
  (0, 9)	0.6422967948578178
  (0, 17)	0.3211483974289089
  (0, 19)	0.2665807498646048
  (0, 26)	0.3211483974289089
  (0, 24)	0.2278643877752444
  (0, 2)	0.3211483974289089
  (0, 34)	0.2278643877752444


In [8]:
# Build the SVD step of LSA: factorise the TF-IDF matrix X, keeping only the
# strongest components ("concepts").
#
#   n_components - number of concepts to extract
#   n_iter       - number of iterations for the randomized SVD solver
#   random_state - TruncatedSVD's default solver is randomized, so the seed is
#                  fixed to make the decomposition reproducible across
#                  "Restart Kernel -> Run All"

lsa = TruncatedSVD(n_components=4, n_iter=100, random_state=42)
lsa.fit(X)

TruncatedSVD(n_components=4, n_iter=100)

In [12]:
# Row 3 of lsa.components_, i.e. the per-term weights of the fourth concept
# (printed as "Concept 3" further down).
# Note: despite the variable name, this is index 3, not the first component.

col1 = lsa.components_[3]

In [13]:
col1  # one weight per vocabulary term (42 here); large values mark terms in this concept, values near zero (~1e-16) mark terms outside it

array([-2.39508717e-01,  5.64466475e-02,  2.32713384e-01,  5.38463701e-16,
       -2.39508717e-01,  2.32713384e-01,  3.71853138e-16, -4.43102940e-02,
       -1.27012912e-16,  4.65426768e-01, -2.39508717e-01,  3.71853138e-16,
        2.34583657e-02, -1.27012912e-16, -2.89978663e-01,  3.71853138e-16,
        5.64466475e-02,  2.32713384e-01,  2.34583657e-02,  2.12644552e-01,
       -1.09827021e-01,  3.71853138e-16, -1.27012912e-16,  2.34583657e-02,
        3.52290920e-02,  3.71853138e-16,  2.32713384e-01, -2.39508717e-01,
       -1.27012912e-16, -2.39508717e-01, -1.27012912e-16,  5.64466475e-02,
        3.71853138e-16, -1.79340346e-01,  1.27242132e-01,  1.95674223e-16,
       -2.39508717e-01,  5.64466475e-02,  3.71853138e-16, -1.09827021e-01,
        3.71853138e-16,  5.64466475e-02])

In [16]:
# Vocabulary learned by the vectorizer; these are the words that get paired
# with each concept's weights to find the important key words per concept.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Prefer the new method and fall back to the
# old one on older installations. The result is converted to a plain list so
# the cell displays the same way on every version.

if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()
terms

['ai',
 'all',
 'amount',
 'and',
 'are',
 'by',
 'campaigns',
 'concert',
 'cook',
 'day',
 'examples',
 'global',
 'google',
 'gordon',
 'great',
 'have',
 'in',
 'increasing',
 'introducing',
 'is',
 'just',
 'launch',
 'love',
 'new',
 'of',
 'pollution',
 'polution',
 'present',
 'ramsay',
 'robots',
 'see',
 'singing',
 'stop',
 'technology',
 'the',
 'to',
 'today',
 'us',
 'warming',
 'was',
 'we',
 'were']

In [14]:
# For every concept, list the ten terms carrying the largest (most positive) weights.
for concept_idx, weights in enumerate(lsa.components_):
    # Pair each vocabulary term with its weight, then rank from highest to lowest.
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    print("\nConcept", concept_idx, ":")
    for pair in ranked[:10]:  # keep only the top ten (term, weight) tuples
        print(pair)


Concept 0 :
('the', 0.3760982952926375)
('concert', 0.34498873923306606)
('great', 0.3001240258948742)
('of', 0.29579806095266675)
('just', 0.23736582929791253)
('was', 0.23736582929791253)
('day', 0.2289215954150452)
('technology', 0.1838383456741344)
('all', 0.17824025175628952)
('in', 0.17824025175628952)

Concept 1 :
('to', 0.41578844396700687)
('cook', 0.2835916579351076)
('gordon', 0.2835916579351076)
('love', 0.2835916579351076)
('ramsay', 0.2835916579351076)
('see', 0.2835916579351076)
('and', 0.21730644711292435)
('campaigns', 0.21730644711292435)
('global', 0.21730644711292435)
('have', 0.21730644711292435)

Concept 2 :
('technology', 0.3779180676714397)
('is', 0.3419614380631988)
('google', 0.3413969441909746)
('introducing', 0.3413969441909746)
('new', 0.3413969441909746)
('day', 0.141124326809948)
('are', 0.11387892195373003)
('examples', 0.11387892195373003)
('present', 0.11387892195373003)
('robots', 0.11387892195373003)

Concept 3 :
('day', 0.46542676790411075)
('amoun