In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

In [2]:
# Toy corpus: each string is treated as one document by the cells below.
phrases = [
    "The car is clean and bright",
    "The car is old and good",
    "I love to have my kitchen clean",
    "I need to clean all the dishes",
]

In [3]:
# Vectorize the phrases into a TF-IDF weighted document-term matrix
# (one row per phrase, one column per vocabulary term).
vectorizer = TfidfVectorizer()
bow = vectorizer.fit_transform(phrases)

# Only the last expression of a cell is displayed, so the intermediate
# inspections are printed explicitly instead of being silently discarded.
print(bow.shape)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
print(vectorizer.get_feature_names_out())

# Dense view of the matrix, for inspection only
bow.todense()

matrix([[0.        , 0.41101031, 0.52131446, 0.41101031, 0.33274827,
         0.        , 0.        , 0.        , 0.41101031, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.33274827,
         0.        ],
        [0.        , 0.38144133, 0.        , 0.38144133, 0.        ,
         0.        , 0.48380996, 0.        , 0.38144133, 0.        ,
         0.        , 0.        , 0.        , 0.48380996, 0.30880963,
         0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.28462634,
         0.        , 0.        , 0.44592216, 0.        , 0.44592216,
         0.44592216, 0.44592216, 0.        , 0.        , 0.        ,
         0.35157015],
        [0.4747708 , 0.        , 0.        , 0.        , 0.30304005,
         0.4747708 , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.4747708 , 0.        , 0.30304005,
         0.37431475]])

In [4]:
# Compute the per-term IDF weights.
# NOTE(review): bow was produced by TfidfVectorizer, so it is already TF-IDF
# weighted rather than raw counts; confirm that fitting a TfidfTransformer on
# it (instead of on CountVectorizer output) is intended.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
# tfidf_bow holds the return value of fit(), not a transformed matrix.
tfidf_bow = tfidf_transformer.fit(bow)

# Tabulate the IDF value of every vocabulary term. The values come from
# tfidf_transformer.idf_, i.e. they are IDF weights despite the column label.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df_idf = pd.DataFrame(
    tfidf_transformer.idf_,
    index=vectorizer.get_feature_names_out(),
    columns=["tf_idf_weights"],
)

# Sort descending so the highest-weighted terms come first
tfidf_sorted = df_idf.sort_values(by=["tf_idf_weights"], ascending=False)
print(tfidf_sorted)


         tf_idf_weights
all            1.916291
bright         1.916291
dishes         1.916291
good           1.916291
have           1.916291
kitchen        1.916291
love           1.916291
my             1.916291
need           1.916291
old            1.916291
and            1.510826
car            1.510826
is             1.510826
to             1.510826
clean          1.223144
the            1.223144


In [5]:
# Reduce the document-term matrix to 2 latent components with truncated SVD.
# random_state is pinned so that re-running the notebook reproduces the same
# components (scikit-learn documents the default solver as randomized).
svd = TruncatedSVD(n_components=2, random_state=42)
# lsa: one row per phrase, one column per component
lsa = svd.fit_transform(bow)

In [6]:
# Document-by-topic table: the SVD projection of each phrase next to its text.
# Column names are derived from the number of components in lsa, so this cell
# keeps working if n_components is changed. pandas is already imported as pd
# in the notebook's first cell.
topic_columns = ["topic_{}".format(i) for i in range(1, lsa.shape[1] + 1)]
topics_df = pd.DataFrame(lsa, columns=topic_columns)
topics_df["body"] = phrases
print(topics_df)

    topic_1   topic_2                             body
0  0.862029 -0.200194      The car is clean and bright
1  0.801410 -0.391736          The car is old and good
2  0.271234  0.765596  I love to have my kitchen clean
3  0.459537  0.606823   I need to clean all the dishes


In [7]:
# Vocabulary terms in column order of the vectorizer's output.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. It returns an array, so wrap in list() to keep dic a plain list.
dic = list(vectorizer.get_feature_names_out())

In [8]:
# Term-by-topic table: one row per vocabulary term, one column per SVD component.
# svd.components_ is laid out topic-by-term, so it is transposed before labelling.
word_matrix = pd.DataFrame(
    svd.components_.T,
    index=dic,
    columns=['topic1', 'topic2'],
)
print(word_matrix)

           topic1    topic2
all      0.130636  0.250980
and      0.395183 -0.201851
bright   0.269079 -0.090917
car      0.395183 -0.201851
clean    0.301359  0.291998
dishes   0.130636  0.250980
good     0.232161 -0.165106
have     0.072421  0.297408
is       0.395183 -0.201851
kitchen  0.072421  0.297408
love     0.072421  0.297408
my       0.072421  0.297408
need     0.130636  0.250980
old      0.232161 -0.165106
the      0.403319 -0.003218
to       0.160093  0.432355


In [9]:
# Add the absolute value of each topic column so terms can be ranked by
# magnitude regardless of sign.
word_matrix["abs_topic1"] = word_matrix["topic1"].abs()
word_matrix["abs_topic2"] = word_matrix["topic2"].abs()

In [10]:
# Show the terms ordered by the magnitude of their topic1 value, largest first.
print(word_matrix.sort_values(by="abs_topic1", ascending=False))

           topic1    topic2  abs_topic1  abs_topic2
the      0.403319 -0.003218    0.403319    0.003218
and      0.395183 -0.201851    0.395183    0.201851
car      0.395183 -0.201851    0.395183    0.201851
is       0.395183 -0.201851    0.395183    0.201851
clean    0.301359  0.291998    0.301359    0.291998
bright   0.269079 -0.090917    0.269079    0.090917
good     0.232161 -0.165106    0.232161    0.165106
old      0.232161 -0.165106    0.232161    0.165106
to       0.160093  0.432355    0.160093    0.432355
all      0.130636  0.250980    0.130636    0.250980
dishes   0.130636  0.250980    0.130636    0.250980
need     0.130636  0.250980    0.130636    0.250980
have     0.072421  0.297408    0.072421    0.297408
kitchen  0.072421  0.297408    0.072421    0.297408
love     0.072421  0.297408    0.072421    0.297408
my       0.072421  0.297408    0.072421    0.297408
