In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD

from nltk.corpus import stopwords

In [5]:
# Toy corpus: six short English sentences that each contain the word "small".
text1 = "It is a very small window."
text2 = "The small birds were singing softly."
text3 = "The vase had a very small neck, and spread out at the top like a bowl."
text4 = "There are three small rooms up stairs."
text5 = "A small plastic bag fell out of the pocket."
text6 = "The house has but two small second story bedrooms."

# One row per sentence, in the order text1..text6, under a single "sentences" column.
data = pd.DataFrame({"sentences": [text1, text2, text3, text4, text5, text6]})
print(data)

                                           sentences
0                         It is a very small window.
1               The small birds were singing softly.
2  The vase had a very small neck, and spread out...
3             There are three small rooms up stairs.
4        A small plastic bag fell out of the pocket.
5  The house has but two small second story bedro...


In [6]:
# Normalise the raw sentences into `clean_sentences`:
#   1. replace every character that is not an ASCII letter or '#' with a space,
#   2. drop words of two characters or fewer,
#   3. lower-case what remains.
# `regex=True` is passed explicitly: the pattern is a character class, and
# without the flag newer pandas versions treat it as a literal string, which
# would leave the text unchanged (older versions emit a FutureWarning instead).
data['clean_sentences'] = (
    data['sentences']
    .str.replace("[^a-zA-Z#]", " ", regex=True)
    .fillna('')  # missing sentences become empty strings so the steps below never see NaN
    .apply(lambda sentence: ' '.join(word for word in sentence.split() if len(word) > 2))
    .str.lower()
)

data.head()

  data['clean_sentences'] = data['sentences'].str.replace("[^a-zA-Z#]", " ")


Unnamed: 0,sentences,clean_sentences
0,It is a very small window.,very small window
1,The small birds were singing softly.,the small birds were singing softly
2,"The vase had a very small neck, and spread out...",the vase had very small neck and spread out th...
3,There are three small rooms up stairs.,there are three small rooms stairs
4,A small plastic bag fell out of the pocket.,small plastic bag fell out the pocket


In [7]:
# English stop-word list from NLTK, used below to filter tokens out of
# `clean_sentences`.
# NOTE(review): this presumably needs the NLTK `stopwords` corpus to be
# available locally (e.g. via nltk.download('stopwords')); no download call is
# visible in this notebook — confirm before running on a fresh environment.
stop_words = stopwords.words('english')

In [8]:
# Remove NLTK stop words from `clean_sentences`.
#
# `tokenized`   : Series holding, per row, the list of words that survive the filter.
# `detokenized` : plain list with those words re-joined by single spaces, in row order.

# Hash-based lookup so each membership test does not rescan the whole stop-word list.
stop_word_lookup = frozenset(stop_words)

tokenized = (
    data['clean_sentences']
    .fillna('')
    .apply(lambda sentence: [word for word in sentence.split() if word not in stop_word_lookup])
)

# Iterate over the Series *values* rather than indexing with range(len(data)):
# `tokenized[i]` is a label lookup and fails (or picks the wrong row) as soon as
# the frame's index is not exactly 0..n-1, e.g. after rows were filtered out.
detokenized = [' '.join(words) for words in tokenized]

# A list is assigned positionally, so row order is preserved regardless of the index.
data['clean_sentences'] = detokenized
data.head()

Unnamed: 0,sentences,clean_sentences
0,It is a very small window.,small window
1,The small birds were singing softly.,small birds singing softly
2,"The vase had a very small neck, and spread out...",vase small neck spread top like bowl
3,There are three small rooms up stairs.,three small rooms stairs
4,A small plastic bag fell out of the pocket.,small plastic bag fell pocket


In [9]:
# Build the TF-IDF document-term matrix for the cleaned sentences.
# The vectorizer applies scikit-learn's built-in English stop-word list and
# smoothed IDF weights.
vectorizer = TfidfVectorizer(smooth_idf=True, stop_words='english')

# Learn the vocabulary and IDF weights first, then project the same corpus;
# `X` is the resulting sparse matrix.
vectorizer.fit(data['clean_sentences'])
X = vectorizer.transform(data['clean_sentences'])

# Dense view for display only.
X.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.40572238, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.91399636],
       [0.        , 0.        , 0.55927514, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.55927514, 0.24826187, 0.55927514,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.4386536 , 0.        ,
        0.        , 0.4386536 , 0.4386536 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.19471804, 0.        ,
        0.4386536 , 0.        , 0.        , 0.4386536 , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.67465286, 0.        , 0.        , 0

In [10]:
# Latent semantic analysis step: reduce the TF-IDF matrix `X` to two components
# with a randomized truncated SVD. The fixed `random_state` keeps the
# randomized solver reproducible between runs.
svd_model = TruncatedSVD(
    n_components=2,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)

# `lsa` holds the transformed documents returned by the fitted model.
lsa = svd_model.fit_transform(X)

In [11]:
# Vocabulary learned by the TF-IDF vectorizer, in column order of `X`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    # The newer API returns an ndarray; convert so `dictionary` stays a plain list.
    dictionary = list(vectorizer.get_feature_names_out())
else:
    dictionary = vectorizer.get_feature_names()

# One label per SVD component, derived from the fitted model so the labels
# always match `n_components` (yields "topic_1", "topic_2" for two components).
topic_labels = ["topic_{}".format(k + 1) for k in range(svd_model.components_.shape[0])]

# Term-by-topic table: rows are vocabulary terms, columns are topics, and each
# cell is that term's weight in the corresponding SVD component.
encoding_matrix = pd.DataFrame(svd_model.components_, index=topic_labels, columns=dictionary).T

In [12]:
# Display the term-by-topic table built above: one row per vocabulary term,
# one column per topic.
encoding_matrix


Unnamed: 0,topic_1,topic_2
bag,0.150999,-0.147089
bedrooms,0.150999,-0.147089
birds,0.191217,-0.06219
bowl,0.124759,0.40281
fell,0.150999,-0.147089
house,0.150999,-0.147089
like,0.124759,0.40281
neck,0.124759,0.40281
plastic,0.150999,-0.147089
pocket,0.150999,-0.147089
