In [28]:
import pandas as pd 
from nltk.tokenize import word_tokenize
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
import re
import string

def clean_text(s):
    """
    Normalize a piece of text by stripping punctuation and extra whitespace.

    Every character listed in ``string.punctuation`` is deleted (it is not
    replaced by a space, so ``"don't"`` becomes ``"dont"``). After that, every
    run of whitespace is collapsed into a single space and leading/trailing
    whitespace is dropped.

    :param s: string
    :return: cleaned string
    """
    # remove all punctuations using regex and string module.
    # This has to happen BEFORE the whitespace is normalized: a punctuation
    # mark that is surrounded by spaces ("hello , world") would otherwise
    # leave two consecutive spaces behind once it is deleted.
    s = re.sub(f"[{re.escape(string.punctuation)}]", "", s)
    # split by all whitespaces and join with a single space; this removes
    # repeated blanks as well as leading/trailing ones
    return " ".join(s.split())

In [30]:
# read the first 10000 rows of the csv, clean the text of the "review"
# column and keep only the cleaned texts as an array
corpus = (
    pd.read_csv("../input/imdb.csv", nrows=10000)["review"]
    .apply(clean_text)
    .values
)

In [31]:
# tf-idf vectorizer that tokenizes with nltk's word_tokenize
# (token_pattern is set to None since a custom tokenizer is supplied)
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
# learn the vocabulary/idf weights and build the document-term matrix in a
# single pass; calling fit() and then transform() on the same corpus gives
# the same matrix but tokenizes every document twice
corpus_transformed = tfv.fit_transform(corpus)

In [32]:
# initialize svd with 10 components.
# TruncatedSVD uses a randomized solver by default, so the seed is fixed
# to make the extracted components reproducible from run to run
svd = decomposition.TruncatedSVD(n_components=10, random_state=42)

In [33]:
# fit svd on the tf-idf matrix.
# fit() hands back the estimator itself, so corpus_svd is simply the
# fitted TruncatedSVD object (it is not a transformed version of the corpus)
svd.fit(corpus_transformed)
corpus_svd = svd

In [34]:
# choose the first svd component and create a dictionary
# of feature names and their scores in that component.
# NOTE: despite its name, sample_index selects a row of
# corpus_svd.components_ (one row per svd component), not a document
# of the corpus.
# you can change the sample_index variable to
# get the dictionary for any other component
sample_index = 0
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions
feature_names = (
    tfv.get_feature_names_out()
    if hasattr(tfv, "get_feature_names_out")
    else tfv.get_feature_names()
)
feature_scores = dict(zip(feature_names, corpus_svd.components_[sample_index]))

In [35]:
# with the dictionary in hand, rank its terms from the highest
# score to the lowest and show the N best ones
N = 5
print(
    [
        term
        for term, _score in sorted(
            feature_scores.items(), key=lambda item: item[1], reverse=True
        )
    ][:N]
)

['the', 'a', 'and', 'of', 'to']


In [36]:
N = 10
# the vocabulary is the same for every component, so look it up once
# instead of rebuilding it on each iteration.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions
feature_names = (
    tfv.get_feature_names_out()
    if hasattr(tfv, "get_feature_names_out")
    else tfv.get_feature_names()
)
# sample_index walks over the first 5 rows of corpus_svd.components_
# (svd components, not documents) and prints the N highest-scoring
# terms of each one
for sample_index in range(5):
    feature_scores = dict(zip(feature_names, corpus_svd.components_[sample_index]))
    print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

['the', 'a', 'and', 'of', 'to', 'is', 'i', 'in', 'it', 'this']
['i', 'movie', 'it', 'was', 'this', 'you', 'my', 'me', 'have', 'watch']
['the', 'was', 'i', 'were', 'of', 'book', 'had', 'series', 'first', 'did']
['her', 'was', 'she', 'i', 'he', 'his', 'and', 'him', 'to', 'in']
['br', 'to', 'they', 'he', 'show', 'itbr', 'no', 'have', 'were', 'you']


In [37]:
# quick manual check of clean_text on a string that contains punctuation,
# a run of several spaces and trailing whitespace; the cleaned string is
# shown as the cell output
x = "Texto, de .c. prueba    cd. "
clean_text(x)

'Texto de c prueba cd'