## Coding Exercise #0712

In [1]:
import numpy as np
import warnings
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition  import TruncatedSVD
warnings.filterwarnings('ignore')

### 1. Latent Semantic Analysis (LSA):

In [2]:
# Toy corpus: seven one-sentence documents used for the LSA demo.
my_docs = [
    "The economic slowdown is becoming more severe",
    "The movie was simply awesome",
    "I like cooking my own food",
    "Samsung is announcing a new technology",
    "Machine Learning is an example of awesome technology",
    "All of us were excited at the movie",
    "We have to do more to reverse the economic slowdown",
]

#### 1.1. Create a TF IDF representation:
TfidfVectorizer() arguments: <br>
- *max_features* : maximum number of features (distinct words). <br>
- *min_df* : The minimum document frequency (DF). An integer is an absolute document count; a real number (0~1) is a proportion of the documents. <br> 
- *max_df* : The maximum document frequency (DF). An integer is an absolute document count; a real number (0~1) is a proportion of the documents. Helps to filter out the stop words. <br> 

In [3]:
# Normalise every document to lower case.
my_docs = list(map(str.lower, my_docs))

In [4]:
# Extra words to drop in addition to NLTK's English stop-word list.
my_stop_words = ["us", "like"]

In [5]:
# Build the TF-IDF representation. The vocabulary is capped at 15 terms,
# and NLTK's English stop words plus the custom ones are excluded.
vectorizer = TfidfVectorizer(
    max_features=15,
    min_df=1,
    max_df=3,
    stop_words=stopwords.words('english') + my_stop_words,
)
# Convert the sparse result to a dense NumPy array.
X = vectorizer.fit_transform(my_docs).toarray()

In [6]:
# Shape of the TF-IDF matrix X, (m, n): m = number of documents (7 here),
# n = number of features kept by the vectorizer.
X.shape

(7, 15)

In [7]:
# View the features (the vocabulary terms, in column order of X).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # The newer API returns an ndarray; convert it so `features` stays a plain list of str.
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
print(features)

['announcing', 'awesome', 'cooking', 'economic', 'example', 'excited', 'food', 'movie', 'new', 'reverse', 'samsung', 'severe', 'simply', 'slowdown', 'technology']


#### 1.2. Apply the truncated SVD:

In [8]:
n_topics = 4
# TruncatedSVD uses a randomized solver by default, so pin the seed to make
# the decomposition reproducible across kernel restarts.
svd = TruncatedSVD(n_components=n_topics, n_iter=100, random_state=0)
svd.fit(X)

TruncatedSVD(n_components=4, n_iter=100)

In [9]:
# The fitted components are the rows of V^t (one row per topic).
vt = svd.components_
# Keep the magnitudes as well: features are later ranked by absolute weight.
vtabs = np.absolute(vt)

In [10]:
# Check the shape of V^t: (number of topics, number of features).
vt.shape

(4, 15)

In [11]:
# Display V^t: each row holds one topic's weights over the features.
vt

array([[ 5.62288440e-17,  9.50380366e-18, -2.23871986e-18,
         6.05710899e-01,  9.75226714e-18,  9.62765708e-19,
        -2.23871986e-18,  2.35367156e-18,  4.27290993e-17,
         3.64848333e-01,  4.27290993e-17,  3.64848333e-01,
         1.94918609e-18,  6.05710899e-01,  4.34982063e-17],
       [ 1.09328020e-01,  5.24120449e-01, -4.92767842e-17,
         9.37746298e-18,  2.79641926e-01,  3.00367284e-01,
        -4.92767842e-17,  5.41324276e-01,  1.09328020e-01,
         1.69334126e-17,  1.09328020e-01, -1.55329901e-17,
         3.51763163e-01,  9.37746298e-18,  3.22878460e-01],
       [ 3.17757779e-01,  1.09242359e-01, -2.11112265e-15,
        -2.33185786e-17,  2.84805383e-01, -3.73322920e-01,
        -2.11227461e-15, -4.37060653e-01,  3.17757779e-01,
        -7.57931141e-17,  3.17757779e-01,  3.87128567e-17,
        -1.53201700e-01, -2.33185919e-17,  5.00179172e-01],
       [ 6.49154587e-16,  4.20169827e-16,  7.07106781e-01,
         1.60752623e-17,  1.05982378e-15, -9.84630874

#### 1.3. From each topic, extract the top features:

In [None]:
n_top = 3
# For each topic (a row of |V^t|), rank the features by descending absolute
# weight and keep the n_top strongest. argsort() returns the sorting indices,
# so negating the weights yields a descending order.
# Building the list of lists in one expression means topic_matrix is always
# (re)defined, even when n_topics is 0, and never carries over stale state.
topic_matrix = [
    [features[idx] for idx in np.argsort(-vtabs[topic, :])[:n_top]]
    for topic in range(n_topics)
]

In [None]:
# Show the top features for each topic (one inner list per topic).
topic_matrix

In [None]:
# Human-readable topic labels, chosen by inspecting each topic's top features.
# The order must follow the topic order of the decomposition.
topic_names = [
    "Economy",
    "Movie",
    "Technology",
    "Cuisine",
]

#### 1.4. Label each document with the most predominant topic:

In [None]:
# Label each document with the topic whose top features it mentions most often.
# A topic only replaces the current pick when its score is strictly higher,
# so ties go to the lowest-numbered topic.
n_docs = len(my_docs)
for doc_idx, doc in enumerate(my_docs):
    tokens = nltk.word_tokenize(doc)
    best_topic = None      # None means no topic feature was found in the document.
    best_score = 0
    for topic_idx in range(n_topics):
        top_words = set(topic_matrix[topic_idx])
        # Count how many tokens of the document are top features of this topic.
        score = sum(token in top_words for token in tokens)
        if score > best_score:
            best_score = score
            best_topic = topic_idx
    # Previously a document with no match silently fell back to topic 0;
    # report it as unknown instead of mislabeling it.
    label = topic_names[best_topic] if best_topic is not None else "Unknown"
    print(f"Document {doc_idx + 1} = {label}")

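**NOTE**: Some documents are labeled inaccurately, because each label depends only on how many of a topic's top few features appear in the document.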