In [None]:
import numpy as np

In [None]:
# Small 4x9 integer matrix used as the input for the SVD example.
A = np.array([
    [0, 0, 0, 1, 0, 1, 1, 0, 0],
    [0, 0, 0, 1, 1, 0, 1, 0, 0],
    [0, 1, 1, 0, 2, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 0, 0, 1, 1],
])
print(A.shape)

In [None]:
# Full singular value decomposition of A. With full_matrices=True the two
# outer factors U and VT are returned as square matrices.
U, s, VT = np.linalg.svd(A, full_matrices=True)
print(np.round(U, 2))
print(U.shape)

In [None]:
# Singular values returned by np.linalg.svd, rounded to two decimals for display.
s.round(2)

In [None]:
print(np.diag(s))  # np.diag turns the singular values into a diagonal matrix

In [None]:
# Build the 4x9 singular-value matrix: zeros everywhere except the main
# diagonal, which carries the four singular values in s.
S = np.zeros((4, 9))
diag = np.arange(4)
S[diag, diag] = s

print(np.round(S, 2))

In [None]:
# Show VT, the third factor returned by np.linalg.svd(A).
print(VT)

In [None]:
# Show U, the first factor returned by np.linalg.svd(A), without rounding.
print(U)

## 뉴스 그룹 데이터 SVD 적용

In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
# Fetch the 20 Newsgroups corpus with headers, footers and quoted replies
# stripped, shuffled with a fixed seed so the order is reproducible.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data
# Report how many samples (documents) were loaded.
print('샘플 의 수 : ', len(documents))

In [None]:
# Inspect the raw text of the second document (index 1).
documents[1]

In [None]:
# List the target (category) names that come with the dataset.
print(dataset.target_names)

In [None]:
news_df = pd.DataFrame({'document': documents})
# Replace every non-letter character with a space. The pattern is a regular
# expression, so regex=True must be passed explicitly: pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the text unchanged.
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
# Drop short words: keep only tokens longer than three characters.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# Lowercase every remaining word.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

In [None]:
# Second document (row label 1) after the cleaning steps.
news_df['clean_doc'][1]

In [None]:
import nltk
nltk.download('stopwords')

# English stop-word list from NLTK.
stop_words = stopwords.words('english')
# Set copy of the list: the filter below tests every token of every document,
# and set membership is O(1) instead of a linear scan of the list.
stop_word_set = set(stop_words)
# Tokenize each cleaned document on whitespace.
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
# Remove the stop words from every token list.
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set])

In [None]:
# Tokens of the second document once stop words have been dropped.
print(tokenized_doc[1])

## TF-IDF 만들기

In [None]:
# Join each document's tokens back into one space-separated string.
# Iterating the Series walks the rows in order, which matches looking up the
# labels 0..n-1 on the default integer index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

In [None]:
# Overwrite 'clean_doc' with the text rebuilt from the stop-word-free tokens.
news_df['clean_doc'] = detokenized_doc

In [None]:
# Check the second document (row label 1) of the current 'clean_doc' column.
news_df['clean_doc'][1]

In [None]:
# Turn the cleaned documents into a TF-IDF matrix (one row per document).
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,  # cap the vocabulary at 1,000 terms
    max_df=0.5,         # ignore terms found in more than half of the documents
    smooth_idf=True,
)
X = vectorizer.fit_transform(news_df['clean_doc'])
print(X.shape)

In [None]:
# Topic modeling: fit a 20-component truncated SVD on the TF-IDF matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
svd_model = TruncatedSVD(
    n_components=20,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
).fit(X)
len(svd_model.components_)

In [None]:
# Shape of the fitted component matrix (components x TF-IDF terms).
np.shape(svd_model.components_)

In [None]:
# Vocabulary terms, indexed by TF-IDF column. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and only fall back on
# older releases. list() keeps `terms` a plain list, as it was before.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = list(vectorizer.get_feature_names())

def get_topics(components, feature_names, n=20):
    """Print the n highest-weighted terms of every topic.

    Args:
        components: 2-D NumPy array with one row per topic and one weight per
            feature column.
        feature_names: Sequence mapping a column index to its term.
        n: Number of top terms to list per topic.

    Returns:
        None. One line per topic is written to stdout in the form
        ``Topic <k>: [(term, weight), ...]``, with weights rounded to five
        decimals and ordered from largest to smallest.
    """
    for idx, topic in enumerate(components):
        # argsort is ascending, so walk the last n positions backwards.
        top_indices = topic.argsort()[:-n - 1:-1]
        # Cast to built-in str/float so the printed list stays readable under
        # NumPy >= 2, where scalar reprs look like np.float64(0.5).
        top_terms = [(str(feature_names[i]), float(topic[i].round(5))) for i in top_indices]
        print("Topic %d:" % (idx+1), top_terms)
# Print the top terms (default n=20) for every fitted SVD component.
get_topics(svd_model.components_,terms)