# LSA 토픽 모델링 (imdb.csv — 실제 내용은 영화 리뷰가 아닌 뉴스 기사 데이터)

## 1.데이터 로드

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Load the CSV file into a DataFrame (adjust the path to where the file lives).
imdb_df = pd.read_csv('imdb.csv')
print(imdb_df.shape)

# Keep only the first 1000 rows. head() returns a slice of the original frame;
# .copy() makes it an independent DataFrame so that adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
imdb_df = imdb_df.head(1000).copy()

# Print the first description (positional access, independent of index labels).
print("첫 번째 리뷰:")
print(imdb_df['Description'].iloc[0])
print('리뷰 개수:', len(imdb_df))
imdb_df.info()

(120000, 3)
첫 번째 리뷰:
Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
리뷰 개수: 1000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class Index  1000 non-null   int64 
 1   Title        1000 non-null   object
 2   Description  1000 non-null   object
dtypes: int64(1), object(2)
memory usage: 23.6+ KB


In [9]:
# Display only the Title and Description columns (all rows).
imdb_df.loc[:, ['Title', 'Description']]

Unnamed: 0,Title,Description
0,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
...,...,...
995,U.S. Stocks Rebound as Oil Prices Ease,NEW YORK (Reuters) - U.S. stocks rebounded on...
996,Dollar Rises Vs Euro After Asset Data,NEW YORK (Reuters) - The dollar gained agains...
997,Bikes Bring Internet to Indian Villagers (AP),"AP - For 12-year-old Anju Sharma, hope for a b..."
998,Celebrity Chefs Are Everywhere in Vegas,By ADAM GOLDMAN LAS VEGAS (AP) -- The waite...


In [11]:
# Plain-text rendering of the Title and Description columns.
print(imdb_df.loc[:, ['Title', 'Description']])

                                                 Title  \
0    Wall St. Bears Claw Back Into the Black (Reuters)   
1    Carlyle Looks Toward Commercial Aerospace (Reu...   
2      Oil and Economy Cloud Stocks' Outlook (Reuters)   
3    Iraq Halts Oil Exports from Main Southern Pipe...   
4    Oil prices soar to all-time record, posing new...   
..                                                 ...   
995             U.S. Stocks Rebound as Oil Prices Ease   
996              Dollar Rises Vs Euro After Asset Data   
997      Bikes Bring Internet to Indian Villagers (AP)   
998            Celebrity Chefs Are Everywhere in Vegas   
999   Entertainment World Wary of Microsoft Technology   

                                           Description  
0    Reuters - Short-sellers, Wall Street's dwindli...  
1    Reuters - Private investment firm Carlyle Grou...  
2    Reuters - Soaring crude prices plus worries\ab...  
3    Reuters - Authorities have halted oil export\f...  
4    AFP - Tearawa

## 2.전처리

In [18]:
# Text cleaning, done as one pipeline on the Description column:
#   1. replace every non-alphabetic character with a space (regex=True: the
#      pattern is a regular expression),
#   2. drop words of length 3 or less,
#   3. normalise everything to lower case.
imdb_df['clean_doc'] = (
    imdb_df['Description']
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .map(lambda text: ' '.join(word for word in text.split() if len(word) > 3))
    .str.lower()
)

print(imdb_df['clean_doc'].head())


0    reuters short sellers wall street dwindling ba...
1    reuters private investment firm carlyle group ...
2    reuters soaring crude prices plus worries abou...
3    reuters authorities have halted export flows f...
4    tearaway world prices toppling records straini...
Name: clean_doc, dtype: object


## 3.TF-IDF 벡터화

In [16]:
# Build the TF-IDF document-term matrix from the cleaned text: English stop
# words are removed, terms appearing in more than half of the documents are
# ignored, and the vocabulary is capped at 1000 terms.
vectorizer = TfidfVectorizer(
    max_features=1000,
    max_df=0.5,
    stop_words='english',
    smooth_idf=True,
)
X = vectorizer.fit_transform(imdb_df['clean_doc'])

# (number of documents, number of terms)
X.shape

(1000, 1000)

## 4.LSA 적용

In [19]:
# Fit a 20-component truncated SVD (LSA) on the TF-IDF matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
svd_model = TruncatedSVD(
    n_components=20,
    algorithm='randomized',
    n_iter=100,
    random_state=42,
).fit(X)

print(svd_model)

TruncatedSVD(n_components=20, n_iter=100, random_state=42)


## 5.토픽 확인

In [23]:
# This cell defines a function that lists, for each document, the n topics
# with the highest weight.
# Vocabulary terms from the fitted TF-IDF vectorizer; passed to get_topics
# later to label the entries of each topic component.
terms = vectorizer.get_feature_names_out()

def get_document_topics(lsa_matrix, n=5, max_docs=10):
    """Print the n highest-weighted topics for the leading documents.

    Args:
        lsa_matrix: Iterable of per-document weight vectors (one row per
            document, one entry per topic). Each row must support
            ``argsort`` and integer indexing, e.g. a NumPy 2-D array.
        n: Number of topics to list per document.
        max_docs: Maximum number of documents to print. Defaults to 10;
            pass ``None`` to print every document.

    Topics are ranked by their signed weight in descending order, so a large
    negative weight ranks last rather than first.
    """
    from itertools import islice

    # islice caps the iteration without requiring lsa_matrix to be sliceable;
    # a stop of None means "no limit".
    for idx, doc in enumerate(islice(lsa_matrix, max_docs)):
        # argsort is ascending, so reverse it and keep the first n indices.
        top_topic_ids = doc.argsort()[::-1][:n]
        print(f"Document {idx+1}:", [(f"Topic {i+1}", doc[i].round(5)) for i in top_topic_ids])

# Project the TF-IDF matrix onto the fitted SVD components to get the
# per-document topic weights.
lsa_matrix = svd_model.transform(X)

# Print the top 5 topics for the leading documents.
get_document_topics(lsa_matrix, n=5)

Document 1: [('Topic 1', 0.10501), ('Topic 4', 0.07552), ('Topic 8', 0.06529), ('Topic 10', 0.04661), ('Topic 18', 0.04259)]
Document 2: [('Topic 1', 0.14318), ('Topic 20', 0.06137), ('Topic 4', 0.05243), ('Topic 15', 0.04796), ('Topic 8', 0.04223)]
Document 3: [('Topic 8', 0.31316), ('Topic 6', 0.19527), ('Topic 11', 0.1857), ('Topic 1', 0.18284), ('Topic 4', 0.15127)]
Document 4: [('Topic 1', 0.17524), ('Topic 5', 0.13552), ('Topic 12', 0.1175), ('Topic 4', 0.05847), ('Topic 18', 0.04912)]
Document 5: [('Topic 15', 0.12098), ('Topic 8', 0.11785), ('Topic 1', 0.10466), ('Topic 14', 0.08082), ('Topic 20', 0.06878)]
Document 6: [('Topic 1', 0.17878), ('Topic 8', 0.1302), ('Topic 10', 0.09047), ('Topic 4', 0.06776), ('Topic 20', 0.05736)]
Document 7: [('Topic 1', 0.17361), ('Topic 15', 0.16632), ('Topic 12', 0.09141), ('Topic 6', 0.07011), ('Topic 13', 0.06996)]
Document 8: [('Topic 1', 0.1635), ('Topic 15', 0.12911), ('Topic 20', 0.10043), ('Topic 4', 0.08301), ('Topic 13', 0.07571)]
Do

In [24]:
# Function that prints, for each topic, only the n terms with the highest weight.

def get_topics(components, feature_names, n=5):
    """Print the n highest-weighted terms of every topic.

    Args:
        components: Iterable of per-topic weight vectors (one entry per term);
            each vector must support ``argsort`` and integer indexing.
        feature_names: Sequence mapping a term index to its label.
        n: Number of terms to list per topic.

    Terms are ranked by signed weight, largest first.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort is ascending: reverse it, then keep the leading n indices.
        top_term_ids = weights.argsort()[::-1][:n]
        top_terms = [(feature_names[j], weights[j].round(5)) for j in top_term_ids]
        print("Topic %d:"%topic_no, top_terms)

# Print the top terms of each topic (n is left at its default of 5).
get_topics(svd_model.components_, terms)

Topic 1: [('reuters', 0.36062), ('monday', 0.28934), ('said', 0.25494), ('sunday', 0.22402), ('athens', 0.17717)]
Topic 2: [('athens', 0.41798), ('olympic', 0.30958), ('sunday', 0.25798), ('team', 0.23184), ('phelps', 0.17064)]
Topic 3: [('president', 0.27886), ('chavez', 0.22478), ('hugo', 0.22478), ('referendum', 0.22148), ('venezuela', 0.21818)]
Topic 4: [('monday', 0.28527), ('reuters', 0.23932), ('profit', 0.15149), ('sales', 0.12486), ('billion', 0.11989)]
Topic 5: [('space', 0.29037), ('said', 0.24735), ('people', 0.11468), ('scientists', 0.11404), ('national', 0.10832)]
Topic 6: [('space', 0.34336), ('google', 0.32354), ('public', 0.15628), ('search', 0.14791), ('auction', 0.13344)]
Topic 7: [('space', 0.39243), ('nasa', 0.13092), ('earth', 0.12246), ('telescope', 0.10664), ('software', 0.10635)]
Topic 8: [('york', 0.22706), ('prices', 0.2218), ('hurricane', 0.21618), ('florida', 0.18107), ('charley', 0.16682)]
Topic 9: [('hurricane', 0.24158), ('charley', 0.22961), ('florida',