In [38]:
from newspaper import Article

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import opinion_lexicon
from nltk.tag import pos_tag

from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('opinion_lexicon')

[nltk_data] Downloading package punkt to /Users/ihobbang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ihobbang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ihobbang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/ihobbang/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

## 1. 뉴스 기사 크롤링

In [56]:
# 기사 다운로드 및 텍스트 파싱
url = 'https://finance.yahoo.com/news/exclusive-unilever-launches-bid-sell-133434534.html'

article = Article(url)
article.download()
article.parse()

text = article.text

In [57]:
print(text)

By Abigail Summerville

(Reuters) - Unilever Plc has hired investment banks Morgan Stanley and Evercore Inc to sell a basket of non-core beauty and personal care brands that include Q-Tips and Impulse, reviving an effort it abandoned two years ago, according to people familiar with the matter.

The revival of the sale process, which has not been previously reported, represents the first major move by Hein Schumacher, who took over as Unilever's chief executive in July with a focus to streamline its business as it grapples with inflation.

The brand portfolio, known as Elida Beauty, also includes Caress, TIGI, Timotei, Monsavon, St. Ives, Zwitsal, Ponds, Brut, Moussel, Alberto Balsam and Matey. Elida generated about $760 million in revenue in 2022, according to the sources.

Unilever worked with Credit Suisse in 2021 to divest Elida but pulled the process later that year, after cherry-picking of the brands for sale by other consumer companies led to offers that did not meet its valuatio

## 2. 텍스트 데이터 토큰화

In [66]:
# 단어 토큰화
words = word_tokenize(text)

# 불용어(stopwords) 제거
stop_words = set(stopwords.words('english'))
filtered_tokens = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

# 각 단어의 빈도 계산
freq_dist = FreqDist(filtered_tokens)

for word, frequency in freq_dist.items():
    print(f'{word}: {frequency}')

abigail: 2
summerville: 2
reuters: 1
unilever: 7
plc: 1
hired: 1
investment: 1
banks: 1
morgan: 3
stanley: 3
evercore: 3
inc: 1
sell: 2
basket: 1
beauty: 2
personal: 1
care: 1
brands: 3
include: 1
impulse: 1
reviving: 1
effort: 1
abandoned: 1
two: 2
years: 2
ago: 1
according: 3
people: 1
familiar: 1
matter: 2
revival: 1
sale: 2
process: 2
previously: 1
reported: 1
represents: 1
first: 1
major: 1
move: 1
hein: 1
schumacher: 1
took: 1
chief: 1
executive: 1
july: 1
focus: 1
streamline: 1
business: 1
grapples: 1
inflation: 1
brand: 1
portfolio: 2
known: 1
elida: 5
also: 3
includes: 1
caress: 1
tigi: 1
timotei: 1
monsavon: 1
ives: 1
zwitsal: 1
ponds: 1
brut: 1
moussel: 1
alberto: 1
balsam: 1
matey: 1
generated: 1
760: 1
million: 1
revenue: 1
2022: 1
sources: 5
worked: 2
credit: 1
suisse: 1
2021: 1
divest: 1
pulled: 1
later: 1
year: 1
consumer: 2
companies: 1
led: 1
offers: 1
meet: 1
valuation: 1
expectations: 1
said: 1
since: 1
make: 1
autonomous: 1
unit: 1
could: 2
appeal: 1
entirety: 1
pr

## 3. 상위 빈도 10개 단어

In [67]:
# 빈도가 가장 높은 상위 단어 출력 (예: 상위 10개)
most_common_words = freq_dist.most_common(10)

for word, frequency in most_common_words:
    print(f'{word}: {frequency}')

unilever: 7
elida: 5
sources: 5
morgan: 3
stanley: 3
evercore: 3
brands: 3
according: 3
also: 3
abigail: 2


## 4. 뉴스 기사 단어 수 계산

In [68]:
word_count = len(words)

# 결과 출력
print(f"뉴스 기사의 길이 (단어 수): {word_count}")

뉴스 기사의 길이 (단어 수): 390


## 5. 문장의 평균 길이 계산

In [59]:
# 문장 토큰화
sentences = sent_tokenize(text)

# 각 문장의 길이 측정
sentence_lengths = [len(word_tokenize(sentence)) for sentence in sentences]

# 평균 문장 길이 계산
average_sentence_length = sum(sentence_lengths) / len(sentence_lengths)

print(f"문장의 평균 길이: {average_sentence_length}")

문장의 평균 길이: 27.857142857142858


## 6. 긍부정 단어 빈도 계산

In [60]:
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

# 긍정적인 단어와 부정적인 단어의 빈도 계산
positive_word_count = sum(1 for word in words if word in positive_words)
negative_word_count = sum(1 for word in words if word in negative_words)

# 결과 출력
print(f"긍정적인 단어 빈도: {positive_word_count}")
print(f"부정적인 단어 빈도: {negative_word_count}")

긍정적인 단어 빈도: 7
부정적인 단어 빈도: 2


## 7. 최고빈도 명사 추출

In [61]:
# 텍스트에서 명사 추출
def extract_nouns(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    nouns = [word for word, tag in tagged if tag in ['NN', 'NNS', 'NNP', 'NNPS']]  # 명사 태그를 포함하는 단어 추출
    return nouns

nouns = extract_nouns(text)

# 명사 빈도 계산
noun_freq = Counter(nouns)

# 가장 많이 등장하는 명사 출력 
most_common_nouns = noun_freq.most_common(1)

for noun, freq in most_common_nouns:
    print(f'{noun}: {freq}')

Unilever: 7


## 8. 텍스트 요약

In [62]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [63]:
print(summarizer(text, max_length=100, min_length=30, do_sample=False))

[{'summary_text': "Unilever has hired investment banks Morgan Stanley and Evercore Inc to sell a basket of non-core beauty and personal care brands. The revival of the sale process represents the first major move by Hein Schumacher, who took over as Unilever's chief executive."}]
