# 요약할 텍스트 로드

In [32]:
import urllib.request
import pandas as pd
import numpy as np
# Download the news-summary CSV from GitHub into the current working directory.
urllib.request.urlretrieve("https://raw.githubusercontent.com/sunnysai12345/News_Summary/master/news_summary_more.csv", filename="news_summary_more.csv")
# Load the downloaded file into a DataFrame, decoding it as ISO-8859-1.
# NOTE(review): the sampled rows show mojibake around a currency symbol, so
# this encoding may not match the file's real encoding - confirm.
data = pd.read_csv('news_summary_more.csv', encoding='iso-8859-1')

# Display 10 randomly sampled rows as a quick sanity check of the data.
data.sample(10)

Unnamed: 0,headlines,text
77669,Lalu Prasad calls Nitish Kumar 'palturam' of I...,"Talking about Bihar CM Nitish Kumar, RJD chief..."
89894,United made a terrible mistake in dragging fli...,"Billionaire Warren Buffett, the top investor i..."
10353,4 minor girls flee Bihar shelter home using du...,The police on Sunday said that four minor girl...
14842,Scientists 3D-print cement paste that gets str...,Researchers at USA's Purdue University have 3D...
82880,"First look poster of Sidharth, Manoj's 'Aiyaar...",The first look poster of the Sidharth Malhotra...
36829,Virgin Galactic finishes 1st powered flight si...,Richard Branson-led commercial spaceflight Vir...
7343,UpGrad announces scholarship worth Ã¢ÂÂ¹5 Cr ...,UpGrad has announced a scholarship worth Ã¢ÂÂ...
37657,Xiaomi cuts valuation to $55-70 billion ahead ...,Chinese smartphone maker Xiaomi has reportedly...
57603,BHU exam paper asks to explain GST in Kautilya...,Students in Banaras Hindu University were aske...
16320,1 in 20 deaths worldwide caused due to alcohol...,Alcohol is responsible for over three million ...


# 추출적 요약 Extractive Summarization

In [49]:
from summa.summarizer import summarize

# Show the first row's headline (column 0) next to an extractive summary of
# its article text (column 1) produced by summa's summarize().
# ratio=0.4 is forwarded to summarize(); presumably the fraction of the
# original sentences to keep - see the summa documentation to confirm.
print("headlines : ",data.iloc[0, 0])
print("extractive summarization : ", summarize(data.iloc[0, 1], ratio=0.4))

headlines :  upGrad learner switches to career in ML & Al with 90% salary hike
extractive summarization :  upGrad's Online Power Learning has powered 3 lakh+ careers.


In [60]:
# Summarise every article body with summa (ratio=0.4) and name the resulting
# Series "extractive_summary" so it can later be attached as a column.
res_ext = (
    data["text"]
    .apply(lambda article: summarize(article, ratio=0.4))
    .rename("extractive_summary")
)
res_ext

0        upGrad's Online Power Learning has powered 3 l...
1        Users get one CRED coin per rupee of bill paid...
2        The match witnessed India getting all out for ...
3        Also, customers have options to insure against...
4        Speaking about the sexual harassment allegatio...
                               ...                        
98396    A CRPF jawan was on Tuesday axed to death with...
98397    'Uff Yeh', the first song from the Sonakshi Si...
98398    Michael B Jordan will reportedly play the lead...
98399    The video also shows a TV airing a news confer...
98400                                                     
Name: extractive_summary, Length: 98401, dtype: object

In [61]:
# Build a frame that pairs each headline with its extractive summary:
# drop() returns a new DataFrame without "text" (leaving `data` untouched),
# and the summaries are then attached column-wise, aligned on the index.
summary_df = pd.concat([data.drop(columns=["text"]), res_ext], axis=1)
summary_df

Unnamed: 0,headlines,extractive_summary
0,upGrad learner switches to career in ML & Al w...,upGrad's Online Power Learning has powered 3 l...
1,Delhi techie wins free food from Swiggy for on...,Users get one CRED coin per rupee of bill paid...
2,New Zealand end Rohit Sharma-led India's 12-ma...,The match witnessed India getting all out for ...
3,Aegon life iTerm insurance plan helps customer...,"Also, customers have options to insure against..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...
...,...,...
98396,CRPF jawan axed to death by Maoists in Chhatti...,A CRPF jawan was on Tuesday axed to death with...
98397,First song from Sonakshi Sinha's 'Noor' titled...,"'Uff Yeh', the first song from the Sonakshi Si..."
98398,'The Matrix' film to get a reboot: Reports,Michael B Jordan will reportedly play the lead...
98399,Snoop Dogg aims gun at clown dressed as Trump ...,The video also shows a TV airing a news confer...


# 추상적 요약 Abstractive Summarization
## 1. 전처리

In [73]:
# Re-read the CSV from disk so the abstractive-summarization preprocessing
# starts from a freshly loaded DataFrame.
data = pd.read_csv('news_summary_more.csv', encoding='iso-8859-1')

In [74]:
# Check how many rows the data has (along with column names and dtypes).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98401 entries, 0 to 98400
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headlines  98401 non-null  object
 1   text       98401 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [75]:
# Count the distinct (non-duplicated) values in each column.
print("헤드라인의 중복이 아닌 개수 : ", data["headlines"].nunique())
print("중복이 아닌 텍스트 개수 : ", data["text"].nunique())

헤드라인의 중복이 아닌 개수 :  98280
중복이 아닌 텍스트 개수 :  98360


In [76]:
# There are 98,401 texts but only 98,360 distinct ones, i.e. 41 duplicated rows.
# Because duplicated texts exist, remove them.

# Drop rows whose "text" value is duplicated, modifying `data` in place.
data.drop_duplicates(subset=["text"], inplace=True)
# NOTE: data.info() prints its report itself and returns None, so wrapping it
# in print() also emits a trailing "None" line.
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98360 entries, 0 to 98400
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headlines  98360 non-null  object
 1   text       98360 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB
None


In [77]:
"""
After removing the duplicated texts, 98,360 headline/text pairs remain.
Next, check for null values so that they can be removed.
"""
# Per-column count of null entries.
print(data.isnull().sum())

headlines    0
text         0
dtype: int64


In [78]:
# Null handling was planned, but the data contains no null values.
# Next, prepare the text-normalization dictionary, which maps each English
# contraction to its expanded form.
contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

# Number of entries in the contraction mapping.
len(contractions)

import re
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences


120

In [79]:
# With the normalization dictionary ready, take a look at the stopwords next.
from nltk.corpus import stopwords
# Display NLTK's English stopword list.
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [101]:
# Preprocess a sentence with regular expressions and the normalization
# dictionary, then return its cleaned tokens joined by single spaces.
import functools
import re


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, built only once.

    Calling ``stopwords.words('english')`` for every word rebuilds the list
    each time and membership tests on a list are linear; caching it as a
    frozenset makes each lookup O(1).
    """
    return frozenset(stopwords.words('english'))


def preprocessing(sentence, remove_stopwords=True):
    """Normalize an English sentence into a space-separated token string.

    Steps, in order:
      1. lowercase the sentence;
      2. remove parenthesised spans such as ``(or finish)``;
      3. remove double-quote characters;
      4. expand contractions found in the ``contractions`` mapping
         (matched on tokens split by a single space);
      5. remove possessive ``'s`` endings;
      6. replace every non-letter character with a space;
      7. collapse runs of two or more ``m`` into ``mm``;
      8. drop one-character tokens and, optionally, English stopwords.

    HTML tags are not stripped as such: ``<br />`` is reduced to the token
    ``br`` by step 6.

    Args:
        sentence: the raw text to clean.
        remove_stopwords: when True (default), tokens found in NLTK's English
            stopword list are removed as well.

    Returns:
        The remaining tokens joined by single spaces.
    """
    sentence = sentence.lower()
    sentence = re.sub(r"\([^)]*\)","",sentence)
    sentence = re.sub('"', '', sentence)
    sentence = ' '.join([contractions[t] if t in contractions else t for t in sentence.split(" ")])
    sentence = re.sub(r"'s\b","", sentence)
    sentence = re.sub("[^a-zA-Z]", " ", sentence)
    sentence = re.sub("[m]{2,}","mm", sentence)

    if remove_stopwords:
        # Look the stopword set up once per call rather than once per word.
        stop_words = _english_stopwords()
        tokens = ' '.join(
            word for word in sentence.split()
            if word not in stop_words and len(word) > 1
        )
    else:
        tokens = ' '.join(word for word in sentence.split() if len(word) > 1)
    return tokens

In [102]:
# Sample inputs to try the preprocessing function on: one containing an HTML
# <br /> tag and one containing a parenthesised span and punctuation.
temp_text = 'Everything I bought was great, infact I ordered twice and the third ordered was<br />for my mother and father.'
temp_summary = 'Great way to start (or finish) the day!!!'

print(preprocessing(temp_text))
print(preprocessing(temp_summary, False))  # do not remove stopwords here

everything bought great infact ordered twice third ordered br mother father
great way to start the day


In [None]:
# Clean every headline (stopwords kept) and every article body (stopwords
# removed), preserving the row order of `data`.
preprocessed_headlines = [
    preprocessing(raw_headline, False) for raw_headline in data["headlines"]
]
preprocessed_text = [preprocessing(raw_text) for raw_text in data["text"]]