In [1]:
import os
import json
import re
import string
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora
import gensim

In [2]:
def text_process(text):
    #number removal
    body = re.sub(r'\d+', '', text)
    
    #punctuation removal i.e. [!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]
    punc = string.punctuation
    punc += "“”’"
#     punc = re.sub("-","", punc)
    body = body.translate(body.maketrans(punc, "                                   "))
    
    #text lower
    body = body.lower()
    
    #multi-word tokenize
    multi_word_list = [('north', 'korea'), ('south', 'korea'), ('north', 'korean'), ('south', 'korean'),
                      ('kim', 'jong', 'un'), ('park', 'geun', 'hye')]
    tokenizer = MWETokenizer()
    for mw in multi_word_list:
        tokenizer.add_mwe(mw)
    text = tokenizer.tokenize(body.split())
    
    #stopwort removal
    stopset = set(stopwords.words('english'))
#     text = word_tokenize(body)
    text = [x for x in text if x not in stopset]
    
    #lemmatization
    lemmatizer = WordNetLemmatizer()
    lemma_text = [lemmatizer.lemmatize(x) for x in text]
    
    return lemma_text

In [3]:
def text_process2(text):
    #punctuation removal i.e. [!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]
    punc = string.punctuation
    punc += "“”’"
#     punc = re.sub("-","", punc)
    text = text.translate(text.maketrans(punc, " "*len(punc)))
    
    #text lower
    text = text.lower()
    
    return text

In [4]:
def split_into_sentences(text):
    """
    This function can split the entire text of Huckleberry Finn into sentences in about 0.1 seconds
    and handles many of the more painful edge cases that make sentence parsing non-trivial 
    e.g. "Mr. John Johnson Jr. was born in the U.S.A but earned his Ph.D. in Israel before joining 
    Nike Inc. as an engineer. He also worked at craigslist.org as a business analyst."
    """
    
    alphabets= "([A-Za-z])"
    prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = "(Inc|Ltd|Jr|Sr|Co)"
    starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = "[.](com|net|org|io|gov)"
    
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace("."," .<stop>")#     text = text.replace(".",".<stop>")
    text = text.replace("?"," ?<stop>")#     text = text.replace("?","?<stop>")
    text = text.replace("!"," !<stop>")#     text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    
    text = text.replace('"', ' " ')
    text = text.replace("\'s", " \'s")
    text = text.replace(",", " ,")
    
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    
    return sentences

In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
path = 'Data/news/'

In [7]:
datapaths = os.listdir(path)

In [64]:
df = pd.DataFrame()
for p in datapaths:
    with open(path + p, 'r') as f:
        data = json.load(f)

    dataframe = pd.DataFrame.from_dict(data)
    df = df.append(dataframe)

In [65]:
df

Unnamed: 0,title,author,time,description,body,section
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea
...,...,...,...,...,...,...
2765,N. Korean leader's speech arouses cautious opt...,KH디지털2,2015-01-01 13:36:00,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea
2766,N. Korean leader open to inter-Korean summit t...,KH디지털2,2015-01-01 10:05:00,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea
2767,Ex-U.S. envoy calls for clearer communication ...,KH디지털2,2015-01-01 09:27:00,The United States should make its thoughts on ...,The United States should make its thoughts on ...,North Korea
2768,U.S. imposes sanctions on N. Korean firm,KH디지털2,2015-01-01 09:25:00,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea


In [66]:
df = df.drop(' author', axis=1)

In [82]:
df = df.reset_index(drop=True)
df['index'] = df.index

In [27]:
df[' time'] = pd.to_datetime(df[' time'])
df = df.sort_values([' time'])
df = df.set_index(' time')

In [28]:
df

Unnamed: 0_level_0,title,description,body,section,index
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-01 09:24:00,Park calls for military readiness amid tension...,President Park Geun-hye called on the military...,President Park Geun-hye called on the military...,Defense,23768
2015-01-01 09:25:00,U.S. imposes sanctions on N. Korean firm,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea,23767
2015-01-01 09:27:00,Ex-U.S. envoy calls for clearer communication ...,The United States should make its thoughts on ...,The United States should make its thoughts on ...,North Korea,23766
2015-01-01 10:05:00,N. Korean leader open to inter-Korean summit t...,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea,23765
2015-01-01 13:36:00,N. Korean leader's speech arouses cautious opt...,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea,23764
...,...,...,...,...,...
2017-12-30 15:44:00,Hong Kong ship crew questioned in S. Korea for...,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea,4
2017-12-31 14:55:00,[Newsmaker] Panamanian vessel probed over susp...,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea,3
2017-12-31 16:18:00,People's Party members support Ahn's push for ...,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics,2
2018-01-01 13:22:00,[Weekender] Korea’s dynamic 2017,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,1


In [29]:
df[' section'].unique()

array(['Defense', 'North Korea', 'Education', 'Social affairs',
       'Politics', 'Foreign Policy', 'National',
       '사용안함 - Diplomatic Circuit', 'International', '사용안함 - Environment',
       'Foreign  Affairs', '', 'Science', '사용안함 - Sharing',
       'Diplomatic Circuit'], dtype=object)

In [35]:
# df['clean_body'] = pd.DataFrame(df[' body'].apply(lambda x: ' '.join(text_process(x))))
df['clean_body'] = pd.DataFrame(df[' body'].apply(lambda x: text_process2(x)))

In [36]:
df2 = df[df[' section'] == 'North Korea']

In [37]:
df2

Unnamed: 0_level_0,title,description,body,section,index,clean_body
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01 09:25:00,U.S. imposes sanctions on N. Korean firm,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea,23767,the united states has imposed sanctions on a n...
2015-01-01 09:27:00,Ex-U.S. envoy calls for clearer communication ...,The United States should make its thoughts on ...,The United States should make its thoughts on ...,North Korea,23766,the united states should make its thoughts on ...
2015-01-01 10:05:00,N. Korean leader open to inter-Korean summit t...,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea,23765,north korean leader kim jong un said thursday ...
2015-01-01 13:36:00,N. Korean leader's speech arouses cautious opt...,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea,23764,north korean leader kim jong un s new year s d...
2015-01-01 21:22:00,U.S. places sanctions on N. Korean firm,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea,23757,the united states has imposed sanctions on a n...
...,...,...,...,...,...,...
2017-12-29 18:02:00,[Newsmaker] NK nuclear scientist takes own lif...,"A North Korean nuclear scientist, who was caug...","A North Korean nuclear scientist, who was caug...",North Korea,11,a north korean nuclear scientist who was caug...
2017-12-30 10:31:00,N. Korea says there will be no change to its n...,North Korea will continue to enhance its nucle...,North Korea will continue to enhance its nucle...,North Korea,8,north korea will continue to enhance its nucle...
2017-12-30 12:10:00,Secret Sauce? Kim Jong-un applies science to k...,Kim Jong Un wants to turn the art of kimchi-ma...,Kim Jong Un wants to turn the art of kimchi-ma...,North Korea,7,kim jong un wants to turn the art of kimchi ma...
2017-12-30 15:44:00,Hong Kong ship crew questioned in S. Korea for...,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea,4,the crew of a hong kong registered ship have b...


In [38]:
tokenized_doc = df2['clean_body'].apply(lambda x: x.split())

In [39]:
df2['clean_body'][11]



In [41]:
df2[' body'][11]



In [371]:
tokenized_doc[9]

['kim',
 'yo',
 'jongthe',
 'younger',
 'sister',
 'north_korean',
 'leader',
 'kim_jong_un',
 'married',
 'last',
 'year',
 'son',
 'choe',
 'ryong',
 'hae',
 'one',
 'kim',
 'closest',
 'aide',
 'source',
 'said',
 'friday',
 'far',
 'know',
 'kim',
 'yo',
 'jong',
 'deputy',
 'director',
 'worker',
 'party',
 'got',
 'married',
 'son',
 'party',
 'secretary',
 'choe',
 'ryong',
 'hae',
 'said',
 'source',
 'based',
 'china',
 'choe',
 'thought',
 'powerful',
 'official',
 'communist',
 'nation',
 'behind',
 'leader',
 'kim',
 'choe',
 'two',
 'son',
 'daughter',
 'according',
 'another',
 'source',
 'second',
 'son',
 'choe',
 'song',
 'husband',
 'added',
 'source',
 'reportedly',
 'early',
 'job',
 'title',
 'confirmed',
 'yet',
 'earlier',
 'friday',
 'north',
 'medium',
 'released',
 'photo',
 'kim',
 'yo',
 'jong',
 'accompanied',
 'brother',
 'visit',
 'pyongyang',
 'orphanage',
 'new',
 'year',
 'day',
 'showed',
 'wearing',
 'appears',
 'wedding',
 'ring',
 'ring',
 'finger'

In [372]:
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]

In [375]:
NUM_TOPICS = 5 #20개의 토픽, k=20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)

In [376]:
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.021*"north_korea" + 0.018*"nuclear" + 0.015*"said" + 0.014*"sanction" + 0.013*"north" + 0.012*"u" + 0.011*"china" + 0.008*"security" + 0.008*"pyongyang" + 0.008*"foreign"')
(1, '0.030*"missile" + 0.020*"said" + 0.018*"test" + 0.018*"north_korea" + 0.018*"nuclear" + 0.016*"north" + 0.014*"u" + 0.012*"launch" + 0.011*"military" + 0.010*"ballistic"')
(2, '0.018*"said" + 0.017*"north" + 0.015*"korean" + 0.013*"north_korea" + 0.011*"north_korean" + 0.009*"seoul" + 0.009*"south_korea" + 0.009*"south" + 0.008*"year" + 0.008*"government"')
(3, '0.028*"u" + 0.024*"said" + 0.020*"north_korea" + 0.013*"trump" + 0.010*"state" + 0.009*"north" + 0.009*"nuclear" + 0.008*"president" + 0.007*"right" + 0.007*"pyongyang"')
(4, '0.028*"kim" + 0.016*"said" + 0.015*"leader" + 0.013*"north_korean" + 0.011*"party" + 0.011*"north" + 0.010*"north_korea" + 0.010*"kim_jong_un" + 0.008*"country" + 0.007*"official"')


In [377]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(0, 0.6716795), (2, 0.15562485), (3, 0.1686793)]
1 번째 문서의 topic 비율은 [(0, 0.23722433), (3, 0.7599058)]
2 번째 문서의 topic 비율은 [(0, 0.28735483), (2, 0.34559256), (3, 0.13658771), (4, 0.22979215)]
3 번째 문서의 topic 비율은 [(0, 0.29638427), (2, 0.40801853), (3, 0.06821963), (4, 0.2268738)]
4 번째 문서의 topic 비율은 [(0, 0.6639519), (2, 0.1479353), (3, 0.18413576)]


In [378]:
topics = []
for i, topic_list in enumerate(ldamodel[corpus]):
    
    doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
    doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
    
    topics.append(doc[0][0])

In [379]:
df2['topic'] = topics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [380]:
df2[df2['topic'] == 0]

Unnamed: 0_level_0,title,description,body,section,index,clean_body,topic
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-01 09:25:00,U.S. imposes sanctions on N. Korean firm,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea,23767,united state imposed sanction north_korean fir...,0
2015-01-01 21:22:00,U.S. places sanctions on N. Korean firm,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea,23757,united state imposed sanction north_korean fir...,0
2015-01-01 21:33:00,N.K. leader says open to summit with South Korea,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea,23756,north_korean leader kim_jong_un said thursday ...,0
2015-01-02 21:36:00,North Korea ratchets up peace offensive,North Korea appears to have set out to step up...,North Korea appears to have set out to step up...,North Korea,23743,north_korea appears set step peace offensive s...,0
2015-01-09 12:26:00,China hopes to 'move forward' on ties with N. ...,"China on Friday voiced hope that it will ""move...","China on Friday voiced hope that it will ""move...",North Korea,23619,china friday voiced hope move forward warmer r...,0
...,...,...,...,...,...,...,...
2017-12-29 11:13:00,Major N. Korean ports handling bulk coal show ...,Nampho and other major North Korean ports hand...,Nampho and other major North Korean ports hand...,North Korea,22,nampho major north_korean port handling bulk c...,0
2017-12-29 15:56:00,S. Korea seizes HK vessel over ship-to-ship oi...,A Hong Kong-flagged vessel has been seized and...,A Hong Kong-flagged vessel has been seized and...,North Korea,18,hong kong flagged vessel seized inspected sout...,0
2017-12-29 16:31:00,"Top diplomats of S. Korea, US reassure peacefu...",The top diplomats of South Korea and the Unite...,The top diplomats of South Korea and the Unite...,North Korea,14,top diplomat south_korea united state held tel...,0
2017-12-30 15:44:00,Hong Kong ship crew questioned in S. Korea for...,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea,4,crew hong kong registered ship detained questi...,0


In [222]:
df2[df2['topic'] == 0].iloc[150]['title']

"Int'l human rights group urges NK leader to halt forced labor"

In [223]:
df2[df2['topic'] == 0].iloc[150][' body']

'An international human rights group is urging North Korean leader Kim Jong-un to halt forcing its people to engage in forced labor without due compensation, a report by a U.S.-based media outlet said Friday.The report carried by Radio Free Asia (RFA) said the Human Rights Watch sent a letter to Kim calling for an end to mandatory forced labor, and requested that Pyongyang become a member of the International Labor Organization under the United Nations.The move by the New York-headquartered non-governmental organization (NGO) comes as a large number of North Korean citizens were forced to work ahead of the rare Workers\' Party of Korea\'s congress that kicked off earlier in the day.North Korea had mobilized ordinary citizens to prepare for the congress under a "70-day campaign of loyalty" and forced them to work longer hours and even donate money to authorities. This has caused a rise in discontent among ordinary people, according to government officials in Seoul.The loyalty campaign a

In [381]:
df2['2015-1-9']

Unnamed: 0_level_0,title,description,body,section,index,clean_body,topic
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-09 09:56:00,U.S. has no evidence yet showing N.K. mastered...,The United States has no evidence yet that Nor...,The United States has no evidence yet that Nor...,North Korea,23625,united state evidence yet north_korea mastered...,1
2015-01-09 09:58:00,N. Korea trying to make submarine capable of f...,North Korea appears to be trying to equip a su...,North Korea appears to be trying to equip a su...,North Korea,23624,north_korea appears trying equip submarine mak...,1
2015-01-09 12:26:00,China hopes to 'move forward' on ties with N. ...,"China on Friday voiced hope that it will ""move...","China on Friday voiced hope that it will ""move...",North Korea,23619,china friday voiced hope move forward warmer r...,0
2015-01-09 12:40:00,N. Korea developing sub-based missiles: US thi...,Recent satellite images offer fresh evidence o...,Recent satellite images offer fresh evidence o...,North Korea,23618,recent satellite image offer fresh evidence no...,1
2015-01-09 15:16:00,Seoul hints at stopping DVD launch across border,South Korea again signaled Friday that it may ...,South Korea again signaled Friday that it may ...,North Korea,23616,south_korea signaled friday may block local ac...,2
2015-01-09 20:09:00,N. Korea rejects South's parliamentary resolut...,North Korea refused Friday to accept a South K...,North Korea refused Friday to accept a South K...,North Korea,23611,north_korea refused friday accept south_korean...,2
2015-01-09 20:28:00,China signals warmer ties with North Korea,China on Friday offered its clearest signal ye...,China on Friday offered its clearest signal ye...,North Korea,23607,china friday offered clearest signal yet ready...,0


In [363]:
df2['2015-1-3'].iloc[0][' body']



In [83]:
def split_into_sentences(text):
    """
    This function can split the entire text of Huckleberry Finn into sentences in about 0.1 seconds
    and handles many of the more painful edge cases that make sentence parsing non-trivial 
    e.g. "Mr. John Johnson Jr. was born in the U.S.A but earned his Ph.D. in Israel before joining 
    Nike Inc. as an engineer. He also worked at craigslist.org as a business analyst."
    """
    
    alphabets= "([A-Za-z])"
    prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = "(Inc|Ltd|Jr|Sr|Co)"
    starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = "[.](com|net|org|io|gov)"
    
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace("."," .<stop>")#     text = text.replace(".",".<stop>")
    text = text.replace("?"," ?<stop>")#     text = text.replace("?","?<stop>")
    text = text.replace("!"," !<stop>")#     text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    
    text = text.replace('"', ' " ')
    text = text.replace("\'s", " \'s")
    text = text.replace(",", " ,")
    
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    
    return sentences

In [85]:
#raw text
split_into_sentences('The United States announced retaliatory sanctions on North Korea on Friday in response to the communist nation\'s alleged cyber-attacks on Sony Pictures, warning the actions are just the "first aspect" of its response.President Barack Obama signed an executive order authorizing additional sanctions on North Korean individuals and entities in response to the North\'s "ongoing provocative, destabilizing, and repressive actions and policies, particularly its destructive and coercive cyber attack on Sony," the White House said in a statement.Three North Korean entities and 10 officials were named in the sanctions, including the Reconnaissance General Bureau, Pyongyang\'s primary intelligence organization, accused of arms trading and other activities banned under U.N. resolutions, according to the Treasury Department.Though those sanctioned are barred from using the U.S. financial system and U.S. citizens are banned from doing business with them, the measures are considered largely symbolic because the North has already been under a string of international sanctions and those newly sanctioned are not believed to have any dealings with the U.S."We take seriously North Korea\'s attack that aimed to create destructive financial effects on a U.S. company and to threaten artists and other individuals with the goal of restricting their\xa0right to free expression," the White House said."As the president has said, our response to North Korea\'s attack against Sony Pictures Entertainment will be proportional, and will take place at a time and in a manner of our choosing.Today\'s actions are the first aspect of our response," it said.The FBI has determined that North Korea was behind the hack on Sony, confirming widespread suspicions pointing to the North that has expressed strong anger at a Sony movie, "The Interview," which involves a plot to assassinate North Korean leader Kim Jong-un. Obama has since vowed to "respond proportionally" to the attacks.North Korea has denied any responsibility, though it lauded the Sony hack as a "righteous deed.""The order is not targeted at the people of North Korea, but rather is aimed at the government of North Korea and its activities that threaten the United States and others," Obama said in a letter to House of Representatives and Senate leaders.The two other newly sanctioned North Korean entities are Korea Mining Development and Trading Corp. (KOMID) and Korea Tangun Trading Corp. Eight of the 10 sanctioned individuals were KOMID officials stationed in Iran, Syria, Russia and Namibia.KOMID is the North\'s primary arms dealer and main exporter of goods and equipment related to ballistic missiles and conventional weapons, according to the Treasury Department. The company was previously sanctioned by the U.S. and the United Nations, it said.Korea Tangun Trading Corp. is responsible for the procurement of commodities and technologies to support the North\'s defense research and development program. The company was also a target of U.S. and U.N. sanctions, the department said.The sanctioned individuals include KOMID officials Kil Jong-hun, Kim Kwang-yon, Jang Song-chol, Kim Yong-chol, Jang Yong-son, Kim Kyu, Ryu Jin and Kang Ryong, as well as Yu Kwang-ho, a North Korean government official, and Kim Kwang-chun, a Tangun Trading Corp. official."Today\'s actions are driven by our commitment to hold North Korea accountable for its destructive and destabilizing conduct," Secretary of the Treasury Jacob Lew said in a statement. "Even as the FBI continues its investigation into the cyber-attack against Sony Pictures Entertainment, these steps underscore that we will employ a broad set of tools to defend U.S. businesses and citizens, and to respond to attempts to undermine our values or threaten the national security of the United States."The new sanctions also underline the confidence the U.S. has in blaming the North for the Sony hack despite growing doubts about the FBI\'s finding among American cyber-security specialists.Last week, a cyber-security firm, Norse, was reported to have briefed the FBI on the result of its own investigation that it was not North Korea, but laid-off Sony staff members that disrupted Sony\'s computer network.On Friday, Scott Borg, director and chief economist of the U.S. Cyber Consequences Unit, an independent, nonprofit research institute specializing on cyber-threats and risks, also said in a commentary on the CNBC website that the skills employed in the Sony hack were too sophisticated for the North. (Yonhap)')

 'President Barack Obama signed an executive order authorizing additional sanctions on North Korean individuals and entities in response to the North \'s  " ongoing provocative , destabilizing , and repressive actions and policies , particularly its destructive and coercive cyber attack on Sony , "  the White House said in a statement .',
 "Three North Korean entities and 10 officials were named in the sanctions , including the Reconnaissance General Bureau , Pyongyang 's primary intelligence organization , accused of arms trading and other activities banned under U.N. resolutions , according to the Treasury Department .",
 'Though those sanctioned are barred from using the U.S. financial system and U.S. citizens are banned from doing business with them , the measures are considered largely symbolic because the North has already been under a string of international sanctions and those newly sanctioned are not believed to have any dealings with the U.S. " We take seriously North Korea \

In [11]:
df.iloc[0]['Person'] = 'trump'

In [43]:
a = df.iloc[0]

In [10]:
ner_dict = {'Geographical Entity': ['united states',
  'north korea',
  'north korean',
  'north',
  'u.n.',
  'considered',
  'north has',
  'sanctioned',
  'north korea',
  'north korea',
  'north korea',
  'north korea',
  'north korea',
  'united states',
  'korea',
  'iran',
  'syria',
  'russia',
  'namibia',
  'north',
  'u.s.',
  'korea',
  'u.s.',
  'u.n.',
  'yu kwang-ho',
  'north korea',
  'u.s.',
  'united states',
  'u.s.',
  'north korea',
  'u.s.',
  'cyber-threats'],
 'Organization': ['sony pictures',
  'sony',
  'white house',
  'reconnaissance general bureau',
  "pyongyang 's primary intelligence organization",
  'treasury department',
  'create destructive',
  'sony pictures entertainment',
  'fbi',
  'sony',
  'sony',
  'sony hack',
  'house of representatives',
  'senate',
  '(komid) and korea tangun',
  'treasury department',
  'united nations',
  "north 's defense research and development program",
  'treasury jacob lew',
  'fbi',
  'sony pictures entertainment',
  'sony hack',
  'fbi',
  'fbi',
  'sony',
  'sony',
  'nonprofit research institute specializing',
  'cnbc',
  'sony hack'],
 'Person': ['president barack obama',
  'kim jong-un',
  'obama',
  'obama',
  'komid',
  'kil jong-hun',
  'kim kwang-yon',
  'jang song-chol',
  'kim yong-chol',
  'jang yong-son',
  'kim kyu',
  'ryu jin',
  'kang ryong',
  'kim kwang-chun',
  'norse',
  'scott borg'],
 'Geopolitical Entity': ['korean', 'korean', 'korean', 'korean', 'american'],
 'Time indicator': ['friday', 'today', '10 sanctioned', 'today', 'friday'],
 'Artifact': [],
 'Event': [],
 'Natural Phenomenon': []}

In [25]:
val_list = list(ner_dict.values())

In [33]:
joined_val_list = []
for val in val_list:
    joined_val = ','.join(val)
    joined_val_list.append(joined_val)

In [34]:
joined_val_list

['considered,cyber-threats,iran,korea,korea,namibia,north,north,north has,north korea,north korea,north korea,north korea,north korea,north korea,north korea,north korea,north korean,russia,sanctioned,syria,u.n.,u.n.,u.s.,u.s.,u.s.,u.s.,u.s.,united states,united states,united states,yu kwang-ho',
 "sony pictures,sony,white house,reconnaissance general bureau,pyongyang 's primary intelligence organization,treasury department,create destructive,sony pictures entertainment,fbi,sony,sony,sony hack,house of representatives,senate,(komid) and korea tangun,treasury department,united nations,north 's defense research and development program,treasury jacob lew,fbi,sony pictures entertainment,sony hack,fbi,fbi,sony,sony,nonprofit research institute specializing,cnbc,sony hack",
 'president barack obama,kim jong-un,obama,obama,komid,kil jong-hun,kim kwang-yon,jang song-chol,kim yong-chol,jang yong-son,kim kyu,ryu jin,kang ryong,kim kwang-chun,norse,scott borg',
 'korean,korean,korean,korean,ameri

In [27]:
val_list

[['considered',
  'cyber-threats',
  'iran',
  'korea',
  'korea',
  'namibia',
  'north',
  'north',
  'north has',
  'north korea',
  'north korea',
  'north korea',
  'north korea',
  'north korea',
  'north korea',
  'north korea',
  'north korea',
  'north korean',
  'russia',
  'sanctioned',
  'syria',
  'u.n.',
  'u.n.',
  'u.s.',
  'u.s.',
  'u.s.',
  'u.s.',
  'u.s.',
  'united states',
  'united states',
  'united states',
  'yu kwang-ho'],
 ['sony pictures',
  'sony',
  'white house',
  'reconnaissance general bureau',
  "pyongyang 's primary intelligence organization",
  'treasury department',
  'create destructive',
  'sony pictures entertainment',
  'fbi',
  'sony',
  'sony',
  'sony hack',
  'house of representatives',
  'senate',
  '(komid) and korea tangun',
  'treasury department',
  'united nations',
  "north 's defense research and development program",
  'treasury jacob lew',
  'fbi',
  'sony pictures entertainment',
  'sony hack',
  'fbi',
  'fbi',
  'sony',
  'so

In [40]:
ner_df = pd.DataFrame(data=[joined_val_list], columns=list(ner_dict.keys()))

In [41]:
ner_df

Unnamed: 0,Geographical Entity,Organization,Person,Geopolitical Entity,Time indicator,Artifact,Event,Natural Phenomenon
0,"considered,cyber-threats,iran,korea,korea,nami...","sony pictures,sony,white house,reconnaissance ...","president barack obama,kim jong-un,obama,obama...","korean,korean,korean,korean,american","friday,today,10 sanctioned,today,friday",,,


In [45]:
a.append(ner_df, axis=1)

TypeError: append() got an unexpected keyword argument 'axis'

In [84]:
df_c = df.copy()

In [49]:
ner_ = pd.DataFrame()

In [51]:
ner_ = ner_.append(ner_df)

In [53]:
ner_ = ner_.append(ner_df)

In [58]:
ner_ = ner_.reset_index(drop=True)

In [61]:
ner_['index'] = ner_.index

In [88]:
ner_

Unnamed: 0,Geographical Entity,Organization,Person,Geopolitical Entity,Time indicator,Artifact,Event,Natural Phenomenon,index
0,"considered,cyber-threats,iran,korea,korea,nami...","sony pictures,sony,white house,reconnaissance ...","president barack obama,kim jong-un,obama,obama...","korean,korean,korean,korean,american","friday,today,10 sanctioned,today,friday",,,,0
1,"considered,cyber-threats,iran,korea,korea,nami...","sony pictures,sony,white house,reconnaissance ...","president barack obama,kim jong-un,obama,obama...","korean,korean,korean,korean,american","friday,today,10 sanctioned,today,friday",,,,1


In [90]:
df_c.iloc[0:2]

Unnamed: 0,title,time,description,body,section,index
0,A snapshot of multiculturalism in South Korea,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs,0
1,[Weekender] Korea’s dynamic 2017,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,1


In [91]:
pd.merge(df_c.iloc[0:2], ner_)

Unnamed: 0,title,time,description,body,section,index,Geographical Entity,Organization,Person,Geopolitical Entity,Time indicator,Artifact,Event,Natural Phenomenon
0,A snapshot of multiculturalism in South Korea,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs,0,"considered,cyber-threats,iran,korea,korea,nami...","sony pictures,sony,white house,reconnaissance ...","president barack obama,kim jong-un,obama,obama...","korean,korean,korean,korean,american","friday,today,10 sanctioned,today,friday",,,
1,[Weekender] Korea’s dynamic 2017,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,1,"considered,cyber-threats,iran,korea,korea,nami...","sony pictures,sony,white house,reconnaissance ...","president barack obama,kim jong-un,obama,obama...","korean,korean,korean,korean,american","friday,today,10 sanctioned,today,friday",,,


In [69]:
df

Unnamed: 0,title,time,description,body,section
0,A snapshot of multiculturalism in South Korea,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs
1,[Weekender] Korea’s dynamic 2017,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs
2,People's Party members support Ahn's push for ...,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics
3,[Newsmaker] Panamanian vessel probed over susp...,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea
4,Hong Kong ship crew questioned in S. Korea for...,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea
...,...,...,...,...,...
23764,N. Korean leader's speech arouses cautious opt...,2015-01-01 13:36:00,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea
23765,N. Korean leader open to inter-Korean summit t...,2015-01-01 10:05:00,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea
23766,Ex-U.S. envoy calls for clearer communication ...,2015-01-01 09:27:00,The United States should make its thoughts on ...,The United States should make its thoughts on ...,North Korea
23767,U.S. imposes sanctions on N. Korean firm,2015-01-01 09:25:00,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea
