In [4]:
import os
import json
import re
import string
import pandas as pd

In [8]:
def split_into_sentences(text):
    """
    Split ``text`` into a list of sentences using regex/replace heuristics.

    Periods that do not end a sentence (titles such as "Mr.", single-letter
    initials, dotted acronyms such as "U.S.A.", "Ph.D.", company suffixes such
    as "Inc.", and domain endings such as ".org") are temporarily masked with
    a ``<prd>`` marker so only real sentence terminators produce a split.

    The output is lightly tokenised as a side effect: a space is inserted
    before ``.``, ``?``, ``!``, ``,`` and ``'s``, and every straight double
    quote is surrounded by spaces.

    Parameters
    ----------
    text : str
        Raw text; newlines are treated as spaces.

    Returns
    -------
    list of str
        Stripped sentences in order of appearance. Trailing text that lacks a
        terminal ``.``/``?``/``!`` is returned as a final sentence instead of
        being discarded. Empty or whitespace-only input yields ``[]``.
    """

    # Raw strings: "\s" in a normal literal is an invalid escape sequence.
    alphabets = r"([A-Za-z])"
    prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
    starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = r"[.](com|net|org|io|gov)"

    # Pad so patterns anchored on a surrounding space also match at the edges.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Mask periods that are part of a token rather than a sentence end.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence opener does end the sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # Same idea for company/name suffixes: split only before a sentence opener.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminal punctuation outside a closing quote so the sentence
    # boundary falls after the quote, not inside it.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark sentence ends; the leading space makes the punctuation its own token.
    text = text.replace(".", " .<stop>")
    text = text.replace("?", " ?<stop>")
    text = text.replace("!", " !<stop>")
    text = text.replace("<prd>", ".")

    # Separate quotes, possessives and commas from the neighbouring words.
    text = text.replace('"', ' " ')
    text = text.replace("'s", " 's")
    text = text.replace(",", " ,")

    sentences = [s.strip() for s in text.split("<stop>")]
    # Only the final piece can be empty (the padding after the last stop).
    # Drop it in that case, but keep a trailing fragment that has no terminator.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]

    return sentences

In [2]:
# Directory (relative to the working directory) whose entries are listed and
# loaded as JSON by the cells that follow.
path = 'Data/news/'

In [5]:
# os.listdir returns entries in arbitrary order; sort them so the row order of
# the combined frame (and the positional 'index' column derived from it) is
# the same on every run and every machine.
datapaths = sorted(os.listdir(path))

In [6]:
# Load every file named in `datapaths` and stack the results into one frame.
frames = []
for p in datapaths:
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding.
    with open(os.path.join(path, p), 'r', encoding='utf-8') as f:
        data = json.load(f)

    dataframe = pd.DataFrame.from_dict(data)
    frames.append(dataframe)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; concatenate once instead. pd.concat rejects an empty list,
# so fall back to an empty frame when there was nothing to load.
df = pd.concat(frames) if frames else pd.DataFrame()

In [9]:
# Renumber the rows 0..n-1 (discarding the old labels) and store that position
# in an explicit 'index' column.
df = df.reset_index(drop=True).assign(index=lambda frame: frame.index)

In [10]:
# Display the combined frame to inspect its columns and rows.
df

Unnamed: 0,title,author,time,description,body,section,index
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs,0
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,1
2,Parliament approves nominees for chief state a...,Yonhap,2017-12-29 20:41:00,South Korea's parliament on Friday passed moti...,South Korea's parliament on Friday passed moti...,Politics,2
3,70th aftershock of Pohang quake occurs on Chri...,Lim Jeong-yeo,2017-12-25 16:54:00,"A 3.5 magnitude earthquake occurred in Pohang,...","A 3.5 magnitude earthquake occurred in Pohang,...",Social affairs,3
4,Uzbek president to make state visit to Korea,Yonhap,2017-11-17 16:16:00,The president of Uzbekistan will come to South...,The president of Uzbekistan will come to South...,Politics,4
5,"S. Korea, US stress diplomatic solution for NK...",Jung Min-kyung,2017-11-17 16:08:00,The top nuclear envoys of South Korea and the ...,The top nuclear envoys of South Korea and the ...,North Korea,5
6,Two ex-NIS chiefs arrested in bribery scandal,Jo He-rim,2017-11-17 15:18:00,Two former National Intelligence Service chief...,Two former National Intelligence Service chief...,Politics,6
7,Prosecution probes haunt political circles,Yonhap,2017-11-17 14:02:00,A sense of foreboding gripped political circle...,A sense of foreboding gripped political circle...,Social affairs,7
8,Ex-head of state lender gets heavier sentence ...,Yonhap,2017-11-17 14:00:00,An appellate court sentenced former state-run ...,An appellate court sentenced former state-run ...,Social affairs,8
9,NK declares nuclear negos impossible ahead of ...,Yonhap,2017-11-17 13:12:00,A North Korean mouthpiece asserted Friday that...,A North Korean mouthpiece asserted Friday that...,North Korea,9


In [11]:
# Split each article body into a list of sentences.
# NOTE(review): the source column key is ' body' with a leading space — confirm
# this matches the actual column name in the loaded data.
df['clean_body'] = df[' body'].apply(split_into_sentences)

In [18]:
# Select both columns with a list of names. The previous expression
# `'index' and 'clean_body'` evaluated to just 'clean_body', so the 'index'
# column was silently left out.
sentence_df = df[['index', 'clean_body']]