In [298]:
import os
import string
from os import listdir

import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
# NLTK helper objects, built once and shared by the cells below.
# porter_stemmer is not referenced by any cell visible in this section.
porter_stemmer = PorterStemmer()
# wnl is the lemmatizer that clean_lines() applies to every token.
wnl = WordNetLemmatizer()

In [299]:
# load doc into memory
def load_doc(filename):
	"""Read a whole UTF-8 text file and return its contents as one string.

	Args:
		filename: Path of the file to read.

	Returns:
		The complete decoded text of the file.

	Raises:
		OSError: If the file cannot be opened.
		UnicodeDecodeError: If the contents are not valid UTF-8.
	"""
	# The context manager closes the handle even when read() raises
	# (for example on a decoding error); the manual open/close pair did not.
	with open(filename, encoding='utf-8') as file:
		return file.read()

In [300]:
# split a document into news story and highlights
def split_story(doc):
	"""Split a raw document into its story text and its list of highlights.

	Everything before the first '@highlight' marker is the story; every
	marker after that point introduces one highlight.

	Args:
		doc: Full text of one document.

	Returns:
		A (story, highlights) tuple. story is the text preceding the first
		marker (returned as-is, not stripped); highlights is a list of
		whitespace-stripped, non-empty highlight strings. A document without
		any marker yields (doc, []).
	"""
	marker = '@highlight'
	# partition() leaves the whole document in `story` when the marker is
	# absent. Slicing with find()'s -1 would instead cut the last character
	# off the story and report that character as a highlight.
	story, _, remainder = doc.partition(marker)
	# Strip first and filter afterwards so whitespace-only entries are
	# dropped instead of surviving as empty strings.
	stripped = (h.strip() for h in remainder.split(marker))
	highlights = [h for h in stripped if h]
	return story, highlights

In [301]:
# load all stories in a directory
def load_stories(directory):
	"""Load and split every story file found directly inside a directory.

	Args:
		directory: Path of the folder holding the story files; a trailing
			path separator is accepted.

	Returns:
		A list of dicts, one per regular file in sorted file-name order.
		Each dict has 'story' (the text before the first highlight marker)
		and 'highlights' (a list of highlight strings), as returned by
		split_story().

	Raises:
		OSError: If the directory or one of its files cannot be read.
	"""
	all_stories = []
	# listdir() returns names in arbitrary order; sorting keeps positional
	# lookups such as stories[1] stable between runs and machines.
	for name in sorted(listdir(directory)):
		# os.path.join avoids a doubled separator when `directory` already
		# ends with one.
		filename = os.path.join(directory, name)
		# Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
		if not os.path.isfile(filename):
			continue
		story, highlights = split_story(load_doc(filename))
		all_stories.append({'story': story, 'highlights': highlights})
	return all_stories

In [302]:
# load stories
# Folder with the raw story files; the path is relative, so it is resolved
# against the kernel's current working directory.
directory = 'cnn/examples/'
# One {'story': ..., 'highlights': ...} dict per file in that folder.
stories = load_stories(directory)
# Quick sanity check on how many records were read.
print('Loaded Stories %d' % len(stories))
# Hand-maintained stop-word list consumed by clean_lines().
# Negation words ('no', 'nor', 'not') are not listed, so they survive cleaning.
# The short fragments near the end ('s', 't', 'd', 'll', ...) look like
# contraction remnants -- presumably meant to match tokens once apostrophes
# have been removed; confirm before pruning them.
stop_words = [
    # first- and second-person pronouns
    'i', 'me', 'my', 'myself',
    'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd",
    'your', 'yours', 'yourself', 'yourselves',
    # third-person pronouns
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    # articles and conjunctions
    'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs
    'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    # quantifiers and degree words
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # modals and contraction fragments
    's', 't', 'can', 'will', 'just', 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y', 'ma',
]

Loaded Stories 20


In [303]:
# Peek at the record at index 1; when cells run top to bottom this is still
# the raw, uncleaned text. The full story print is left disabled.
#print("Story: " + stories[1]['story'])
print("Highlights: " + str(stories[1]['highlights']))

Highlights: ['NEW: Bermudan premier: "Above all, this was a humanitarian act"', 'Uyghurs are native Chinese Muslims; the detainees were apprehended in Pakistan', 'China urges U.S. to hand over all 17 Uyghurs held at Guantanamo Bay, Cuba', 'Official says U.S. still negotiating with Palau to take remaining 13 Uyghurs']


In [304]:
# clean a list of lines
def clean_lines(lines):
	"""Normalise a list of text lines into space-joined, cleaned tokens.

	Each line goes through, in order: removal of everything up to and
	including a '(CNN) -- ' source tag, tokenisation with word_tokenize,
	lower-casing, stop-word removal, punctuation stripping, a second
	stop-word check on the stripped token, and lemmatisation with wnl.

	Args:
		lines: Iterable of strings, one per line (or highlight).

	Returns:
		A list of cleaned lines, each a single space-joined string. Lines
		left with no tokens are omitted, so the result may be shorter than
		the input.
	"""
	source_tag = '(CNN) -- '
	# translation table that deletes every character in string.punctuation
	table = str.maketrans('', '', string.punctuation)
	# Built once per call: set lookups are constant-time, whereas the
	# module-level list is scanned linearly for every token.
	stop_set = set(stop_words)
	cleaned = []
	for line in lines:
		# Skip the whole tag, not just '(CNN)', so the ' -- ' separator is
		# removed here instead of relying on the punctuation pass below.
		index = line.find(source_tag)
		if index > -1:
			line = line[index + len(source_tag):]
		tokens = []
		for token in word_tokenize(line):
			token = token.lower()
			if token in stop_set:
				continue
			token = token.translate(table)
			# word_tokenize splits contractions into pieces such as "'s" or
			# "'re"; they only match the 's' / 're' stop-word entries once
			# the apostrophe is gone, hence the second membership check.
			# Tokens that were pure punctuation are empty by now.
			if not token.strip() or token in stop_set:
				continue
			# NOTE(review): the recorded notebook output shows "U.S." ending
			# up as "u" -- presumably the lemmatizer treating the trailing
			# "s" of "us" as a plural; confirm whether that is acceptable.
			tokens.append(wnl.lemmatize(token))
		# keep only lines that still contain at least one token
		if tokens:
			cleaned.append(' '.join(tokens))
	return cleaned

In [305]:
# clean stories
for example in stories:
	# After the first pass 'story' holds a list of cleaned lines rather than
	# a string. Skipping such records lets this cell be re-run (or run on
	# data reloaded from the pickle) without .split() failing on a list.
	if not isinstance(example['story'], str):
		continue
	# the story is cleaned line by line; highlights are already a list
	example['story'] = clean_lines(example['story'].split('\n'))
	example['highlights'] = clean_lines(example['highlights'])

In [306]:
# save to file
from pickle import dump, load
# The context manager flushes and closes the file as soon as the dump is
# done; a bare open() left closing to garbage collection.
with open('cnn_dataset.pkl', 'wb') as pkl_file:
	dump(stories, pkl_file)

In [307]:
# load from file
# NOTE: unpickling can execute arbitrary code; only load files this
# notebook wrote itself.
# The context manager closes the handle right after reading.
with open('cnn_dataset.pkl', 'rb') as pkl_file:
	stories = load(pkl_file)
print('Loaded Stories %d' % len(stories))

Loaded Stories 20


In [308]:
# Show the highlights of the record at index 1 as reloaded from the pickle.
print(stories[1]['highlights'])

['new bermudan premier humanitarian act', 'uyghurs native chinese muslim detainee apprehended pakistan', 'china urge u hand 17 uyghurs held guantanamo bay cuba', 'official say u still negotiating palau take remaining 13 uyghurs']
