In [1]:
import sys
from os import listdir
import re
import argparse
import csv
import string

import numpy as np
import pandas as pd
#import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LaughingMan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
data_path = "cnn/stories"
save_file = "full_dataset"
stop_words = set(stopwords.words('english'))
stories_maxlen = 500
highlights_maxlen = 100


In [3]:
def load_doc(filename):
    """ Загружаем данные в переменную """
    file = open(filename, encoding='utf-8')
    text = file.read()
    file.close()
    return text

def split_story(doc):
    """ Делим файл на саму статью и заголовок """
    index = doc.find('@highlight')
    story, highlights = doc[:index], doc[index:].split('@highlight')
    highlights = [h.strip() for h in highlights if len(h) > 0]
    return story, highlights

def load_stories(directory):
    """ Сохраняем данные в директорию """
    stories = []
    for name in listdir(directory):
        filename = directory + '/' + name
        doc = load_doc(filename)
        story, highlights = split_story(doc)
        stories.append({'story': story, 'highlights': highlights})
    return stories

In [5]:
""" Обрабатываем сленг """
def change_contractions(word):
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
    }
    if word in contractions.keys():
        word = contractions[word]
    return word

In [6]:
def clean_lines(lines):
    """Ищем CNN """
    cleaned = ""
    for line in lines:
        index = line.find('(CNN) -- ')
        if index > -1:
            line = line[index+len('(CNN)'):]
        # remove CNN titles
        line = line.replace('(CNN)', '')
        if "contributed to this report" in line:
            line = " "

        # replace - and / with space to avoid compund words
        line = line.replace('-', ' ').replace('/', ' ')
        # remove wierd characters
        line = re.sub(r'[\?\!\"\*\&\:\.\,\(\)\$\;\»\Ã\©\Ã\±\@\#\%\•\+]', '', line)
        # tokenize on white space
        line = line.split()
        # convert to lower case
        line = [word.lower() for word in line]
        # replace word contractions will full words
        line = [change_contractions(word) for word in line]
        # remove empty lines
        line = [c for c in line if len(c) > 0]
        line = " ".join(line)
        # remove aphostrophes (after changing contractions)
        line = re.sub(r'(\')', '', line)
        # remove more than 2 whitespaces
        line = re.sub(r'[ ]{2,}', ' ', line)
        if not len(line) == 0:
            cleaned = cleaned+line+" . "

    cleaned = cleaned[:-3]
    #cleaned = re.sub(r'( \. )', ' <e> <s> ', cleaned)
    cleaned = re.sub(r'( \. )', ' ', cleaned)
    return cleaned

In [7]:
def cut_stories(cleaned, max_len):
    cleaned = cleaned.split(" ")
    '''Удаляем все стоп-слова'''
    cleaned = [c for c in cleaned if not c in stop_words]
    cleaned = cleaned[0:max_len]
    cleaned = " ".join(cleaned)

    return cleaned

In [9]:
def split_data(stories, size=0.20):
    # Splitting data into sets
    trainset, testset = train_test_split(stories, test_size=size)
    # Printing the lengths of the sets
    return trainset, testset

In [13]:
'''Отделяем заголовки'''
def cut_highlights(cleaned, max_len):
    cleaned = cleaned.split(" ")
    cleaned = cleaned[0:max_len]
    cleaned = " ".join(cleaned)

    return cleaned

In [14]:
'''Отчищаем раздел с историями'''
def clean_stories(data_path, save_file):
    print("Loading stories...")
    directory = data_path
    stories = load_stories(directory)
    print("Loaded number of stories: {}.".format(len(stories)))
    
    # clean stories
    for example in stories:
        example["story"] = clean_lines(example['story'].split('\n'))
        example["highlights"] = clean_lines(example["highlights"])
        example["story"] = cut_stories(example['story'], 400)
        example["highlights"] = cut_highlights(example["highlights"], 100)
        if example["story"] == "":
            example["story"] = np.nan # way of dropping empty lines from pd object later

    # split text
    print("Splitting stories into train and test...")
    train, test = split_data(stories)
    train, val = split_data(train)
    print("Number of stories in training set: {}".format(len(train)))
    print("Number of stories in validation set: {}".format(len(val)))
    print("Number of stories in test set: {}".format(len(test)))

    print("Writing data to files...")
    test_filename = save_file + "_train.csv"
    val_filename = save_file + "_val.csv"
    train_filename = save_file + "_test.csv"

    df1 = pd.DataFrame.from_dict(train)
    df2 = pd.DataFrame.from_dict(test)
    df3 = pd.DataFrame.from_dict(val) 
    
    # Drop rows with any empty cells
    df1.dropna(how='any', inplace=True)  
    df2.dropna(how='any', inplace=True)   
    df3.dropna(how='any', inplace=True)

    # write to file
    df3.to_csv(val_filename, encoding='utf-8', index=False)
    df1.to_csv(train_filename, encoding='utf-8', index=False)
    df2.to_csv(test_filename, encoding='utf-8', index=False)

    print("Finished processing data, saved train, validation and test files as {}, {} and {} ".format(
        train_filename, val_filename, test_filename))

In [15]:
clean_stories(data_path, save_file)

Loading stories...
Loaded number of stories: 92579.
Splitting stories into train and test...
Number of stories in training set: 59250
Number of stories in validation set: 14813
Number of stories in test set: 18516
Writing data to files...
Finished processing data, saved train, validation and test files as full_dataset_test.csv, full_dataset_val.csv and full_dataset_train.csv 
