In [1]:

from datasets import load_dataset
import pandas as pd
import random

In [2]:
datasets_email = load_dataset('cnn_dailymail','3.0.0')
datasets_shakespeare = load_dataset('tiny_shakespeare')

In [3]:
def pre_process_email(datasets_email):
    # combine cnn_dailymail train, validation, test to one dataset
    datasets_email_train = datasets_email['train'].to_pandas()
    datasets_email_val = datasets_email['validation'].to_pandas()
    datasets_email_test = datasets_email['test'].to_pandas()
    datasets_email_all = pd.concat([datasets_email_train, datasets_email_val, datasets_email_test])
    datasets_email_all_highlights = datasets_email_all['highlights']
    
    # iterate through all row
    datasets_email_sentence = []
    for index, row in datasets_email_all_highlights.items():
        data = row.split('.')
        for sentence in data:
            datasets_email_sentence.append(sentence)
            
    # remove ""
    datasets_email_sentence = [x for x in datasets_email_sentence if x != '']

    # remove \n in sentence
    datasets_email_sentence = [x.replace('\n','') for x in datasets_email_sentence]

    # remove ultra space
    datasets_email_sentence = [x.strip() for x in datasets_email_sentence]

    # remove sentence with length less than 5
    datasets_email_sentence = [x for x in datasets_email_sentence if len(x) > 5]
    
    return datasets_email_sentence

In [4]:
def pre_process_shakespeare(datasets_shakespeare):
    # combine tiny_shakespeare train, validation, test to one dataset
    datasets_shakespeare_train = datasets_shakespeare['train'].to_pandas()
    datasets_shakespeare_val = datasets_shakespeare['validation'].to_pandas()
    datasets_shakespeare_test = datasets_shakespeare['test'].to_pandas()
    datasets_shakespeare_all = pd.concat([datasets_shakespeare_train, datasets_shakespeare_val, datasets_shakespeare_test])
    datasets_shakespeare_all_text = datasets_shakespeare_all['text']
    
    # iterate through all row
    datasets_shakespeare_sentence = []
    for index, row in datasets_shakespeare_all_text.items():
        data = row.split('.')
        for sentence in data:
            datasets_shakespeare_sentence.append(sentence)
            
    # remove ""
    datasets_shakespeare_sentence = [x for x in datasets_shakespeare_sentence if x != '']

    # remove \n in sentence
    datasets_shakespeare_sentence = [x.replace('\n','') for x in datasets_shakespeare_sentence]

    # remove ultra space
    datasets_shakespeare_sentence = [x.strip() for x in datasets_shakespeare_sentence]

    # remove sentence with length less than 5
    datasets_shakespeare_sentence = [x for x in datasets_shakespeare_sentence if len(x) > 5]
    
    return datasets_shakespeare_sentence

In [5]:
def combine_datasets(datasets_email_sentence, datasets_shakespeare_sentence):
    # two list should be same length, randomly sampling 
    random.seed(42)
    random.shuffle(datasets_email_sentence)
    datasets_email_sentence = datasets_email_sentence[:len(datasets_shakespeare_sentence)]
    
    # create dataframe datasets_email_sentence with label 0, datasets_shakespeare_sentence with label 1
    datasets_email_sentence = pd.DataFrame(datasets_email_sentence, columns=['sentence'])
    datasets_email_sentence['label'] = 0
    datasets_shakespeare_sentence = pd.DataFrame(datasets_shakespeare_sentence, columns=['sentence'])
    datasets_shakespeare_sentence['label'] = 1
    
    # combine two dataframe
    datasets_combine = pd.concat([datasets_email_sentence, datasets_shakespeare_sentence])
    
    # shuffle dataframe
    datasets_combine = datasets_combine.sample(frac=1).reset_index(drop=True)
    
    # write to csv
    datasets_combine.to_csv('datasets_combine.csv', index=False)

In [69]:
datasets_email_sentence = pre_process_email(datasets_email)
datasets_shakespeare_sentence = pre_process_shakespeare(datasets_shakespeare)
combine_datasets(datasets_email_sentence, datasets_shakespeare_sentence)

In [6]:
datasets_email_sentence = pre_process_email(datasets_email)

In [7]:
datasets_email_sentence

['Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday',
 'Young actor says he has no plans to fritter his cash away',
 "Radcliffe's earnings from first five Potter films have been held in trust fund",
 'Mentally ill inmates in Miami are housed on the "forgotten floor"Judge Steven Leifman says most are there as a result of "avoidable felonies"While CNN tours facility, patient shouts: "I am the son of the president"Leifman says the system is unjust and he\'s fighting for change',
 'NEW: "I thought I was going to die," driver says',
 'Man says pickup truck was folded in half; he just has cut on face',
 'Driver: "I probably had a 30-, 35-foot free fall"Minnesota bridge collapsed during rush hour Wednesday',
 'Five small polyps found during procedure; "none worrisome," spokesman says',
 'President reclaims powers transferred to vice president',
 'Bush undergoes routine colonoscopy at Camp David',
 "NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's condu

In [15]:
datasets_email['train']['highlights'][1].split('.') 

['Mentally ill inmates in Miami are housed on the "forgotten floor"\nJudge Steven Leifman says most are there as a result of "avoidable felonies"\nWhile CNN tours facility, patient shouts: "I am the son of the president"\nLeifman says the system is unjust and he\'s fighting for change ',
 '']