# Selecting data for model training and testing
- Since GLG is interested in short text topic modeling (abstracts from client request), only part of news text is needed
- Test data is taken from well defined sections to see if all news from those sect6ions go at least to first level cluster

# Load data and python libraries

In [1]:
# data processing libraries
import pandas as pd

# display wider columns in pandas data frames where necessary
pd.set_option('max_colwidth',150)

import spacy
nlp = spacy.load("en_core_web_sm")

# supporting libraries
import re
import pickle



In [2]:
# file location of the data
input_folder = './data/'
output_folder = './transition_files/'

file_name = 'all-the-news-2-1.csv'

In [3]:
# load data
df_data = pd.read_csv(input_folder + file_name, #file location
                      encoding = "ISO-8859-1", #deal with texts in different formats
                     )

# display first row of the data frame
print(df_data.shape)
df_data.head(1).T

  interactivity=interactivity, compiler=compiler, result=result)


(2688879, 12)


Unnamed: 0,0
Unnamed: 0,0
Unnamed: 0.1,0
date,2016-12-09 18:31:00
year,2016
month,12
day,9
author,Lee Drutman
title,We should take concerns about the health of liberal democracy seriously
article,"This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to de..."
url,https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs


# Clean and selecting first few paragraphs (10 sentences) for selected publications

In [4]:
# select ONLY data with specified section and publication and non-duplicated texts of article
df_data['publication'] = df_data['publication'].fillna("")
df_data = df_data[df_data['publication'].apply(len)>0]

df_data['section'] = df_data['section'].fillna("")
df_data = df_data[df_data['section'].apply(len)>0]

df_data['article'] = df_data['article'].fillna("")
df_data = df_data[df_data['article'].apply(len)>0]
df_data = df_data.drop_duplicates('article')

df_data.shape

(1660535, 12)

In [5]:
#Publications in the data
print('Number of unique values:')
df = df_data.groupby('publication')[['section', 'article']].nunique()
df

Number of unique values:


Unnamed: 0_level_0,section,article
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
CNBC,634,191185
CNN,63,124659
Economist,46,23050
Fox News,670,20130
Gizmodo,78,18214
New Yorker,1,4644
People,35,133766
Reuters,224,734147
The New York Times,3774,240107
The Verge,148,50201


In [6]:
#check each pablication
set(df_data[df_data['publication'] == "Wired"]['section'])

{'article',
 'artificial-intelligence',
 'backchannel',
 'business',
 'culture',
 'deals',
 'design',
 'environment',
 'gadget-lab-podcast',
 'gadgetlab',
 'gear',
 'ideas',
 'magazine',
 'music',
 'national-affairs',
 'opinion',
 'outdoor',
 'phones',
 'photo',
 'physics-math',
 'privacy',
 'reviews',
 'science',
 'security',
 'social-media',
 'transportation',
 'trends',
 'uncategorized'}

In [7]:
# Select only publications with more than 10 sections and less than 100
df=df[(df['section'] > 10) & (df['section'] < 100)]
print(df.sum())
df

section       250
article    319746
dtype: int64


Unnamed: 0_level_0,section,article
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
CNN,63,124659
Economist,46,23050
Gizmodo,78,18214
People,35,133766
Wired,28,20057


In [8]:
selected_publications = list(df.index)
selected_publications

['CNN', 'Economist', 'Gizmodo', 'People', 'Wired']

In [9]:
df_data = df_data[df_data['publication'].isin(selected_publications)]
df_data.shape

(319746, 12)

In [10]:
#clean text
df_data['article'] = df_data['article'].str.replace(r"[^A-Za-z0-9//-/.,!?:; ]",'', regex=True)

#select texts that have at least 500 but no more than 10000 symbols
df_data['text_length'] = df_data['article'].fillna("").apply(len)
df_data = df_data[df_data['text_length'] >= 500]
df_data = df_data[df_data['text_length'] < 10000]

# cut text to have no more than 1500 symbols
df_data['article'] = df_data['article'].str[:1500]

In [11]:
df_data = df_data.reset_index(drop=False)

In [12]:
# split on the data sub-samples of 50,000 records each 
# and save for next steps
k = 0
batch_size = 50000

for k in range(7):
    df_part = df_data.loc[k * batch_size: (k+1) * batch_size,:]
    print(k, len(df_part), df_part.index[0])
    with open(output_folder + 'data_part_'+str(k)+'.pickle', 'wb') as f:
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(df_part, f, pickle.HIGHEST_PROTOCOL)

0 50001 0
1 50001 50000
2 50001 100000
3 50001 150000
4 50001 200000
5 50001 250000
6 6365 300000


# Selecting first few paragraphs (10 sentences)

In [3]:
for k in range(5,7,1):
    file_name = 'data_part_'+str(k)+'.pickle'

    # load data
    with open(output_folder + file_name, 'rb') as f:
        # The protocol version used is detected automatically, so we do not
        # have to specify it.
        df_data = pickle.load(f)

    #get spaCy doc
    print(k)
    %time df_data['spacy_doc'] = df_data['article'].apply(lambda x: nlp(x))
    print("="*50)

    #delete text of article
    del df_data['article']
    
    #select first 10 sentenses
    df_data['first_10_sents'] = df_data['spacy_doc'].apply(lambda doc: list(doc.sents)[:10])
    df_data['first_10_sents'] = df_data['first_10_sents'].apply(lambda l: " ".join([s.text for s in l]))

    #save batch as pickle
    with open(output_folder + 'spacy_doc_' +str(k)+ '.pickle', 'wb') as f:
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(df_data, f, pickle.HIGHEST_PROTOCOL)

5
CPU times: user 18min 9s, sys: 3.79 s, total: 18min 13s
Wall time: 18min 14s
6
CPU times: user 2min 30s, sys: 1.14 s, total: 2min 32s
Wall time: 2min 32s


# Saving Test and Train data

In [14]:
list_dfs = []

for k in range(7):
    file_name = 'spacy_doc_' +str(k)+ '.pickle'
    print(file_name)

    # load data
    with open(output_folder + file_name, 'rb') as f:
        # The protocol version used is detected automatically, so we do not
        # have to specify it.
        df_data = pickle.load(f)

    #delete 'spacy_doc' of article 
    #(it is used for LDA model but we need only text of first 10 sentenses for other models)
    del df_data['spacy_doc']
    
    list_dfs.append(df_data)

spacy_doc_0.pickle
spacy_doc_1.pickle
spacy_doc_2.pickle
spacy_doc_3.pickle
spacy_doc_4.pickle
spacy_doc_5.pickle
spacy_doc_6.pickle


In [16]:
df_data = pd.concat(list_dfs)
print(df_data.shape)
df_data.columns

(306371, 14)


Index(['index', 'Unnamed: 0', 'Unnamed: 0.1', 'date', 'year', 'month', 'day',
       'author', 'title', 'url', 'section', 'publication', 'text_length',
       'first_10_sents'],
      dtype='object')

In [19]:
#Save Test data (CNN articles)
df_test = df_data[df_data['publication'] == "CNN"]
print(df_test.shape)
df_test[["date", 'author', 
         'title', 'url', 
         'section', 'publication',
         'first_10_sents']].to_csv(output_folder + "test.tsv", index=False, sep="\t")

(120324, 14)


In [20]:
#Save Train data (all but CNN articles)
df_train = df_data[df_data['publication'] != "CNN"]
print(df_train.shape)
df_train[["date", 'author', 
         'title', 'url', 
         'section', 'publication',
         'first_10_sents']].to_csv(output_folder + "train.tsv", index=False, sep="\t")

(186047, 14)


In [23]:
#publicaztions in Train dataset
print("Train data articles by publications:")
df_train.groupby('publication')['first_10_sents'].count()

Train data articles by publications:


publication
Economist     21613
Gizmodo       17458
People       130726
Wired         16250
Name: first_10_sents, dtype: int64