In [1]:
import json
import os
import pandas as pd
import numpy as np

from ast import literal_eval
from utilities.utils import dict2w
from utilities.functions import keep_keys

# Part one: CSV cleaning

In [2]:
# Load the raw (uncleaned) stories table.
topic_df = pd.read_csv('data/stories.csv')

# Remove records with no transcript (body is a single space), then drop any
# row that still has a missing value in any column.
# Boolean-mask selection is used instead of DataFrame.where(): where() would
# overwrite every column of the rejected rows with NaN first, which can
# upcast integer/bool columns to float in the rows that are kept.
topic_df = topic_df.loc[topic_df['body'] != ' '].dropna()


In [3]:
# Build a translation dictionary for the topics.
# This is only for easier understanding in future references.
topics = topic_df['topic'].tolist()

# Containers filled by the loop below.
exceptions = []          # positions in `topics` whose value could not be parsed
topic_set = set()        # a set enforces uniqueness of the topic labels
clean_topics_temp = []   # one list of parsed topic labels per record

for idx, tpc in enumerate(topics):
    temp = []

    # Parse the stringified topic list, recording every label both for this
    # record and in the global set of known labels.
    try:
        topic_list = literal_eval(tpc)
        for topic in topic_list:
            temp.append(topic)
            topic_set.add(topic)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the errors literal_eval documents for malformed input are
        # caught (TypeError also covers a parsed value that is not iterable
        # or a label that is not hashable). A bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit and hide unrelated bugs.
        exceptions.append(idx)

    # Keep one entry per record (possibly empty or partial if parsing
    # failed) so positions stay aligned with `topics`.
    clean_topics_temp.append(temp)

# Create and save the id translation dictionary: index -> topic label,
# with indices assigned in sorted label order.
# NOTE(review): dict2w is a project helper; a later cell looks entries up by
# topic label, so it presumably also supports reverse lookup -- confirm.
topics_unique = sorted(topic_set)
topic_IDs = dict2w()

for idx, topic in enumerate(topics_unique):
    topic_IDs[idx] = topic

with open("data/topic_ids.json", "w") as outfile:
    json.dump(topic_IDs, outfile, indent=4)


In [4]:
# Show the positions (within `topics`) of any records whose topic value
# could not be parsed in the previous cell.
print(exceptions)

# The list came back empty on this run, so no extra handling was needed.


[]


In [5]:
# Reproduce the 'topic' feature of the dataset as one multi-hot vector per
# record: position i is 1 when the record carries the topic whose id is i.
num_topics = len(topic_IDs)
clean_topics = []
for tpc in clean_topics_temp:
    # Translate each topic label of this record into its id via topic_IDs.
    temp = [topic_IDs[t] for t in tpc]
    cleaned_topic = np.zeros(num_topics, dtype=np.int16)
    for t in temp:
        cleaned_topic[int(t)] = 1
    # .tolist() converts to plain Python ints. list(ndarray) would keep
    # np.int16 scalars, whose repr under NumPy >= 2 is "np.int16(0)" and
    # would leak into any text serialisation of these lists.
    clean_topics.append(cleaned_topic.tolist())


In [6]:
# Assemble the cleaned dataset (transcript text plus encoded topics),
# write it to disk without the index column, and preview the first rows.
clean_df = {}
clean_df['body'] = topic_df['body'].tolist()
clean_df['topic'] = clean_topics

clean_topics_df = pd.DataFrame(data=clean_df)
clean_topics_df.to_csv('data/stories_clean.csv', index=False)
print(clean_topics_df.head(n=10))


                                                body  \
0  hello and welcome to BBC News a woman who gave...   
1  news now out of North Hollywood. A 14 yearold ...   
2  homelessness his city's greatest failure. That...   
3  Minneapolis police officer Kim Potter guilty o...   
4  Judy an update now to the wildfires that wiped...   
5  the Sierra Nevada. Makes you want to cozy up u...   
6  proposed emergency declaration for San Francis...   
7  Holmes will not reach a verdict before Christm...   
8  year is the place to be. There's already a lot...   
9  that. In South Africa today, a farewell to a g...   

                                           topic  
0  [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]  
1  [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]  
2  [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
3  [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]  
4  [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]  
5  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]  
6  [0, 0, 0, 0, 1, 1, 0, 0

# Part two: news cleaning

In [7]:
# Locate the individual transcript files and merge them into one JSON file.
transcripts_dir = 'data/transcripts'
transcript_name = 'transcripts.json'  # merged output, written at the end

# Keep only the per-transcript .json files, excluding the merged output from
# a previous run. A new list is built instead of calling .remove() on the
# list being iterated: removing during iteration skips the entry that
# follows each removed one, so unwanted files could slip through.
# Sorting makes the key order of the merged file independent of the
# arbitrary order returned by os.listdir().
transcripts_names = sorted(
    name for name in os.listdir(transcripts_dir)
    if name.endswith('.json') and name != transcript_name
)

# initialize the transcripts dictionary: transcript ID -> trimmed transcript
transcripts = {}

# iterate over the transcript files
for trspt in transcripts_names:
    # the transcript's ID is the file name without the '.json' suffix
    t_id = trspt[:-5]

    # read the transcript
    file_dir = os.path.join(transcripts_dir, trspt)
    with open(file_dir, 'r') as infile:
        transcript = json.load(infile)

    # remove any extra information
    # NOTE(review): keep_keys is a project helper; presumably it keeps only
    # the 'text' and 'words' entries -- confirm against its definition.
    transcript = keep_keys(transcript, ['text', 'words'])

    # add the new transcript to the dictionary
    transcripts[t_id] = transcript

# save the merged transcripts next to the source files
with open(os.path.join(transcripts_dir, transcript_name), "w") as outfile:
    json.dump(transcripts, outfile, indent=4)
