In [1]:
import json
import torch

import numpy as np
import pandas as pd

from utilities.models import news_classifier
from utilities.functions import model_inference
from utilities.dataloaders import news_loader

# Phase one, filling body, start and end

In [2]:
# reading files
with open('data/transcripts/transcripts.json', 'r') as infile:
    transcripts = json.load(infile)

to_fill = pd.read_csv('data/to_fill.csv')
# lists that are going to be added to the dataframe
body = []
start = []
end = []

for _, label in to_fill.iterrows():
    # extract the news ID and its respective transcript
    transcript_id = str(label['source_video_id'])
    text = transcripts[transcript_id]['text']
    words = transcripts[transcript_id]['words']

    # calculate the indices of the beginning and end of the news
    body_start = text.find(label['first_words'])
    body_end = text.find(label['last_words']) + len(label['last_words'])

    # append the news body to the list
    body.append(text[body_start:body_end])

    # find the index of the first word of the body
    start_word_index = len(text[:body_start].split(' ')) - 1
    end_word_index = len(text[:body_end].split(' ')) - 1

    # append their start and finish time to respective lists
    start.append(words[start_word_index]['start'])
    end.append(words[start_word_index]['end'])


In [3]:
# add each feature to the dataframe
to_fill['body'] = body
to_fill['start'] = start
to_fill['end'] = end


# Phase two, filling topics

In [4]:
# initializing some constants
DEVICE = torch.device(
    "cuda") if torch.cuda.is_available() else torch.device("cpu")
BATCH_SIZE = 50
NUM_CLASSES = 15

In [5]:
# loading the saved model
model_dir = './trained_models/news_classifier/best_model.pth'
model_saved = torch.load(model_dir)

model_dict = model_saved['model_dict']
model_dict['device'] = DEVICE

# initializing a model object, loading the weights
model = news_classifier(**model_dict)
model.load_state_dict(model_saved['state_dict'])

# loading the saved vocabulary
vocab = model_saved['vocab']

# creating a dataloader for inference
dataloder = news_loader(data=to_fill, batch_size=1, shuffle=False, vocab=vocab, inference=True)

# load the topic_IDs dictionary
with open('data/topic_ids.json', 'r') as infile:
    topic_ids = json.load(infile)

In [6]:
labels = []
for i, batch in enumerate(dataloder):
    batch_labels = model_inference(model, batch, DEVICE).tolist()[0]
    labels.append(batch_labels)

In [7]:
classes = []
for label in labels:
    class_list = np.nonzero(label)[0].tolist()
    for idx, topic in enumerate(class_list):
        class_list[idx] = topic_ids[str(topic)]
    classes.append(class_list)

to_fill['topics'] = classes

In [8]:
to_fill.to_csv('./data/filled.csv')
to_fill

Unnamed: 0,first_words,last_words,source_video_id,body,start,end,topics
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell ph...",18246,Well knew. This morning police need your help ...,464928,465082,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 9632673..."
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean.,12387,a call. San Francisco firefighters rescued a m...,359020,359162,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 6fbf954..."
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conserva...",16859,"Paul. Meanwhile, the state set a record in ene...",60704,60922,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 39822b5..."
3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246,Emergency crews in Florida continue to search ...,505290,505910,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 39822b5..."
4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859,But even though the state never ordered rollin...,100910,101238,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 6fbf954..."
5,"aid. And today, president Joe Biden and first",to view the destruction caused by Hurricane Ian.,18246,"aid. And today, president Joe Biden and first ...",546494,547150,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 39822b5..."
6,"In the last month, there have been numerous",are necessary to crack down on those hackers.,18246,"In the last month, there have been numerous da...",614910,615274,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 39822b5..."
7,and the warriors are playing the Boston Celtics,that. We'll see if they get it tonight.,12387,and the warriors are playing the Boston Celtic...,419994,420182,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 39822b5..."
8,And San Leandro police searching for the person,footage to try to piece together more informat...,16859,And San Leandro police searching for the perso...,578612,578742,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 39822b5..."
9,The updated Bivalent Coronavirus booster shot ...,on their vaccinations getting severe illness f...,16859,The updated Bivalent Coronavirus booster shot ...,619310,619638,"[0d817400-3f5d-41e0-929c-c31fdbe75d31, 39822b5..."
