In [15]:
import torch
from torch.autograd import Variable
import torch.nn as nn


# Maps each labelled category name to its integer class id.
label_dict = {
    name: class_id
    for class_id, name in enumerate([
        'politics_cleaned',
        'science_cleaned',
        'sports_cleaned',
        'weather_cleaned',
        'worldnews_cleaned',
    ])
}

# Reverse direction, without the '_cleaned' suffix: class id -> display name.
unmask_dict = dict(enumerate([
    'politics',
    'science',
    'sports',
    'weather',
    'worldnews',
]))

# Width of the model's input layer and number of output classes.
input_size = 164530  # from X_train.shape[1]
output_size = 5  # from len(set(y_train))


class DNNLog(torch.nn.Module):
    """One fully connected layer whose outputs are squashed element-wise by a sigmoid."""

    def __init__(self, input_size, output_size):
        """Create the layer.

        input_size:  number of features in each input row.
        output_size: number of values produced per input row.
        """
        super(DNNLog, self).__init__()
        # The attribute name `linear` determines the state_dict keys
        # ('linear.weight', 'linear.bias') that the saved checkpoint must match.
        self.linear = torch.nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer to `x`, then a sigmoid, and return the result."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

# Rebuild the network and restore its trained weights from the checkpoint file.
model = DNNLog(input_size, output_size)
# map_location='cpu' lets a checkpoint that was saved from a GPU run load on a CPU-only
# machine; the prediction cell calls .numpy() on the outputs, which needs CPU tensors.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
checkpoint = torch.load('checkpoint.pth.tar', map_location='cpu')
# The model is only used for inference in this notebook.
model.eval()
# Last expression of the cell: its return value reports missing/unexpected keys.
model.load_state_dict(checkpoint['state_dict'])

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [2]:
import gensim.downloader as api


# Load the 'word2vec-google-news-300' model through gensim's downloader API.
# `wv` is used in later cells for vocabulary membership tests (`token in wv`)
# and for vector lookup (`wv[tokens]`).
# NOTE(review): presumably fetched over the network on first use and cached afterwards - confirm.
wv = api.load('word2vec-google-news-300')

In [3]:
import numpy as np
import pandas as pd

# Read the podcast table from a CSV in the working directory.
# Later cells rely on its 'title', 'summary' and 'transcript' columns.
df = pd.read_csv('podcast_transcripts.csv')

In [4]:
# A row is usable only when BOTH text fields are real strings (pandas reads missing cells
# as float NaN). Checking 'transcript' alone would let a missing summary crash the
# .count() call below and the .replace() call in the tokenisation cell.
has_text = (df['transcript'].map(lambda x: isinstance(x, str)).astype(bool)
            & df['summary'].map(lambda x: isinstance(x, str)).astype(bool))
df['filter_nonstr'] = has_text.astype(int)
# .copy() yields an independent frame, so the column assignments here and in later cells
# do not raise SettingWithCopyWarning.
df = df[df['filter_nonstr'] == 1].copy()
# Rough sentence counts: the number of '.' characters in each text.
df['summary_sentence_length'] = df['summary'].map(lambda x: x.count('.'))
df['transcript_sentence_length'] = df['transcript'].map(lambda x: x.count('.'))

In [5]:
# Number of leading embedding components kept per token (applied when the vectors are sliced).
ft_size = 10


def _split_on_spaces(text):
    """Replace newlines with spaces, then split on single space characters."""
    return text.replace('\n', ' ').split(' ')


# Token lists for both text fields; empty strings from repeated spaces are kept here.
df['transcript_flat'] = df['transcript'].map(_split_on_spaces)
df['summary_flat'] = df['summary'].map(_split_on_spaces)

In [6]:
def _embed(tokens):
    """Return an (n_tokens, ft_size) array holding the first ft_size components of each token's vector."""
    if not tokens:
        # An empty lookup has nothing to stack (gensim raises on wv[[]]), so documents with
        # no in-vocabulary tokens get an explicit 0-row matrix that still pads cleanly.
        # float32 is assumed to match wv's vectors - TODO confirm; values are recast later anyway.
        return np.empty((0, ft_size), dtype=np.float32)
    # One vectorised slice instead of a per-row Python loop; .copy() drops the reference to
    # the full-width lookup result so only the kept columns stay in memory.
    return wv[tokens][:, :ft_size].copy()


# Keep only the tokens the embedding model has a vector for.
df['transcript_flat'] = df['transcript_flat'].map(lambda x: [y for y in x if y in wv])
df['summary_flat'] = df['summary_flat'].map(lambda x: [y for y in x if y in wv])

# Per-document token matrices, truncated to ft_size columns.
df['w2v_transcript'] = df['transcript_flat'].map(_embed)
df['w2v_summary'] = df['summary_flat'].map(_embed)

In [7]:
# Number of token rows per document after padding. Every token contributes ft_size values,
# so mx * ft_size flattened features must equal the model's input_size exactly.
assert input_size % ft_size == 0, 'input_size must be a multiple of ft_size'
mx = input_size // ft_size  # for feature length

def pad(arr, padding):
    """Return `arr` as a 2-D array with exactly `padding` rows.

    arr:     2-D array-like, one row per token.
    padding: required number of rows.

    Shorter inputs get zero rows appended at the end. Longer inputs are cut to their first
    `padding` rows; without this the pad width would be negative and np.pad would raise.
    Columns are never changed.
    """
    arr = np.asarray(arr)
    if arr.shape[0] > padding:
        arr = arr[:padding]
    diff = padding - arr.shape[0]
    return np.pad(arr, ((0, diff), (0, 0)), 'constant')
    
# Bring every document's token matrix to mx rows, then flatten it into a single 1-D
# feature vector (ravel's default row-major order).
df['w2v_transcript_padded'] = df['w2v_transcript'].map(lambda tokens: pad(tokens, mx).ravel())

df['w2v_summary_padded'] = df['w2v_summary'].map(lambda tokens: pad(tokens, mx).ravel())

In [10]:
def _as_half_tensor(column):
    """Stack a column of flat feature vectors into an (n_rows, n_features) float16 tensor."""
    # Each vector becomes an (n_features, 1) column, so the stacked array is 3-D ...
    stacked = np.array([vec.reshape(-1, 1) for vec in column])
    n_rows, n_features = stacked.shape[0], stacked.shape[1]
    halved = np.asarray(stacked).astype(np.float16)
    # ... and the trailing singleton axis is dropped again by the view.
    return torch.from_numpy(halved).view(n_rows, n_features)


summaries = _as_half_tensor(df['w2v_summary_padded'])

transcripts = _as_half_tensor(df['w2v_transcript_padded'])

In [12]:
# Score every summary and transcript with the restored model.
# no_grad(): this is inference only, so skip building an autograd graph over these very
# wide inputs; the outputs then carry no gradient history and convert straight to NumPy.
with torch.no_grad():
    pred_summary = model(summaries.float()).numpy()
    pred_transcript = model(transcripts.float()).numpy()

In [1]:
def _rank_table(titles, preds, kind):
    """Attach model scores to the episode titles and rank the episodes.

    titles: one-column DataFrame holding 'title'; its index identifies the episode row.
    preds:  per-row model outputs, indexable by the class ids in `unmask_dict`.
    kind:   'summary' or 'transcript'; used to build the generated column names.

    Returns a new frame sorted by the per-class score columns (descending, in class-id
    order) with a 1-based '<kind>_rank' column.
    """
    # Independent copy: assigning into a df[[...]] slice raises SettingWithCopyWarning.
    table = titles.copy()
    table[kind + '_scores'] = list(preds)
    score_cols = []
    for k in unmask_dict:
        col = unmask_dict[k] + '_' + kind + '_score'
        table[col] = [row[k] for row in preds]
        score_cols.append(col)
    table = table.sort_values(by=score_cols, ascending=False)
    table[kind + '_rank'] = list(range(1, len(table) + 1))
    return table


df_summary = _rank_table(df[['title']], pred_summary, 'summary')
df_transcript = _rank_table(df[['title']], pred_transcript, 'transcript')

# Align the two rankings on the row index rather than on 'title': sort_values keeps each
# episode's original index label, whereas a merge on title would multiply rows whenever
# two episodes share a title. The result keeps those index labels.
df_ranks = df_summary[['title', 'summary_rank']].join(df_transcript[['transcript_rank']])
df_ranks = df_ranks.sort_values(by=['title'])

In [36]:
# df_ranks.to_csv('podcast_ep_rank_summary_vs_transcript.csv', index=False)