In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
def json_to_str(file_name):
    return pd.read_json(file_name)['results'][1][0]['transcript']

In [7]:
txt = json_to_str('dsc10_1008.dms')

In [5]:
def _create_frequency_table(text_string) -> dict:

    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable

In [67]:
freq_table = _create_frequency_table(txt)
freq_table
sorted(freq_table.items(), key=lambda x: x[1], reverse=True)

[(',', 687),
 ('.', 609),
 ('right', 213),
 ("'s", 207),
 ('I', 204),
 ('?', 183),
 ('So', 169),
 ('thi', 160),
 ('like', 152),
 ('tabl', 106),
 ('column', 85),
 ("n't", 83),
 ('true', 65),
 ('one', 61),
 ('fals', 60),
 ('give', 56),
 ('It', 53),
 ('want', 53),
 ('number', 49),
 ('use', 47),
 ('get', 45),
 ('make', 45),
 ('okay', 44),
 ('na', 44),
 ('gon', 42),
 ("'m", 42),
 ('array', 42),
 ('becaus', 41),
 ('thing', 40),
 ('dot', 39),
 ('sort', 39),
 ('Um', 37),
 ('valu', 37),
 ('equal', 37),
 ('salari', 36),
 ('list', 34),
 ("'re", 32),
 ('type', 32),
 ('wa', 31),
 ('player', 31),
 ('go', 30),
 ('data', 29),
 ('could', 28),
 ('entri', 28),
 ('name', 28),
 ('let', 28),
 ('differ', 27),
 ('know', 26),
 ('someth', 26),
 ('um', 25),
 ('look', 24),
 ('command', 24),
 ('argument', 24),
 ('take', 24),
 ('three', 24),
 ('posit', 24),
 ('back', 23),
 ('way', 23),
 ("'ll", 23),
 ('row', 22),
 ('need', 22),
 ('question', 22),
 ('tri', 22),
 ('would', 21),
 ('call', 21),
 ('doe', 20),
 ('see', 2

In [38]:
sentences = sent_tokenize(txt)

In [73]:
def _score_sentences(sentences, freqTable) -> dict:
    sentenceValue = dict()

    for sentence in sentences:
        word_count_in_sentence = (len(word_tokenize(sentence)))
        for wordValue in freqTable:
            if wordValue in sentence.lower():
                if sentence[:10] in sentenceValue:
                    sentenceValue[sentence[:10]] += freqTable[wordValue]
                else:
                    sentenceValue[sentence[:10]] = freqTable[wordValue]

        sentenceValue[sentence[:10]] = sentenceValue[sentence[:10]] // word_count_in_sentence

    return sentenceValue

In [74]:
def _find_average_score(sentenceValue) -> int:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original text
    average = int(sumValues / len(sentenceValue))

    return average

In [75]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:10] in sentenceValue and sentenceValue[sentence[:10]] > (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [77]:
sentence_scores = _score_sentences(sentences, freq_table)
sort = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)

In [78]:
sort

[('Right.', 815),
 ('Yeah.', 602),
 ('Yes.', 601),
 ('True.', 592),
 ('Good.', 567),
 ('Great.', 539),
 ('$116,197,6', 438),
 ('Right?', 395),
 ('All right.', 395),
 ('Okay, good', 345),
 ('Okay.', 326),
 ('Bye.', 316),
 ('No.', 310),
 ('Zero.', 309),
 ('Ah.', 309),
 ('Really.', 305),
 ('Oh, all ri', 301),
 ('Two 1,000,', 300),
 ("It's this ", 295),
 ('Whatever r', 280),
 ('Right agai', 277),
 ('Right here', 274),
 ('Yeah, You ', 266),
 ('Warm, fals', 237),
 ('This last ', 229),
 ('Numb roses', 227),
 ('Um okay.', 226),
 ('Okay, so b', 225),
 ('Too true.', 225),
 ('Okay, here', 224),
 ('Um, you ca', 224),
 ("It's false", 220),
 ('Soar three', 212),
 ('Shooting g', 211),
 ('Next time.', 211),
 ("C'est cent", 209),
 ('In general', 209),
 ("That's thi", 207),
 ('Some metho', 205),
 ('Same colum', 204),
 ("It's typic", 204),
 ('So there.', 203),
 ('Well, that', 202),
 ("And that's", 201),
 ('So false o', 201),
 ('Within you', 197),
 ('Okay, this', 197),
 ('This is in', 197),
 ("Okay, we'l"

In [79]:
sentence_scores = _score_sentences(sentences, freq_table)
sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)

[('Right.', 815),
 ('Yeah.', 602),
 ('Yes.', 601),
 ('True.', 592),
 ('Good.', 567),
 ('Great.', 539),
 ('$116,197,6', 438),
 ('Right?', 395),
 ('All right.', 395),
 ('Okay, good', 345),
 ('Okay.', 326),
 ('Bye.', 316),
 ('No.', 310),
 ('Zero.', 309),
 ('Ah.', 309),
 ('Really.', 305),
 ('Oh, all ri', 301),
 ('Two 1,000,', 300),
 ("It's this ", 295),
 ('Whatever r', 280),
 ('Right agai', 277),
 ('Right here', 274),
 ('Yeah, You ', 266),
 ('Warm, fals', 237),
 ('This last ', 229),
 ('Numb roses', 227),
 ('Um okay.', 226),
 ('Okay, so b', 225),
 ('Too true.', 225),
 ('Okay, here', 224),
 ('Um, you ca', 224),
 ("It's false", 220),
 ('Soar three', 212),
 ('Shooting g', 211),
 ('Next time.', 211),
 ("C'est cent", 209),
 ('In general', 209),
 ("That's thi", 207),
 ('Some metho', 205),
 ('Same colum', 204),
 ("It's typic", 204),
 ('So there.', 203),
 ('Well, that', 202),
 ("And that's", 201),
 ('So false o', 201),
 ('Within you', 197),
 ('Okay, this', 197),
 ('This is in', 197),
 ("Okay, we'l"

In [80]:
threshold = _find_average_score(sentence_scores)
threshold

105

In [85]:
summary = _generate_summary(sentences, sentence_scores, 1.8 * threshold)

In [86]:
summary

" Within your giant. Yeah. Numb roses. Right? Right. Right? Um okay. Bye. All right. And that's this. That's this command. All right. Right? It's this number 145,000. Zero. Great. Right. Okay, here's a discussion question. Oh, all right. No. Okay, so be so. This last column, right? Yes. Yeah. Right? C'est center. Shooting guard. Yes. In general. Great. Okay, this is one of these. Right? Right? Some method. What? Two 1,000,000,000? Ah. $116,197,639. Right. Same column, 15 16 salary. Right? It's typically not descending. It's typically ascending. What? Right? True. True. Well, that didn't happen. This is interesting. Right? Yes. Yeah. Okay. So that was ah, unusual. Right here. Soar three. Good. Right? Right. Right? Right? Good. All right. Right? Okay, here's another clicker question. Yeah. Right. Right? Right? Right. Right? Whatever right. Yes. Okay, good. Right? Right? Right? Too true. It's sunny, but not warm. Yes. Warm, false, humid. True. What? What? It's false. So false or true, Tha