In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import json
import numpy as np
from icecream import ic

# One Porter stemmer instance, shared by every call to `preprocessed`.
ps = PorterStemmer()

# English stop words materialised as a set for fast membership tests.
# Note: this rebinds the imported `stopwords` corpus name to the resulting set.
stopwords = set(stopwords.words('english'))

def preprocessed(sentence):
    """Tokenise a sentence into stemmed content words.

    The sentence is split on whitespace. A token is kept only if it is purely
    alphanumeric (so tokens with attached punctuation such as "end." are
    dropped) and is not a stop word. Each kept token is passed through the
    module-level stemmer `ps`.

    Args:
        sentence: The raw sentence string.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    return [
        ps.stem(word)
        for word in sentence.split()
        # Compare case-insensitively: a capitalised stop word ("The", "It")
        # would otherwise slip past a lowercase stop-word set.
        if word.isalnum() and word.lower() not in stopwords
    ]


def solve(data_raw):
    data = json.loads(data_raw)['text']

    sentences = []
    words = []
    for sentence in data:
        sentence_preprocessed = preprocessed(sentence)
        sentences.append(sentence_preprocessed)
        words.extend(sentence_preprocessed)
    n_sentences = len(sentences)
    n_words = len(words)

    tf_score = {}
    for word in words:
        if word in tf_score:
            tf_score[word] += 1
        else:
            tf_score[word] = 1
    tf_score.update((x, y / n_words) for x, y in tf_score.items())

    idf_score = {}
    for word in words:
        if not word in idf_score:
            idf_score[word] = [word in sentence for sentence in sentences].count(True)
    idf_score.update((x, np.log(n_sentences / y)) for x, y in idf_score.items())

    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}

    sentence_score = [ (np.sum([tf_idf_score[word] for word in sentence]) / len(sentence) if len(sentence) > 0 else 0) for sentence in sentences ]

    sentence_indices_sorted = np.argsort(sentence_score)[::-1]

    results = [0] * n_sentences

    compression_ratio = 0.25
    n_sentences_compressed = int(n_sentences * compression_ratio)

    for i in sentence_indices_sorted[:n_sentences_compressed]:
        results[i] = 1

    return results

In [None]:
import numpy as np

dir_test_data = './data/private/val.jsonl'

# Debug knob: set to a positive integer to stop after that many records.
# None processes the whole file.
max_lines = None

results = []
n_line = 0  # number of records processed so far

# JSON Lines input: one JSON document per line. Decode explicitly as UTF-8
# rather than relying on the platform's default encoding.
with open(dir_test_data, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline); json.loads would fail on them.
        if not line.strip():
            continue
        n_line += 1

        result = solve(line)
        results.append(result)

        if max_lines is not None and n_line >= max_lines:
            break

# dtype=object because each document yields a label list of its own length.
# NOTE(review): object arrays are pickled by np.save, so the consumer will
# presumably need np.load(..., allow_pickle=True) — confirm against the reader.
results_np = np.array(results, dtype=object)
np.save('pred.npy', results_np)