In [1]:
import re
import pandas as pd

In [3]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

In [4]:
# Load the pretrained 'facebook/bart-large-cnn' checkpoint as a BART
# conditional-generation model (the progress output below shows the download).
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [5]:
# Load the tokenizer from the same checkpoint name that the model was loaded from.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
# Load the dataset from a pickle file, resolved relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load
# 'raw_dataset.pkl' when it comes from a trusted source.
df = pd.read_pickle('raw_dataset.pkl')

In [7]:
# Rename every column positionally: the list must hold exactly one name per
# existing column, in the frame's current column order.
df.columns = [
    'id', 'case_title', 'ponente',
    'year', 'month', 'decision_date',
    'gr_no', 'case_link', 'content',
    'has_pdf', 'views', 'downloads',
    'status', 'modified_by', 'created_at', 'updated_at',
]

In [9]:
# Preview the raw `content` column; the values still carry HTML markup.
df.content

0        <center>\n<h2></h2>\n<h2>G.R. No. 91, November...
1        <center>\n<h2></h2>\n<h2>G.R. No. 94, October ...
2        <center>\n<h2></h2>\n<h2>G.R. No. 299, October...
3        <center>\n<h2></h2>\n<h2>G.R. No. 17, August 2...
4        <center>\n<h2></h2>\n<h2>G.R. No. 43, Septembe...
                               ...                        
62663    <center>\n<h2>THIRD DIVISION</h2>\n<h2>G.R. No...
62664    <center>\n<h2>THIRD DIVISION</h2>\n<h2>G.R. No...
62665    <center>\n<h2>SECOND DIVISION</h2>\n<h2>G.R. N...
62666    <center>\n<h2>SECOND DIVISION</h2>\n<h2>G.R. N...
62667    <center>\n<h2>THIRD DIVISION</h2>\n<h2>G.R. No...
Name: content, Length: 62668, dtype: object

In [10]:
# TODO: Use the predictions to get similarities
#
# pred_df = pd.read_csv('tfidf_logreg_model_predictions.csv')

In [13]:
def remove_tags(text):
    """Return `text` with every `<...>` markup tag removed.

    A tag is a `<`, followed by one or more characters that are not `>`,
    followed by `>`. Everything outside the tags is kept unchanged; character
    entities such as `&amp;` are not decoded.
    """
    return re.sub(r'<[^>]+>', '', text)

In [14]:
# Strip the HTML markup from each raw document into a new `clean_content` column.
df['clean_content'] = df['content'].map(remove_tags)

In [31]:
def _strip_preamble(text):
    """Return one tag-stripped decision without its leading heading text.

    Steps, in order: replace non-breaking spaces with plain spaces; keep only
    what follows the first "D E C I S I O N" banner, if there is one; then keep
    only what follows the first colon, if there is one.
    """
    text = text.replace('\xa0', ' ')
    for marker in ("D E C I S I O N", ":"):
        # partition() returns an empty separator when the marker is absent,
        # in which case the text is left untouched.
        _, found, remainder = text.partition(marker)
        if found:
            text = remainder
    return text


# Rebuild `clean_content` from the raw `content` column instead of from its own
# previous value: the colon split removes more text each time it is applied, so
# the self-referential form produced different results when the cell was re-run.
df['clean_content'] = df['content'].apply(remove_tags).apply(_strip_preamble)

In [67]:
# Additionally drop bracketed numeric footnote markers such as "[12]".
df['clean_content'] = df['clean_content'].map(lambda doc: re.sub(r"\[\d+\]", '', doc))

In [68]:
# Pick one cleaned decision (row label 61323) to run through the summarizer.
ARTICLE_TO_SUMMARIZE = df.loc[61323, 'clean_content']

In [69]:
# Display the cleaned text that is passed to the tokenizer below.
ARTICLE_TO_SUMMARIZE

'Before the Court is a petition for review on certiorari assailing the Decision and Resolution dated April 20, 2012 and October 29, 2012, respectively, of the Court of Appeals (CA) in CA-G.R. CR No. 33353. The CA affirmed but modified only as to the penalty imposed and damages awarded the Judgment rendered on April 15, 2010 by the Regional Trial Court (RTC) of Bontoc, Mountain Province, Branch 36, in Criminal Case No. 2227, convicting Jester Mabunot (petitioner) of violation of Republic Act (R.A.) No. 7610, Article VI, Section 10(a).AntecedentsThe Information indicting the petitioner reads:That on or about Sept. 14, 2007, in the morning thereof, inside one of the classrooms at the Paracelis National High School, Butigue, Paracelis, Mountain Province, and within the jurisdiction of this Honorable Court, the [petitioner,] with intent to physically abuse and with cruelty, did then and there, wilfully, unlawfully and feloniously, box Shiva Baguiwan, a minor who is 14 years and 5 months old

In [70]:
# Tokenize the article into PyTorch tensors, capped at 1024 tokens
# (presumably the model's maximum input length — TODO confirm).
# `truncation=True` makes the cap explicit: `max_length` on its own only
# truncates through a legacy fallback that logs a warning.
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, truncation=True, return_tensors='pt')

In [71]:
# Generate summary token ids from the encoded article. Only `early_stopping`
# is set explicitly; every other generation setting is left to the defaults.
summary_ids = model.generate(inputs['input_ids'], early_stopping=True)

In [72]:
# Decode every generated sequence back to text, dropping special tokens and
# leaving tokenization spacing untouched, then print the list of summaries.
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

['Jester Mabunot was convicted of beating Shiva Baguiwan, a 14-year-old girl. The incident took place in September 2007, when Shiva was 14 and the petitioner was 19. The petitioner dropped out from BNHS after the incident. The Court of Appeals affirmed but modified the penalty imposed and damages awarded.']


In [73]:
# Inspect the raw generated token ids that were decoded into the summary.
summary_ids

tensor([[    2,     0,   863,  8939,   256,   873,   879,  1242,    21,  3828,
             9,  4108, 39377, 13379,  3371,  6531,     6,    10,   501,    12,
           180,    12,   279,  1816,     4,    20,  1160,   362,   317,    11,
           772,  3010,     6,    77, 39377,    21,   501,     8,     5, 31390,
            21,   753,     4,    20, 31390,  1882,    66,    31,   163,   487,
          6391,    71,     5,  1160,     4,    20,   837,     9, 13248, 13935,
            53, 10639,     5,  2861,  5713,     8,  8357,  4241,     4,     2]])