# TEAM SYNAPSE : SUBMISSION FOR AUTOMIN 2023

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# %cd drive/MyDrive/AutoMin
# !pip install -r requirements.txt

In [None]:
# Directory layout (all paths are relative to the working directory).
PREPROCESSED_DIR = "preprocessed_data"  # root of the preprocessed transcript JSON files
OUTPUT_DIR = "minutes"  # root under which the generated minutes are written

# Dataset split names: used as JSON file stems when loading and as output sub-directories.
TRAIN_DIR = "train"
DEV_DIR = "dev"
TEST_DIR = "test"
TEST2_DIR = "test2"

# Summarization model settings.
MODEL_PATH = "models/bart_large_xsum_samsum"  # joined with CHECKPOINT to locate the pipeline's model
MODEL_NAME = "facebook/bart-large-xsum"  # identifier passed to the tokenizer/model `from_pretrained` calls
CHECKPOINT = "checkpoint-5500"  # sub-directory of MODEL_PATH loaded by the summarization pipeline
PRETRAIN_DATASET = "samsum"  # not referenced by the code visible in this notebook
METRIC = "rouge"  # not referenced by the code visible in this notebook

In [None]:
import torch

# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and which one is the current device.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name())
else:
    print('No GPU available, using the CPU instead.')

# torch.cuda.set_device(0)

In [None]:
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
import nltk
import re
import datetime
import json
import os

nltk.download('punkt')

# Tokenizer and base model are loaded from the hub identifier in MODEL_NAME; the module-level
# `tokenizer` is what gen_tscs() uses to measure chunk lengths in tokens.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# The summarization pipeline loads the fine-tuned checkpoint from disk.
# `device` is a device index for the pipeline: 0 selects the first GPU, -1 selects the CPU.
# It used to be hard-coded to 0, which fails on the CPU-only machines that the
# device-selection cell above explicitly allows for.
summarizer = pipeline(
    "summarization",
    model=os.path.join(MODEL_PATH, CHECKPOINT),
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
def load_preprocessed_transcripts(file_name):
  """Load the preprocessed transcripts stored in ``<file_name>.json``.

  Args:
    file_name: Path of the JSON file *without* the ``.json`` extension.

  Returns:
    The deserialized JSON content.

  Raises:
    FileNotFoundError: If ``<file_name>.json`` does not exist.
    json.JSONDecodeError: If the file is not valid JSON.
  """
  # JSON is UTF-8 by specification; being explicit avoids mis-decoding non-ASCII
  # transcripts (e.g. the Czech data) on platforms whose default encoding differs.
  with open(f"{file_name}.json", "r", encoding="utf-8") as f:
    return json.load(f)

In [None]:
# Load every split of the "cs" data set; each call reads
# <PREPROCESSED_DIR>/cs/<split>.json (the ".json" suffix is added by the loader).
cs_train_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, "cs", TRAIN_DIR))
cs_dev_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, "cs", DEV_DIR))
cs_test_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, "cs", TEST_DIR))
cs_test2_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, "cs", TEST2_DIR))

# Same four splits for the "en" data set.
en_train_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, "en", TRAIN_DIR))
en_dev_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, "en", DEV_DIR))
en_test_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, "en", TEST_DIR))
en_test2_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, "en", TEST2_DIR))

In [None]:
### CUSTOMIZED STRIP, REPLACE AND PREPROCESS FUNCTIONS ###

def stripp(string):
    """Trim ``string`` and drop everything before its first alphabetic character.

    Surrounding whitespace is removed first; then any leading characters that are
    not letters (digits, punctuation, spaces) are discarded.

    Returns:
        The remaining text starting at the first letter, or ``None`` when the
        trimmed string contains no alphabetic character at all.
    """
    trimmed = string.strip()
    for position, char in enumerate(trimmed):
        if char.isalpha():
            return trimmed[position:]
    return None

# Literal substitutions applied by preprocess(), strictly in this order. The order and the
# repeated entries are kept on purpose: removing one filler can bring the pieces of another
# together, so a later (or repeated) rule may still match.
_PREPROCESS_REPLACEMENTS = (
  # re-attach punctuation that the transcripts separate with a space
  (" '", "'"),
  (" ,", ","),
  (" .", "."),
  (" ?", "?"),
  # hesitation / filler words
  ("Ehmm", ""),
  (" Ehm", ""),
  (" mmm", ""),
  (" hmm", ""),
  (" uh", ""),
  (" uh ,", ""),
  (" uh .", ""),
  (" um", ""),
  (" um ,", ""),
  (" um .", ""),
  (" Uh", ""),
  (" Uh ,", ""),
  (" Uh .", ""),
  (" Um", ""),
  (" Um ,", ""),
  (" Um .", ""),
  ("Uh", ""),
  ("Um", ""),
  ("Yeah", ""),
  (" yeah", ""),
  ("Ehm, ", ""),
  ("Hmm, ", ""),
  ("Ehm. ", ""),
  ("Hmm. ", ""),
  ("Yeah", ""),
  (" yeah", ""),
  ("Ehm", ""),
  ("Hmm", ""),
  ("Ehm", ""),
  ("Hmm", ""),
  ("Mhm", ""),
  # annotation markers
  (" {disfmarker}", ""),
  (" {vocalsound}", ""),
  (" {gap}", ""),
  # punctuation left over after the removals above
  ("...", "."),
  ("..", "."),
  (",,", ","),
  (",,", ","),
  (",.", ""),
  (".,", "."),
  ("  ", " "),
  ("(", ""),
  (")", ""),
  # speaker tag and tense normalisation
  ("Person", "PERSON"),
  ("is going to", "will"),
  ("are going to", "will"),
  ("are discussing", "discussed"),
)


def preprocess(ctx):
  """Clean one utterance or summary sentence.

  Re-attaches detached punctuation, removes filler words and annotation markers,
  collapses leftover punctuation, upper-cases the ``Person`` speaker tag and rewrites
  a few progressive/future verb phrases.

  Args:
    ctx: Text to clean.

  Returns:
    The cleaned text.
  """
  for old, new in _PREPROCESS_REPLACEMENTS:
    ctx = ctx.replace(old, new)

  # Only the stand-alone word "discuss" is put into the past tense. The previous plain
  # substring replace also hit text that was already "discussed" (including the output of
  # the "are discussing" rule just above), producing "discusseded", and turned
  # "discussion" into "discussedion".
  ctx = re.sub(r"\bdiscuss\b", "discussed", ctx)

  ctx = ctx.replace("are working", "worked")
  ctx = ctx.replace("is working", "worked")

  return ctx

def replacee(i):
  """Expand tokenised contractions and compact a (shortened) summary sentence.

  The input is expected to use the spacing produced by word tokenisation
  (``do n't``, ``it 's``). Negations are expanded, possessive ``'s`` tokens are
  dropped and `` and`` is replaced by a comma.

  Args:
    i: Sentence text; may be empty.

  Returns:
    The rewritten sentence.
  """
  i = i.replace("do n't", "do not")
  # Must run before the generic "n't" rule. It used to come after it and could never
  # match, so "wo n't" ended up as "wo not".
  i = i.replace("wo n't", "will not")
  i = i.replace("n't", "not")
  i = i.replace("it 's", "it is")
  i = i.replace(" 's", "")
  # startswith() instead of i[0]+i[1]: indexing raised IndexError on strings
  # shorter than two characters.
  if i.startswith("'s"):
    i = i.replace("'s ", "")
  i = i.replace(" and", ",")
  i = i.replace(",,", ",")
  return i

def gen_tscs(length, transcripts, meeting_id=None):
  """Split every transcript into chunks of fewer than ``length`` tokens.

  Each utterance is cleaned with ``preprocess``/``stripp``, very short utterances are
  dropped, and the remaining ``"<role>: <utterance>"`` lines are packed greedily into
  chunks whose token count (measured with the module-level ``tokenizer``) stays below
  ``length``.

  Args:
    length: Token budget per chunk.
    transcripts: Mapping of meeting id to a dict with parallel ``'roles'`` and
      ``'utterances'`` sequences.
    meeting_id: When given, only this meeting is processed.

  Returns:
    A tuple ``(chunks, attendees)``: ``chunks`` maps each processed meeting id to its
    list of chunk strings; ``attendees`` holds one sorted list of distinct roles per
    processed meeting, in the same order as the keys of ``chunks``.
  """
  def split_line(line):
    # Halve an over-long line at a sentence boundary; the second half gets the
    # current speaker prefix again (``role`` is the enclosing loop variable).
    line_ = line.split('.')
    split_ = len(line_)//2
    line1 = '. '.join(line_[0:split_]) + '.\n'
    line2 = role + ': ' + '. '.join(line_[split_:])

    # No usable sentence boundary (e.g. one very long sentence): neither half is
    # shorter than the input, so recursing would never terminate. Keep the line whole.
    if len(line1) >= len(line) or len(line2) >= len(line):
      return [line]

    splits = []
    for part in (line1, line2):
      if len(tokenizer.encode(part)) >= length:
        splits += split_line(part)
      else:
        splits.append(part)

    return splits


  tscs_preprocessed = {}
  attendees = []

  for m_id, transcript in transcripts.items():
    if meeting_id is not None and m_id != meeting_id:
      continue

    roles = transcript['roles']
    attendees.append(sorted(list(set(roles))))
    utterances = transcript['utterances']
    tsc = ['']
    i=0
    for role, utterance in zip(roles, utterances):
      utterance = preprocess(utterance)

      # Drop utterances that carry too little content to be worth summarising.
      v = re.sub(r"[^a-zA-Z0-9]+", ' ', utterance)
      v = v.split(' ')
      if len(v)<=4:
        continue
      if len(v)<7 and 's' in v:
        continue
      utterance = stripp(utterance)
      if utterance is None:
        continue
      if len(utterance) == 1:
        continue
      line = role + ': ' + utterance + '\n'

      # IF DIALOGUE IS LONGER THAN "length"
      if len(tokenizer.encode(line))>=length:
        line_splits = split_line(line)
      else:
        line_splits = [line]

      for l in line_splits:
        # Open a new chunk when adding this line would reach the budget. An empty
        # chunk is never closed, so a single unsplittable over-long line does not
        # leave an empty chunk behind.
        if tsc[i] and len(tokenizer.encode(tsc[i]+l))>=length:
          i+=1
          tsc.append('')
        tsc[i]+=l

    tscs_preprocessed[m_id] = tsc

  return tscs_preprocessed, attendees

In [None]:
### IF INFERENCING ON A SPECIFIC TRANSCRIPT, INPUT THE MEETING ID... ###
m_id = 'meeting_en_test_006'

### IMPLEMENTING THE BELOW LINES WILL GIVE 3 SUMMARIES WITH VARYING LENGTHS, AS MENTIONED ###
# A smaller token budget per chunk yields more chunks, hence more per-chunk summaries.
# `attendees` is reassigned by each call; all three calls process the same meeting.
tscs_preprocessed_long, attendees = gen_tscs(512, en_test_preprocessed, meeting_id=m_id) #for longer summary
tscs_preprocessed_avg, attendees = gen_tscs(768, en_test_preprocessed, meeting_id=m_id)
tscs_preprocessed_short, attendees = gen_tscs(1024, en_test_preprocessed, meeting_id=m_id) #for shorter summary

# Show the first chunk of the 1024-token split as a sanity check.
print(tscs_preprocessed_short[m_id][0])

In [None]:
# Number of chunks produced for the selected meeting by each token budget (1024, 768, 512).
print(len(tscs_preprocessed_short[m_id]))
print(len(tscs_preprocessed_avg[m_id]))
print(len(tscs_preprocessed_long[m_id]))

In [None]:
# OVERVIEW THE SECTIONED BLOCKS OF CONVERSATIONS FROM THE TRANSCRIPT ...
# Prints every chunk of the 1024-token split, prefixed with its index.
for idx, i in enumerate(tscs_preprocessed_short[m_id]):
  print(f"{idx} - {i}")

In [None]:
### USEFUL UTIL FUNCTIONS FOR GENERATION AND FORMATTING ###
def summarize(tsc):
  """Run the module-level ``summarizer`` on ``tsc`` and return its first summary text."""
  first_result = summarizer(tsc)[0]
  return first_result['summary_text']

def format_summary_with_pronouns(s2):
  """Format section summaries as bulleted minutes, replacing repeated speaker tags by pronouns.

  The section texts are concatenated, split into sentences at ``'.'``, cleaned with
  ``preprocess`` and filtered (sentences with six or fewer word tokens are dropped).
  Sentences starting with ``PE`` (i.e. a ``PERSON`` tag) open a new ``-`` bullet; the
  remaining sentences are appended, indented, to the current bullet. Before layout,
  ``insert_pronouns`` rewrites repeated speaker tags in nearby bullets.

  Args:
    s2: Iterable of summary strings (one per section).

  Returns:
    The formatted minutes as a single string.
  """
  bullets = []
  for i in ''.join(s2).split('.'):
    #stripping the spaces
    i = i.replace('  ', ' ')
    # `<= 1` rather than `== 1`: splitting at '.' produces empty fragments (text ending
    # in '.', abbreviations such as "U.S."), and indexing those raised IndexError.
    if len(i) <= 1:
      continue
    if i[0]==' ' and i[1].isalpha():
      i = stripp(i)
    if i is None:
      continue
    if i[0] == ' ':
      continue
    i = preprocess(i)

    # Count word tokens (digits ignored); too-short sentences are dropped.
    check = re.sub(r"[^a-zA-Z0-9]+", ' ', i)
    check = ''.join(ch for ch in check if not ch.isdigit())
    check = check.replace('  ', ' ')
    check = check.split(' ')
    if len(check)<=6:
      continue

    #formatting
    if i.startswith('PE'):
      bullets.append('-' + i + '.')
    else:
      bullets.append(i + '.')

  bullets = insert_pronouns(bullets)

  summ = ['']
  section = 0
  for i in bullets:
    # Only real bullets ('-' followed by the PERSON tag) open a new section. Bullets whose
    # tag was replaced by a pronoun no longer start with '-' and become continuation lines.
    if i.startswith('-PE'):
      section += 1
      summ.append('')
      summ[section] = summ[section] + ' ' + i
    else:
      summ[section] = summ[section] + '\n  ' + i

  if '' in summ:
    summ.remove('')
  return '\n'.join(summ)

### A FORMAT SUMMARY FUNCTION, WITHOUT PRONOUN INSERTION ###
def format_summary_without_pronouns(s2):
  """Format section summaries as bulleted minutes, keeping every speaker tag.

  The section texts are concatenated, split into sentences at ``'.'``, cleaned with
  ``preprocess`` and filtered (sentences with six or fewer word tokens are dropped).
  Sentences starting with ``PE`` (i.e. a ``PERSON`` tag) open a new ``-`` bullet; the
  remaining sentences are appended, indented, to the current bullet.

  Args:
    s2: Iterable of summary strings (one per section).

  Returns:
    The formatted minutes as a single string.
  """
  summ = ['']
  section = 0

  for i in ''.join(s2).split('.'):
    #stripping the spaces
    i = i.replace('  ', ' ')
    # `<= 1` rather than `== 1`: splitting at '.' produces empty fragments (text ending
    # in '.', abbreviations such as "U.S."), and indexing those raised IndexError.
    if len(i) <= 1:
      continue
    if i[0]==' ' and i[1].isalpha():
      i = stripp(i)
    if i is None:
      continue
    if i[0] == ' ':
      continue
    i = preprocess(i)

    # Count word tokens (digits ignored); too-short sentences are dropped.
    check = re.sub(r"[^a-zA-Z0-9]+", ' ', i)
    check = ''.join(ch for ch in check if not ch.isdigit())
    check = check.replace('  ', ' ')
    check = check.split(' ')
    if len(check)<=6:
      continue

    #formatting
    if i.startswith('PE'):
      section += 1
      summ.append('')
      summ[section] = summ[section] + ' -' + i + '.'
    else:
      summ[section] = summ[section] + '\n  ' + i + '.'

  if '' in summ:
    summ.remove('')
  return '\n'.join(summ)

def insert_pronouns(summ1):
  """Replace a speaker tag repeated in the next few lines by ``They`` (in place).

  For every line containing ``'-'``, up to three following lines are compared with it
  through ``check_req``. When that returns a shared prefix, the prefix is replaced by
  ``They`` in the following line and a few verb/possessive forms are adjusted.

  Args:
    summ1: List of summary lines; modified in place.

  Returns:
    The same list object.
  """
  # Agreement fixes applied, in this order, after the speaker tag became "They".
  agreement_fixes = (
    ("They's", 'Their'),
    ("They is", 'They are'),
    ("They has", 'They have'),
    ("They wants", 'They want'),
  )
  total = len(summ1)
  for line_no, current in enumerate(summ1):
    if '-' not in current:
      continue
    # Look at most three lines ahead, never past the end of the list.
    lookahead = min(3, total - line_no - 1)
    for target in range(line_no + 1, line_no + 1 + lookahead):
      shared_prefix, _ = check_req(current, summ1[target])
      if not shared_prefix:
        continue
      rewritten = summ1[target].replace(shared_prefix, 'They')
      for old, new in agreement_fixes:
        rewritten = rewritten.replace(old, new)
      summ1[target] = rewritten
  return summ1

def check_req(line1, line2):
  """Check whether two bullet lines start with the same speaker tag.

  Both lines must contain ``'-'`` and share their first eight characters. The ninth
  character then decides the result:

  * both have a space: the 8-character prefix is the tag;
  * both have a comma: no match;
  * both have the same other character: the tag is the 9-character prefix;
  * they differ and one of them is an apostrophe: the 8-character prefix is the tag;
  * otherwise: no match.

  Args:
    line1: First line.
    line2: Second line.

  Returns:
    ``(tag1, tag2)`` with the matching prefix of each line, or ``(False, False)`` when
    there is no match. Lines shorter than nine characters never match (they used to
    raise ``IndexError``).
  """
  prefix_len = 8
  no_match = (False, False)

  if '-' not in line1 or '-' not in line2:
    return no_match
  if len(line1) <= prefix_len or len(line2) <= prefix_len:
    return no_match

  st1, st2 = line1[:prefix_len], line2[:prefix_len]
  if st1 != st2:
    return no_match

  next1, next2 = line1[prefix_len], line2[prefix_len]
  if next1 == next2:
    if next1 == ' ':
      return st1, st2
    if next1 == ',':
      return no_match
    return st1 + next1, st2 + next2
  if next1 == "'" or next2 == "'":
    return st1, st2
  return no_match

def gen_summaries(tscs_preprocessed):
  """Summarise every transcript chunk and group the summaries into sections.

  The number of chunk summaries per section depends on how many chunks the meeting
  has: 2 (fewer than 11 chunks), 4 (fewer than 18), 6 (fewer than 24), otherwise 8.

  Args:
    tscs_preprocessed: Mapping of meeting id to its list of transcript chunks.

  Returns:
    Mapping of meeting id to a list of section strings; each section is the
    space-terminated concatenation of its chunk summaries. A meeting without chunks
    maps to ``['']``.
  """
  summaries = {}

  for k, v in tscs_preprocessed.items():
    if len(v) < 11:
      section = 2
    elif len(v) < 18:
      section = 4
    elif len(v) < 24:
      section = 6
    else:
      section = 8

    s1 = ['']
    for i, t1 in enumerate(v):
      # Start a new section before every `section`-th chunk. The previous test
      # (`i % section == 0` evaluated after appending) also fired for i == 0, so the
      # first section held a single chunk and an empty trailing section could be left.
      if i > 0 and i % section == 0:
        s1.append('')
      s1[-1] = s1[-1] + summarize(t1) + ' '

    summaries[k] = s1

  return summaries

In [None]:
### THE BELOW 3 CELLS WOULD GIVE YOU 3 SUMMARIES VARYING IN LENGTH; ###
### THIS WOULD NORMALLY AFFECT THE COVERAGE AND ADEQUACY OF THE SUMMARIES; ###
### YOU CAN CHOOSE A SUITABLE SUMMARY FOR EVERY SINGLE TRANSCRIPT !!! ###

# Summaries built from the 1024-token chunks.
s2_short = gen_summaries(tscs_preprocessed_short)
print(format_summary_without_pronouns(s2_short[m_id]))

In [None]:
# Summaries built from the 768-token chunks.
s2_avg = gen_summaries(tscs_preprocessed_avg)
print(format_summary_without_pronouns(s2_avg[m_id]))

In [None]:
# Summaries built from the 512-token chunks.
s2_long = gen_summaries(tscs_preprocessed_long)
print(format_summary_without_pronouns(s2_long[m_id]))

In [None]:
# Inspect the raw (unformatted) text of the second section of the 512-token summaries.
s2_long[m_id][1]

# If we want to further shorten the obtained summary...
This method sacrifices grammaticality and readability in order to achieve compactness, by applying NLTK stopword removal on top of a general BART summarization.

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# RUN THE CELLS BELOW AND USE THIS FUNCTION INSTEAD OF THE 'format_summary()' version ...

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# NLTK English stop words that must stay in the shortened sentence.
_SHORTEN_KEEP = frozenset({
  'to', 'of', 'from', 'as', 'has', 'do', 'not', 'on', 'in', 'if', 'is', 'it',
  'for', 'with', 'he', 'can', 'does', 'between', 'over', 'until', 'after',
  'when', 'have', 'them', 'into', 'by', 'and', 'will', 'what', 'him',
})

# Additional tokens removed on top of the NLTK list. Matching is case-sensitive,
# hence the capitalised variants.
_SHORTEN_DROP = frozenset({'They', 'which', 'On', 'It', 'The', 'He', 'manually'})


def shorten(example_sent):
  """Remove stop words from a sentence to make it more compact.

  The NLTK English stop-word list is used, minus the words in ``_SHORTEN_KEEP`` and
  plus the tokens in ``_SHORTEN_DROP``.

  Args:
    example_sent: Sentence to shorten.

  Returns:
    The remaining word tokens joined by single spaces.
  """
  # Set arithmetic instead of individual remove() calls: remove() raises KeyError as
  # soon as one of the words is missing from the installed NLTK list.
  stop_words = (set(stopwords.words('english')) - _SHORTEN_KEEP) | _SHORTEN_DROP

  word_tokens = word_tokenize(example_sent)
  return ' '.join(w for w in word_tokens if w not in stop_words)

def format_summary_short(s2):
  """Format section summaries as compact bulleted minutes.

  The section texts are concatenated and split into sentences at ``'.'``; sentences
  with six or fewer word tokens are dropped. Every kept sentence is compacted with
  ``shorten`` and ``replacee``. Sentences starting with ``P`` open a new ``-`` bullet;
  the others are appended, indented, to the current bullet.

  Args:
    s2: Iterable of summary strings (one per section).

  Returns:
    The formatted minutes as a single string.
  """
  summ = ['']
  section = 0
  for i in ''.join(s2).split('.'):

    #stripping the spaces
    i = i.replace('  ', ' ')
    # `<= 1` rather than `== 1`: splitting at '.' produces empty fragments (text ending
    # in '.', abbreviations such as "U.S."), and indexing those raised IndexError.
    if len(i) <= 1:
      continue
    if i[0]==' ' and i[1].isalpha():
      i = stripp(i)
    if i[0] == ' ':
      continue

    # Count word tokens (digits ignored); too-short sentences are dropped.
    check = re.sub(r"[^a-zA-Z0-9]+", ' ', i)
    check = ''.join(ch for ch in check if not ch.isdigit())
    check = check.replace('  ', ' ')
    check = check.split(' ')
    if len(check)<=6:
      continue

    #formatting
    # The bullet decision is taken on the original sentence, before it is compacted.
    opens_bullet = i[0] == 'P'
    i = replacee(shorten(i))
    if opens_bullet:
      section += 1
      summ.append('')
      summ[section] = summ[section] + ' -' + i + '.'
    else:
      summ[section] = summ[section] + '\n  ' + i + '.'

  if '' in summ:
    summ.remove('')
  return '\n'.join(summ)

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
# NOTE: these two assignments rebind the module-level `model` and `tokenizer` names that an
# earlier cell created from MODEL_NAME. gen_tscs() reads the global `tokenizer`, so every
# call made after this cell has run measures chunk lengths with this tokenizer instead.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [None]:
print(format_summary_short(s2_short[m_id]))

In [None]:
# Re-summarise every section of the short summary with the BART-CNN model and
# concatenate the results.
summary = ''
for s in s2_short[m_id]:
  preprocess_text = s.strip().replace("\n","")
  # Empty sections carry nothing to summarise; generating from them only adds noise.
  if not preprocess_text:
    continue
  # truncation=True clips the input to the tokenizer's model maximum length, so an
  # over-long section cannot exceed what the model accepts.
  inputs = tokenizer(preprocess_text, return_tensors='pt', truncation=True)
  summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=512)
  output = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
  summary = summary + output + ' '

In [None]:
summary

In [None]:
# Generate minutes
def format_minutes(attendees, minutes):
  """Wrap a formatted summary in the minutes template.

  Args:
    attendees: Sequence whose first element is the list of attendee names to print.
    minutes: The formatted summary text.

  Returns:
    The complete minutes: date line (today's date), attendee line, the summary
    under a ``SUMMARY-`` heading and the signature line.
  """
  template_lines = [
    f"DATE : {datetime.date.today()}",
    "ATTENDEES : " + ", ".join(attendees[0]),
    "",
    "",
    "SUMMARY-",
    f"{minutes}",
    "",
    "",
    "Minuted by: Team Synapse",
  ]
  return "\n".join(template_lines)

def generate_minutes(preprocessed_transcripts, output_dir):
  """Generate and write all minute variants for every meeting.

  For each chunk length (512, 768, 1024 tokens) the transcripts are chunked and
  summarised, and three formatted variants (with pronouns, without pronouns,
  shortened) are written to ``<output_dir>/<meeting_id>/length_<length>_<variant>``.

  Args:
    preprocessed_transcripts: Mapping of meeting id to preprocessed transcript.
    output_dir: Directory under which one sub-directory per meeting is created.
  """
  for length in [512, 768, 1024]:
    split_transcripts, attendees = gen_tscs(length, preprocessed_transcripts)
    # gen_tscs returns one attendee list per processed meeting, in the same order as
    # the keys of the chunk mapping. Pair them up so every meeting is written with its
    # own attendees (previously all meetings received the first meeting's list).
    attendees_by_meeting = dict(zip(split_transcripts, attendees))
    summaries = gen_summaries(split_transcripts)

    for meeting_id, summary in summaries.items():
      # format_minutes reads the attendee list from index 0 of its first argument.
      meeting_attendees = [attendees_by_meeting[meeting_id]]
      variants = {
        "with_pronouns": format_summary_with_pronouns(summary),
        "without_pronouns": format_summary_without_pronouns(summary),
        "shorten": format_summary_short(summary),
      }

      meeting_dir = os.path.join(output_dir, meeting_id)
      os.makedirs(meeting_dir, exist_ok=True)

      for variant, minutes in variants.items():
        with open(os.path.join(meeting_dir, f"length_{length}_{variant}"), "w") as f:
          f.write(format_minutes(meeting_attendees, minutes))

In [None]:
# Minutes generation for the "cs" splits is disabled.
# generate_minutes(cs_train_preprocessed, os.path.join(OUTPUT_DIR, "cs", TRAIN_DIR))
# generate_minutes(cs_dev_preprocessed, os.path.join(OUTPUT_DIR, "cs", DEV_DIR))
# generate_minutes(cs_test_preprocessed, os.path.join(OUTPUT_DIR, "cs", TEST_DIR))
# generate_minutes(cs_test2_preprocessed, os.path.join(OUTPUT_DIR, "cs", TEST2_DIR))

# Generate minutes for every "en" split; output goes to <OUTPUT_DIR>/en/<split>/<meeting_id>/.
generate_minutes(en_train_preprocessed, os.path.join(OUTPUT_DIR, "en", TRAIN_DIR))
generate_minutes(en_dev_preprocessed, os.path.join(OUTPUT_DIR, "en", DEV_DIR))
generate_minutes(en_test_preprocessed, os.path.join(OUTPUT_DIR, "en", TEST_DIR))
generate_minutes(en_test2_preprocessed, os.path.join(OUTPUT_DIR, "en", TEST2_DIR))

# TextRank Script for ranking sentences
This method uses GloVe embeddings to calculate a similarity score with the help of cosine similarity, and ranks individual sentences with the help of the PageRank algorithm.

In [10]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import os
import math



[nltk_data] Downloading package punkt to /Users/michelle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michelle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip -d models/glove
!rm -rf glove*.zip

In [37]:
def get_minutes(data_path):
    """Read every minutes file found one level below ``data_path``.

    Args:
        data_path: Directory containing one sub-directory per meeting.

    Returns:
        ``{directory_name: {file_name: [line, ...]}}`` with directories and files
        in sorted order.
    """
    minute_files = {}
    # Only the immediate sub-directories are meetings. Walking the whole tree also
    # returned nested directories (and the root's own name when ``data_path`` had no
    # trailing slash), which do not exist directly under ``data_path`` and crashed.
    for directory in sorted(os.listdir(data_path)):
        directory_path = os.path.join(data_path, directory)
        if not os.path.isdir(directory_path):
            continue
        minute_files[directory] = {}
        for file_name in sorted(os.listdir(directory_path)):
            file_path = os.path.join(directory_path, file_name)
            # Skip nested directories: they cannot be opened as minutes files.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, "r") as f:
                minute_files[directory][file_name] = f.read().splitlines()
    return minute_files

In [52]:
# Input directories: generated minutes, one sub-directory per meeting.
minute_path_train = '../automin2023/minutes/en/train/'
minute_path_dev = '../automin2023/minutes/en/dev/'
minute_path_test = '../automin2023/minutes/en/test/'
minute_path_test2 = '../automin2023/minutes/en/test2/'

# Output directories for the re-ranked minutes. The trailing slash matters where these
# paths are extended by plain string concatenation.
# NOTE(review): the test output path below lacks the 'test/' component that the train,
# dev and test2 output paths have - confirm whether that is intentional.
output_minute_path_train = '../automin2023/minutes/en/train/final/'
output_minute_path_dev = '../automin2023/minutes/en/dev/final/'
output_minute_path_test = '../automin2023/minutes/en/final/'
output_minute_path_test2 = '../automin2023/minutes/en/test2/final/'


In [39]:
def get_sentence_vectors(row, word_embeddings):
    """Return the averaged word-embedding vector of one cleaned sentence.

    Args:
        row: Mapping (e.g. a DataFrame row) with the sentence text under ``'sentence'``.
        word_embeddings: Mapping of word to its 100-dimensional embedding vector.

    Returns:
        A vector of shape ``(100,)``: the sum of the word vectors (unknown words count
        as zeros) divided by ``word_count + 0.001``. Sentences without any word yield
        a zero vector.
    """
    dim = 100
    words = row['sentence'].split()
    # Test the word list, not the raw string: a whitespace-only sentence has no words,
    # and summing an empty list returned the scalar 0.0 instead of a vector.
    if not words:
        return np.zeros((dim,))
    return sum([word_embeddings.get(w, np.zeros((dim,))) for w in words]) / (len(words) + 0.001)

def add_sentence_vectors(clean_sentences, word_embeddings):
  """Add a ``'sentence_vector'`` column computed row by row with ``get_sentence_vectors``.

  Args:
    clean_sentences: DataFrame of cleaned sentences; modified in place.
    word_embeddings: Word-to-vector mapping forwarded to ``get_sentence_vectors``.

  Returns:
    The same DataFrame, now carrying the ``'sentence_vector'`` column.
  """
  def _vector_for(row):
    return get_sentence_vectors(row, word_embeddings)

  # EXTRACT SENTENCE VECTORS
  clean_sentences['sentence_vector'] = clean_sentences.apply(_vector_for, axis=1)
  return clean_sentences


def clean_minute_sentences(summary):
    """Extract and normalise the sentences of one minutes file.

    The first five and last three lines (header and footer of the minutes template)
    are ignored. The remaining lines are stripped of bullet markers and indentation,
    joined, and split into sentences.

    Args:
        summary: The minutes file as a list of lines.

    Returns:
        A tuple ``(sentences, clean_sentences)``: ``sentences`` is a list of
        ``(order, sentence)`` tuples with the original text; ``clean_sentences`` is a
        DataFrame with columns ``'order'`` and ``'sentence'`` where the sentence is
        lower-cased, reduced to letters and stripped of English stop words.
    """
    # Drop the template header (5 lines) and footer (3 lines).
    body = summary[5:-3]
    text = ''
    for line in body:
        line = line.replace(' -', '')
        line = line.replace('  ', '')
        line = line.replace('\n', '')
        text = text + line + ' '

    sentences = list(enumerate(sent_tokenize(text)))
    print('Total no. of sentences: ', len(sentences))

    clean_sentences = pd.DataFrame(sentences, columns = ['order', 'sentence'])

    # REMOVE PUNCTUATIONS, NUMBERS AND SPECIAL CHARACTERS, THEN MAKE ALPHABETS LOWERCASE
    # regex=True is required: without it recent pandas versions treat the pattern as a
    # literal string and nothing gets replaced.
    clean_sentences['sentence'] = clean_sentences['sentence'].str.replace("[^a-zA-Z]", " ", regex=True).str.lower()

    # REMOVE STOPWORDS (a set keeps each membership test constant-time)
    stop_words = set(stopwords.words('english'))

    def remove_stopwords(sen):
        return " ".join([word for word in sen if word not in stop_words])

    clean_sentences['sentence'] = clean_sentences['sentence'].apply(lambda x: remove_stopwords(x.split()))
    return sentences, clean_sentences



def calculate_similarity_and_rank(sentences, clean_sentences, rem_perc=0.15):
  """Rank sentences with PageRank and drop the least informative ones.

  Args:
    sentences: List of ``(order, sentence)`` tuples with the original sentences.
    clean_sentences: DataFrame with one ``'sentence_vector'`` per sentence, in the
      same order as ``sentences``.
    rem_perc: Fraction of sentences to remove (rounded up). The default of 0.15 is
      the value that used to be hard-coded; it is usually around ~15% for the minutes
      belonging to a lengthy transcript.

  Returns:
    The kept sentences in their original order, one ``'- '``-prefixed sentence per
    line. Empty string when nothing is kept.
  """
  if not sentences:
    return ''

  # SIMILARITY MATRIX: all pairwise cosine similarities in one call; the diagonal is
  # cleared because a sentence must not vote for itself.
  vectors = np.vstack(list(clean_sentences['sentence_vector']))
  sim_mat = cosine_similarity(vectors)
  np.fill_diagonal(sim_mat, 0.0)

  # PAGERANK SCORING
  nx_graph = nx.from_numpy_array(sim_mat)
  scores = nx.pagerank(nx_graph)
  ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

  # Drop the lowest-ranked share of the sentences.
  remove_count = math.ceil(len(sentences)*rem_perc)
  keep_count = max(len(ranked_sentences)-remove_count, 0)
  informative_sentences = [sentence for _, sentence in ranked_sentences[:keep_count]]

  # Restore the original order. Each item is an (order, text) tuple, so the key is the
  # first element; the previous key `sentence[1][0]` sorted by the first character of
  # the text instead.
  informative_sentences.sort(key=lambda sentence: sentence[0])

  return '\n'.join('- ' + text for _, text in informative_sentences)

In [51]:
def rank_and_regenerate_minutes(input_path, output_path):
    """Re-rank the sentences of every minutes file and write the filtered minutes.

    For each file below ``input_path`` the summary sentences are embedded with GloVe,
    ranked, filtered, and written to ``<output_path>/<meeting>/<file_name>_final.txt``
    together with the original header (first five lines) and footer (last three lines).

    Args:
        input_path: Directory with one sub-directory of minutes files per meeting.
        output_path: Directory that receives one sub-directory per meeting.
    """
    minute_files = get_minutes(input_path)

    # EXTRACT WORD VECTORS
    word_embeddings = {}
    with open('models/glove/glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

    for min_id, files in minute_files.items():
        print('Meeting ID: ', min_id)
        meeting_dir = os.path.join(output_path, min_id)
        # exist_ok lets an interrupted run be restarted without failing here.
        os.makedirs(meeting_dir, exist_ok=True)
        for file_name, minutes in files.items():
            sentences, clean_sentences = clean_minute_sentences(minutes)
            clean_sentences = add_sentence_vectors(clean_sentences, word_embeddings)
            informative_sentences = calculate_similarity_and_rank(sentences, clean_sentences)
            # Join header, body and footer with newlines; plain concatenation glued the
            # first bullet onto the last header line and the footer onto the last bullet.
            final_minutes = '\n'.join(minutes[:5] + [informative_sentences] + minutes[-3:])

            with open(os.path.join(meeting_dir, file_name + '_final.txt'), 'w') as out_file:
                out_file.write(final_minutes)


In [50]:
# Re-rank the minutes found under minute_path_test and write them below output_minute_path_test.
rank_and_regenerate_minutes(minute_path_test, output_minute_path_test)

Total no. of sentences:  31
Total no. of sentences:  31
Total no. of sentences:  31
Total no. of sentences:  57
Total no. of sentences:  56
Total no. of sentences:  56
Total no. of sentences:  40
Total no. of sentences:  40
Total no. of sentences:  40
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  8
Total no. of sentences:  8
Total no. of sentences:  8
Total no. of sentences:  23
Total no. of sentences:  23
Total no. of sentences:  23


KeyboardInterrupt: 