# TEAM SYNAPSE : SUBMISSION FOR AUTOMIN 2023

In [112]:
# from google.colab import drive
# drive.mount('/content/drive')
#
# %cd drive/MyDrive/AutoMin
# !pip install -r requirements.txt

In [113]:
# Input directory with the preprocessed transcripts and output directory for the generated minutes.
PREPROCESSED_DIR = "preprocessed_data"
OUTPUT_DIR = "minutes"

# Dataset locations; they are joined onto PREPROCESSED_DIR for loading and onto OUTPUT_DIR for writing.
EUROPARL_DATA_PATH = "europarl/test1"
ELITR_DATA_PATH = "elitr/en/test2023-en"

# Short model name; also used as a path component of the output directory.
# MODEL_SHORT_NAME = "bart-large-xsum"
MODEL_SHORT_NAME = "MEETING_SUMMARY"

# Hub identifier passed to AutoTokenizer.from_pretrained.
# MODEL = f"facebook/{MODEL_SHORT_NAME}/"
MODEL = f"knkarthick/{MODEL_SHORT_NAME}"

# Model handed to the summarization pipeline (the commented line points at a local checkpoint instead).
# SUMMARIZER_MODEL = f"models/{MODEL_SHORT_NAME}/checkpoint-5500"
SUMMARIZER_MODEL = f"knkarthick/{MODEL_SHORT_NAME}"

In [114]:
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

import datetime
import json
import os
import re
import numpy as np
import pandas as pd
import networkx as nx
import math
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on:
# 'punkt' for sent_tokenize and 'stopwords' for stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/kristyna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kristyna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [115]:
# Report the available compute device; when CUDA is present, select device 0.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name())
    torch.cuda.set_device(0)

No GPU available, using the CPU instead.


In [116]:
# Tokenizer for measuring text length in tokens, plus the summarization pipeline itself.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Run the pipeline on GPU 0 when CUDA is available; otherwise leave the device unset.
if torch.cuda.is_available():
    device = 0
else:
    device = None

summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=device)

In [117]:
def load_preprocessed_transcripts(file_name):
  """Load preprocessed transcripts from ``<file_name>.json``.

  Args:
    file_name: Path of the JSON file WITHOUT the ``.json`` extension.

  Returns:
    The decoded JSON content.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
  """
  # Read as UTF-8 explicitly so the result does not depend on the platform's
  # default locale encoding (the transcripts contain non-ASCII characters).
  with open(f"{file_name}.json", "r", encoding="utf-8") as f:
    return json.load(f)

In [118]:
# Load both test sets; the loader appends the ".json" extension to the joined path.
elitr_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, ELITR_DATA_PATH))
europarl_preprocessed = load_preprocessed_transcripts(os.path.join(PREPROCESSED_DIR, EUROPARL_DATA_PATH))

In [168]:
def segment_transcript(max_input_length, transcript, tokenizer):
  """Pack a transcript into consecutive text segments below a token budget.

  Every utterance is rendered as ``"<role>: <utterance>\\n"``. Lines are appended
  to the current segment until adding one more would reach ``max_input_length``
  tokens, at which point a new segment is started. A single line that is already
  too long is split recursively (by sentences, then by words).

  Args:
    max_input_length: Token budget; a segment is closed once its tokenized
      length would be ``>= max_input_length``.
    transcript: Mapping with parallel sequences under ``'roles'`` and
      ``'utterances'``.
    tokenizer: Object exposing ``encode(text)`` whose result has a length.

  Returns:
    ``(segments, attendees)``: the list of non-empty segment strings and the
    sorted list of distinct roles.
  """
  def split_line(line, role, tokenizer):
    """Halve an over-long line, recursing while the pieces keep getting shorter."""
    prefix = role + ": "
    sentences = sent_tokenize(line)

    if len(sentences) >= 2:
      split_idx = len(sentences)//2
      # The first half still starts with the role prefix carried by `line`.
      # NOTE(review): the extra '.' doubles the sentence-final punctuation; kept as-is.
      line1 = " ".join(sentences[:split_idx]) + '.\n'
      # Terminate the second half with a newline so the next speaker's line
      # is not glued onto it.
      line2 = prefix + " ".join(sentences[split_idx:]) + '\n'
    else:
      # A single sentence cannot be halved by sentences (split_idx would be 0 and
      # the "second half" would be the whole line again), so split by words.
      content = line.strip()
      if content.startswith(prefix):
        content = content[len(prefix):]
      words = content.split()
      if len(words) < 2:
        # Nothing left to split; return the line unchanged (newline-terminated).
        return [line if line.endswith('\n') else line + '\n']
      split_idx = len(words)//2
      line1 = prefix + " ".join(words[:split_idx]) + '\n'
      line2 = prefix + " ".join(words[split_idx:]) + '\n'

    line_length = len(tokenizer.encode(line))
    splits = []
    for piece in (line1, line2):
      piece_length = len(tokenizer.encode(piece))
      # Recurse only while splitting makes progress; this guarantees termination.
      if max_input_length <= piece_length < line_length:
        splits += split_line(piece, role, tokenizer)
      else:
        splits.append(piece)

    return splits

  roles = transcript['roles']
  attendees = sorted(set(roles))
  utterances = transcript['utterances']
  segmented_transcript = [""]

  for role, utterance in zip(roles, utterances):
    line = role + ': ' + utterance + '\n'
    # TODO remove short lines?
    if len(tokenizer.encode(line)) >= max_input_length:
      line_splits = split_line(line, role, tokenizer)
    else:
      line_splits = [line]

    for line_split in line_splits:
      tokenized = tokenizer.encode(segmented_transcript[-1] + line_split)
      if len(tokenized) >= max_input_length:
        segmented_transcript.append(line_split)
      else:
        segmented_transcript[-1] += line_split

  # Drop empty segments (e.g. the initial "" when the very first piece did not
  # fit) so that no empty string is handed to the summarizer.
  segmented_transcript = [segment for segment in segmented_transcript if segment]

  return segmented_transcript, attendees

In [157]:
# Try the three token budgets on a single ELITR meeting.
m_id = 'meeting_en_test2023_001'

# NOTE(review): the long/avg/short suffixes presumably describe the length of the resulting
# minutes, not the budget: the smallest budget (512) is bound to the "long" variable.
segmented_transcript_long, attendees_long = segment_transcript(512, elitr_preprocessed[m_id], tokenizer)
segmented_transcript_avg, _ = segment_transcript(768, elitr_preprocessed[m_id], tokenizer)
segmented_transcript_short, _ = segment_transcript(1024, elitr_preprocessed[m_id], tokenizer)

# Show the first segment produced with the 1024-token budget.
print(segmented_transcript_short[0])

PERSON6: Hi, hello. Can you hear me?
PERSON2: Yes.
PERSON6: I do not hear anyone. I have to reconnect again. Or maybe can someone else, give it a try as well, because –
PERSON2: I can hear you.
PERSON6: PERSON11, can you say something as well. Because with PERSON2, I remember that PERSON2 also had some microphone issues at times.
PERSON2: And you can hear – You cannot hear me or –
PERSON6: Yeah. So. I'll try to reconnect. I'll – Yeah – So, PERSON8. Hello, can you say something?
PERSON8: Oh yeah. Hi.
PERSON6: Oh, yeah. I. I have to reconnect. Making PERSON8 the host, for now. Yeah. So, I'll make PERSON8 the host for now, and I reconnect. Leave meeting.
PERSON8: Oh, okay. I hear you, by the way. Hello.
PERSON6: Yeah, yeah. Now I can hear it. So, for some reason I have always to connect a few times until zoom starts sending also the sound to me. So, It's annoying. Yeah. So. I was in a call until the very last minute. So, sorry that I didn't remind everybody to – to connect. And – I'm happ

In [134]:
# Number of segments for each budget, printed in the order short, avg, long.
for segments in (segmented_transcript_short, segmented_transcript_avg, segmented_transcript_long):
  print(len(segments))

12
17
26


In [135]:
def summarize(input_text, summarizer):
  """Return the stripped ``summary_text`` of the first result of ``summarizer(input_text)``."""
  first_result = summarizer(input_text)[0]
  return first_result["summary_text"].strip()

def generate_summary(segmented_transcript, summarizer):
  """Summarize every segment in order and join the summaries with single spaces."""
  summarized_segments = []
  for transcript_segment in segmented_transcript:
    summarized_segments.append(summarize(transcript_segment, summarizer))
  return " ".join(summarized_segments)

In [136]:
# Summary built from the 1024-token segmentation; the bare name displays it as the cell output.
summary_short = generate_summary(segmented_transcript_short, summarizer)
summary_short

"PERSON6, PERSON8, PERSON11, PERSON2, PERSON1 and PERSON12 are on a conference call. They are discussing some technical issues with the system. PERSON11 is interested in doing offline subtitling for the upcoming sessions. PERSON6 would like to implement natural shortening in the model as well. PERSON6 and PERSON4 are preparing the setups for PROJECT2 and PROJECT4 is working on the Monday seminar model. PERSON6 advises to update the pipelines on Friday. PERSON8 is doing some ASR today. Some videos are not properly converted into a 16 c format and some audio files are not working properly. PERSON1 is going to be busy tomorrow and over the weekend, so today is the last chance for PERSON6 to get it running before the weekend. PERSON8 will ask PERSON10 if he can help. PERSON4 is not taking part in the call with PERSON1 as well. PERSON4 and PERSON2 are on a call. PERSON4 explains to PERSON2 how to create a custom word list and how to use it. PERSON2 is finishing training of a German ASR that

In [137]:
# Summary built from the 768-token segmentation; the bare name displays it as the cell output.
summary_avg = generate_summary(segmented_transcript_avg, summarizer)
summary_avg

Your max_length is set to 62, but you input_length is only 50. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)


"PERSON6, PERSON8, PERSON11, PERSON1 and PERSON12 are on a conference call. They are discussing technical issues with the call. PERSON6 supports the idea of training empty systems to do speech translation. PERSON11 is thinking about joining the call. There are no students in Edinburgh who would have the time to do that. The only other candidates would be, PERSON13 from the PROJECT1, who works halfway in Prague and halfway in Brno. The webpage for PROJECT2 PERSON6 suggests to have a dry run of some past similar talks to PROJECT2 talks and to do the same thing with the Supreme Audit talks. PERSON4 is afraid the files won't be ready till tomorrow. PERSON8 can operate two local machines, Ark and blackbird, PERSON8 is compiling some ASR files for the leader test set. Some of the videos are not properly converted into a 16 c format and some audio files are not working properly. PERSON6, PERSON8, PERSON1 and PERSON11 are working on a system evaluation. It's their last chance to get it running

In [138]:
# Summary built from the 512-token segmentation; the bare name displays it as the cell output.
summary_long = generate_summary(segmented_transcript_long, summarizer)
summary_long

"PERSON6 was in a call until the very last minute, so he didn't remind everybody to connect. His machine crashed and he had to restart it. Now he can hear people. The transcripts for check are almost done. The transcriber of Germany's working until December the 10th. PERSON1's mother is writing the question for the German subti- subtitle user study. PERSON6 is starting to write deliverables. PERSON11 has just added the transcripts for the The meme project does offline subtitling. Their mission is to create subtitles for the hearing-impaired. Their models work with sequence to sequence. They generate one word at a time. It would be possible to implement natural shortening in their models. PERSON11 is thinking about taking over the work of PERSON3, which is not ready for speech translation. PERSON6 wants an update for the sessions for the next week. PERSON4 will work on the webpage for PROJECT2 tomorrow. PERSON4 is away from 11 to 12 on Friday from ten to twelve. PERSON6, PERSON4 and PER

In [171]:
def fix_entities(text):
  """Restore upper-case entity placeholders such as ``PERSON6``.

  Any capitalized label below, followed by optional whitespace and digits
  (e.g. ``"Person 6"`` or ``"Url2"``), is rewritten to the upper-case label
  immediately followed by the digits. Other text is returned unchanged.
  """
  entity_labels = (
    "Person", "Organization", "Project", "Location", "Annotator", "Url",
    "Number", "Password", "Phone", "Path", "Email", "Other",
  )

  for label in entity_labels:
    placeholder = label.upper()
    # Bind the placeholder as a default so each callback uses its own label.
    text = re.sub(label + r"\s*(\d+)", lambda m, placeholder=placeholder: placeholder + m.group(1), text)

  return text

def create_minutes(summary):
  """Turn a summary into a bullet list: one ``"- <sentence>"`` line per sentence.

  Entity placeholders are normalized with ``fix_entities`` before the text is
  split into sentences.
  """
  bullet_lines = []
  for sent in sent_tokenize(fix_entities(summary)):
    bullet_lines.append(f"- {sent}")

  return "\n".join(bullet_lines)

def format_minutes(attendees, minutes):
  """Wrap the bullet list in the final layout: date, attendees, summary body and signature.

  The result has five header lines before the body and three footer lines after it.
  """
  template = "DATE : {tday}\nATTENDEES : {att}\n\n\nSUMMARY\n{minutes}\n\n\nMinuted by: Team Synapse"
  return template.format(
    tday=datetime.date.today(),
    att=", ".join(attendees),
    minutes=minutes,
  )

In [172]:
# Preview the formatted minutes for the summary built from the 512-token segmentation.
print(format_minutes(attendees_long, create_minutes(summary_long)))

DATE : 2023-05-02
ATTENDEES : PERSON1, PERSON10, PERSON11, PERSON2, PERSON4, PERSON6, PERSON7, PERSON8


SUMMARY-
- PERSON6 was in a call until the very last minute, so he didn't remind everybody to connect.
- His machine crashed and he had to restart it.
- Now he can hear people.
- The transcripts for check are almost done.
- The transcriber of Germany's working until December the 10th.
- PERSON1's mother is writing the question for the German subti- subtitle user study.
- PERSON6 is starting to write deliverables.
- PERSON11 has just added the transcripts for the The meme project does offline subtitling.
- Their mission is to create subtitles for the hearing-impaired.
- Their models work with sequence to sequence.
- They generate one word at a time.
- It would be possible to implement natural shortening in their models.
- PERSON11 is thinking about taking over the work of PERSON3, which is not ready for speech translation.
- PERSON6 wants an update for the sessions for the next week.

In [166]:
def generate_minutes(preprocessed_transcripts, output_dir, tokenizer, summarizer, lengths=(512, 768, 1024)):
  """Generate minutes for every meeting, once per token budget.

  For each meeting and each budget the transcript is segmented, summarized and
  formatted, and the result is written to
  ``<output_dir>/<meeting_id>/length_<budget>.txt``.

  Args:
    preprocessed_transcripts: Mapping of meeting id to transcript.
    output_dir: Root directory for the generated files (created if missing).
    tokenizer: Tokenizer passed on to ``segment_transcript``.
    summarizer: Summarizer passed on to ``generate_summary``.
    lengths: Token budgets to generate minutes for; defaults to the three
      budgets used throughout this notebook.
  """
  for meeting_id, transcript in preprocessed_transcripts.items():
    # The meeting directory does not depend on the budget: create it once.
    meeting_dir = os.path.join(output_dir, meeting_id)
    os.makedirs(meeting_dir, exist_ok=True)

    for length in lengths:
      segmented_transcript, attendees = segment_transcript(length, transcript, tokenizer)
      summary = generate_summary(segmented_transcript, summarizer)
      minutes = format_minutes(attendees, create_minutes(summary))

      # Write as UTF-8 so non-ASCII characters cannot fail on platforms whose
      # default encoding is narrower.
      with open(os.path.join(meeting_dir, f"length_{length}.txt"), "w", encoding="utf-8") as f:
        f.write(minutes)

In [169]:
# Generate minutes for both test sets under OUTPUT_DIR/MODEL_SHORT_NAME/<dataset path>.
generate_minutes(elitr_preprocessed, os.path.join(OUTPUT_DIR, MODEL_SHORT_NAME, ELITR_DATA_PATH), tokenizer, summarizer)
generate_minutes(europarl_preprocessed, os.path.join(OUTPUT_DIR, MODEL_SHORT_NAME, EUROPARL_DATA_PATH), tokenizer, summarizer)

KeyboardInterrupt: 

# TextRank Script for ranking sentences
This method uses GloVe embeddings to compute pairwise cosine similarity between sentences, and ranks the individual sentences with the PageRank algorithm.

In [None]:
# Download the GloVe 6B archive, extract it into models/glove and delete the archive afterwards.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip -d models/glove
!rm -rf glove*.zip

In [41]:
def get_minutes(data_path):
    """Read every minutes file stored one directory level below ``data_path``.

    Args:
        data_path: Directory containing one sub-directory per meeting. A
            trailing path separator is optional.

    Returns:
        ``{directory_name: {file_name: [line, ...]}}`` with directories and
        files visited in sorted order. Only immediate sub-directories and the
        regular files directly inside them are read.
    """
    minute_files = {}
    # List only the direct children: walking recursively would also pick up
    # nested directories (and the root itself when there is no trailing slash)
    # and then resolve their base names against the wrong parent.
    for directory in sorted(os.listdir(data_path)):
        directory_path = os.path.join(data_path, directory)
        if not os.path.isdir(directory_path):
            continue

        minute_files[directory] = {}
        for file_name in sorted(os.listdir(directory_path)):
            file_path = os.path.join(directory_path, file_name)
            # Skip sub-directories; open() would fail on them.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, "r", encoding="utf-8") as f:
                minute_files[directory][file_name] = f.read().splitlines()
    return minute_files

In [42]:
# Input directories with the generated minutes and the 'final/' sub-directories for the re-ranked output.
# NOTE(review): each output directory is nested inside its input directory — confirm this is intended.
elitr_minute_path_test = '../automin2023/minutes/MEETING_SUMMARY/elitr/en/test2023-en/'
elitr_output_minute_path_test = '../automin2023/minutes/MEETING_SUMMARY/elitr/en/test2023-en/final/'

europarl_minute_path_test = '../automin2023/minutes/MEETING_SUMMARY/europarl/test1/'
europarl_output_minute_path_test = '../automin2023/minutes/MEETING_SUMMARY/europarl/test1/final/'

In [48]:
def get_sentence_vectors(row, word_embeddings):
    """Average the word embeddings of ``row['sentence']`` into one 100-d vector.

    Args:
        row: Mapping-like object with a ``'sentence'`` string (whitespace-separated words).
        word_embeddings: Mapping of word to a 100-d vector; unknown words
            contribute a zero vector.

    Returns:
        A 1-D array of length 100. The sum of the word vectors is divided by
        ``word_count + 0.001``; a sentence without any words yields all zeros.
    """
    embedding_dim = 100
    words = row['sentence'].split()

    # Check the word list rather than the raw string: a whitespace-only sentence
    # has no words and would otherwise produce the scalar 0.0 instead of a vector.
    if not words:
        return np.zeros((embedding_dim,))

    vector_sum = sum([word_embeddings.get(w, np.zeros((embedding_dim,))) for w in words])
    return vector_sum/(len(words)+0.001)

def add_sentence_vectors(clean_sentences, word_embeddings):
  """Add a ``'sentence_vector'`` column computed row by row with ``get_sentence_vectors``.

  The column is assigned on the DataFrame that was passed in (it is modified in
  place) and the same DataFrame is returned.
  """

  # EXTRACT SENTENCE VECTORS
  clean_sentences['sentence_vector'] = clean_sentences.apply(lambda x: get_sentence_vectors(x, word_embeddings), axis=1)
  return clean_sentences


def clean_minute_sentences(summary):
    """Extract the sentences of a minutes file and a cleaned copy for embedding.

    Args:
        summary: The lines of one minutes file. The first 5 lines (header) and
            the last 3 lines (footer) are skipped.

    Returns:
        ``(sentences, clean_sentences)`` where ``sentences`` is a list of
        ``(order, sentence)`` tuples in document order and ``clean_sentences``
        is a DataFrame with columns ``'order'`` and ``'sentence'`` holding the
        lower-cased, letters-only, stopword-free version of each sentence.
    """
    body_lines = summary[5:-3]

    text = ''
    for line in body_lines:
        # Remove the leading "- " bullet of each minutes line; the ranking step
        # adds its own bullet, so keeping it would produce "- - ..." lines.
        if line.startswith('- '):
            line = line[2:]
        line = line.replace(' -', '')
        line = line.replace('  ', '')
        line = line.replace('\n', '')
        text = text + line + ' '

    sentences = list(enumerate(sent_tokenize(text)))
    print('Total no. of sentences: ', len(sentences))

    clean_sentences = pd.DataFrame(sentences, columns = ['order', 'sentence'])

    # REMOVE PUNCTUATIONS, NUMBERS AND SPECIAL CHARACTERS, THEN LOWERCASE
    # regex=True is required: without it pandas >= 2.0 treats the pattern as a
    # literal string and nothing is replaced.
    clean_sentences['sentence'] = clean_sentences['sentence'].str.replace("[^a-zA-Z]", " ", regex=True).str.lower()

    # REMOVE STOPWORDS (a set makes the per-word membership test cheap)
    stop_words = set(stopwords.words('english'))

    def remove_stopwords(words):
        return " ".join([word for word in words if word not in stop_words])

    clean_sentences['sentence'] = clean_sentences['sentence'].apply(lambda x: remove_stopwords(x.split()))
    return sentences, clean_sentences



def calculate_similarity_and_rank(sentences, clean_sentences, rem_perc=0.15, min_sentences=5):
  """Rank sentences with PageRank over cosine similarity and drop the weakest ones.

  Args:
    sentences: List of ``(order, sentence)`` tuples.
    clean_sentences: Object whose ``'sentence_vector'`` entry can be indexed by
      position ``0..len(sentences)-1`` and yields one equal-length vector per
      sentence.
    rem_perc: Fraction of sentences to remove (rounded up).
    min_sentences: Sentences are only removed when more than this many would
      remain; otherwise all sentences are kept.

  Returns:
    The kept sentences in their original order, one ``"- <sentence>"`` line
    each, joined by newlines. An empty input yields ``''``.
  """
  if len(sentences) == 0:
    return ''

  # SIMILARITY MATRIX: one pairwise computation instead of one call per pair.
  vectors = np.vstack([clean_sentences['sentence_vector'][i] for i in range(len(sentences))])
  sim_mat = cosine_similarity(vectors).astype(float)
  # A sentence is not compared with itself.
  np.fill_diagonal(sim_mat, 0.0)

  # PAGERANK SCORING
  nx_graph = nx.from_numpy_array(sim_mat)
  scores = nx.pagerank(nx_graph)
  ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

  # PERCENTAGE OF SENTENCES TREATED AS UNINFORMATIVE; USUALLY AROUND ~15% FOR
  # THE MINUTES BELONGING TO A LENGTHY TRANSCRIPT
  remove_count = math.ceil(len(sentences)*rem_perc)
  final_sentence_count = len(ranked_sentences)
  if len(ranked_sentences)-remove_count > min_sentences:
    final_sentence_count = len(ranked_sentences) - remove_count

  informative_sentences = [sentence for _, sentence in ranked_sentences[:final_sentence_count]]

  # Restore document order using the order index (element 0 of each tuple);
  # sorting on sentence[1][0] would order by the first character of the text.
  informative_sentences.sort(key=lambda sentence: sentence[0])

  return '\n'.join('- ' + sentence[1] for sentence in informative_sentences)

In [50]:
def rank_and_regenerate_minutes(input_path, output_path, glove_path='models/glove/glove.6B.100d.txt'):
    """Re-rank the sentences of every minutes file and write the filtered minutes.

    Args:
        input_path: Directory with one sub-directory of minutes files per meeting.
        output_path: Directory receiving ``<meeting>/<file_name>_final.txt``.
        glove_path: Text file with one word followed by its vector per line.

    Each output file keeps the first 5 and last 3 lines of the input and replaces
    the lines in between with the ranked bullet list.
    """
    minute_files = get_minutes(input_path)

    # EXTRACT WORD VECTORS
    word_embeddings = {}
    # The context manager closes the file even if parsing a line fails.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

    # exist_ok lets the cell be re-run without failing on existing directories.
    os.makedirs(output_path, exist_ok=True)
    for min_id, files in minute_files.items():
        if not files:
            # Nothing to regenerate for a directory without minutes files.
            continue

        print('Meeting ID: ', min_id)
        meeting_dir = os.path.join(output_path, min_id)
        os.makedirs(meeting_dir, exist_ok=True)

        for file_name, minutes in files.items():
            sentences, clean_sentences = clean_minute_sentences(minutes)
            clean_sentences = add_sentence_vectors(clean_sentences, word_embeddings)
            informative_sentences = calculate_similarity_and_rank(sentences, clean_sentences)

            # Join header, body and footer with newlines throughout; previously the
            # separator between the last bullet and the footer was missing.
            final_minutes = '\n'.join(list(minutes[:5]) + [informative_sentences] + list(minutes[-3:]))

            with open(os.path.join(meeting_dir, file_name + '_final.txt'), 'w', encoding='utf-8') as out_file:
                out_file.write(final_minutes)


In [51]:
# Re-rank the ELITR minutes and write the filtered versions to the ELITR output directory.
rank_and_regenerate_minutes(elitr_minute_path_test, elitr_output_minute_path_test)

Meeting ID:  meeting_en_test2023_001
Total no. of sentences:  30
Total no. of sentences:  68
Total no. of sentences:  33
Meeting ID:  meeting_en_test2023_002
Total no. of sentences:  30
Total no. of sentences:  53
Total no. of sentences:  39
Meeting ID:  meeting_en_test2023_003
Total no. of sentences:  30
Total no. of sentences:  63
Total no. of sentences:  45
Meeting ID:  meeting_en_test2023_004
Total no. of sentences:  17
Total no. of sentences:  31
Total no. of sentences:  25
Meeting ID:  meeting_en_test2023_005
Total no. of sentences:  36
Total no. of sentences:  63
Total no. of sentences:  43
Meeting ID:  meeting_en_test2023_006
Total no. of sentences:  30
Total no. of sentences:  62
Total no. of sentences:  43
Meeting ID:  meeting_en_test2023_007
Total no. of sentences:  21
Total no. of sentences:  37
Total no. of sentences:  26
Meeting ID:  meeting_en_test2023_008
Total no. of sentences:  20
Total no. of sentences:  34
Total no. of sentences:  24
Meeting ID:  meeting_en_test2023

In [52]:
# Re-rank the Europarl minutes and write the filtered versions to the Europarl output directory.
rank_and_regenerate_minutes(europarl_minute_path_test, europarl_output_minute_path_test)

Meeting ID:  2008-03-11-ch003-00
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Meeting ID:  2008-03-12-ch004-00
Total no. of sentences:  3
Total no. of sentences:  3
Total no. of sentences:  3
Meeting ID:  2008-03-12-ch012-00
Total no. of sentences:  7
Total no. of sentences:  21
Total no. of sentences:  17
Meeting ID:  2008-04-10-ch009-12
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Meeting ID:  2008-04-10-ch009-13
Total no. of sentences:  3
Total no. of sentences:  3
Total no. of sentences:  3
Meeting ID:  2008-04-10-ch010-00
Total no. of sentences:  2
Total no. of sentences:  2
Total no. of sentences:  2
Meeting ID:  2008-04-10-ch011-01
Total no. of sentences:  1
Total no. of sentences:  1
Total no. of sentences:  1
Meeting ID:  2008-04-10-ch011-02
Total no. of sentences:  3
Total no. of sentences:  3
Total no. of sentences:  3
Meeting ID:  2008-04-21-ch016-00
Total no. of sentences:  8
Total no. of sentences:  2