In [10]:
import gzip
import pandas as pd

In [11]:
gz_file_path = 'meetingBank_styled.csv.gz'
csv_output_path = 'meetingBank_styled.csv'

# Step 1: Uncompress the .gz file
# Copy raw bytes rather than decoding and re-encoding text: opening the
# archive in text mode without an explicit encoding decodes with the
# platform's default encoding (not necessarily UTF-8) and translates line
# endings, which can corrupt non-ASCII characters or newlines embedded in
# quoted CSV fields.
with gzip.open(gz_file_path, 'rb') as f_in:
    with open(csv_output_path, 'wb') as f_out:
        # Stream in 1 MiB chunks so the whole file is never held in memory.
        for chunk in iter(lambda: f_in.read(1024 * 1024), b''):
            f_out.write(chunk)

# Step 2: Read the resulting CSV file (optional)
df = pd.read_csv(csv_output_path)


In [12]:
# Preview the first rows of the loaded DataFrame.
# NOTE(review): in the rendered output, row 0 has transcript == "transcript"
# (word_count 1.0, avg_word_len 10.0) — looks like a stray header line that was
# stored as data; confirm against the source file before relying on it.
df.head()

Unnamed: 0,transcript,word_count,sentence_count,motion_count,avg_word_len,sentiment
0,transcript,1.0,1.0,0.0,10.0,0.0
1,Please refrain from profane or obscene speech....,27456.0,2207.0,28.0,4.3,0.13
2,An assessment has called out council bill 161 ...,3107.0,265.0,20.0,4.3,0.12
3,I Please close the voting. Announce the result...,620.0,50.0,5.0,4.3,0.02
4,Motion passes. Hey thank you very much. Now we...,989.0,59.0,3.0,4.8,0.09


In [13]:
# Print the column dtypes, non-null counts and memory usage of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6893 entries, 0 to 6892
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transcript      6893 non-null   object 
 1   word_count      6893 non-null   float64
 2   sentence_count  6893 non-null   float64
 3   motion_count    6893 non-null   float64
 4   avg_word_len    6893 non-null   float64
 5   sentiment       6893 non-null   float64
dtypes: float64(5), object(1)
memory usage: 323.2+ KB


In [14]:
# Print the column labels of the DataFrame.
print(df.columns)

Index(['transcript', 'word_count', 'sentence_count', 'motion_count',
       'avg_word_len', 'sentiment'],
      dtype='object')


In [15]:
# ---- Install Dependencies ----
import sys
import subprocess
import nltk
from tqdm.notebook import tqdm

required_libs = ['summa', 'sumy', 'nltk', 'transformers>=4.18.0', 'sentencepiece']


def _module_name(requirement):
    """Return the importable module name for a pip requirement string.

    The string is cut at the first version-specifier, extras or marker
    character, e.g. ``'transformers>=4.18.0'`` -> ``'transformers'``.
    Assumes the module name equals the distribution name, which holds for
    every entry in ``required_libs``.
    """
    name = requirement
    for marker in ('<', '>', '=', '!', '~', '[', ';', ' '):
        name = name.split(marker, 1)[0]
    return name


for lib in required_libs:
    try:
        # Import by bare module name. Splitting only on '==' left
        # 'transformers>=4.18.0' intact, so the import always failed and pip
        # was re-run on every execution.
        __import__(_module_name(lib))
    except ModuleNotFoundError:
        # Install using the full requirement string so version constraints
        # apply. pip output is suppressed; a failed install still raises
        # subprocess.CalledProcessError.
        # NOTE: the version constraint is not checked when the module is
        # already importable.
        subprocess.check_call([sys.executable, "-m", "pip", "install", lib],
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)

nltk.download('punkt', quiet=True)
# Newer NLTK releases load sentence-tokenizer data from 'punkt_tab' instead of
# the pickled 'punkt' package; fetch both so tokenization works on either.
nltk.download('punkt_tab', quiet=True)

# ---- Optimized Imports ----
import pandas as pd
from summa import summarizer as textrank_summarize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import torch
from transformers import pipeline

# ---- Data Loading ----
# Best-effort load of the first 500 rows. On any failure the notebook continues
# with an empty frame, but the reason is reported so that a run producing zero
# summaries is not mistaken for a successful one.
try:
    df = pd.read_csv('meetingBank_styled.csv', nrows=500)
    # Ensure transcript column exists
    if 'transcript' not in df.columns:
        print("Data loading warning: 'transcript' column not found; using empty transcripts")
        df['transcript'] = ''
except Exception as e:
    print(f"Data loading failed: {str(e)}")
    df = pd.DataFrame(columns=['transcript'])

# ---- Initialize Models ----
# Shared LexRank summarizer instance, reused for every row.
lex_summarizer = LexRankSummarizer()
# Flipped to True only after the transformers pipeline loads successfully.
bert_available = False

# ---- GPU Optimization ----
# Device index handed to the transformers pipeline: 0 when CUDA is available,
# otherwise -1.
device = 0 if torch.cuda.is_available() else -1
if device == 0:
    torch.backends.cudnn.benchmark = True

# ---- BERT Initialization ----
# Despite the "BERT" naming, the checkpoint loaded here is a DistilBART model.
# Any failure leaves bert_available False, in which case generate_bertsum
# returns empty summaries.
try:
    pipeline_options = {
        "model": "sshleifer/distilbart-cnn-12-6",
        "device": device,
        "framework": "pt",
        "truncation": True,
    }
    bertsum_pipe = pipeline("summarization", **pipeline_options)
    # Warm-up: one throwaway call when running on the GPU.
    if device == 0:
        _ = bertsum_pipe("Warm up", max_length=30, min_length=10)
except Exception as e:
    print(f"BERT initialization failed: {str(e)}")
else:
    bert_available = True

# ---- Pre-allocated Results ----
# One slot per DataFrame row; process_row fills slot idx for row idx.
n_rows = len(df)
results_with_meta = [None for _ in range(n_rows)]
results_without_meta = [None for _ in range(n_rows)]

# ---- Optimized Processing Functions ----
def preprocess_text(text):
    """Normalise a transcript before summarization.

    Returns the input stripped of leading/trailing whitespace when it is a
    string containing at least three whitespace-separated words; returns an
    empty string for non-string input or shorter text.
    """
    if isinstance(text, str):
        stripped = text.strip()
        if len(stripped.split()) >= 3:
            return stripped
    return ""

def generate_textrank(text):
    """Summarise ``text`` with summa's TextRank summarizer (``ratio=0.3``).

    Returns:
        "" when ``preprocess_text`` rejects the input; otherwise the summary.
        If the summarizer returns an empty result or raises, the first 300
        characters of the text followed by "..." are returned instead.

    A failure is reported through ``warnings.warn`` rather than swallowed
    silently, so fallback output can be told apart from a real summary.
    """
    import warnings  # local import keeps this function self-contained

    text = preprocess_text(text)
    if not text:
        return ""
    fallback = text[:300] + "..."
    try:
        return textrank_summarize.summarize(text, ratio=0.3) or fallback
    except Exception as e:
        warnings.warn(
            f"TextRank summarization failed; using truncated text instead: {e!r}",
            RuntimeWarning,
        )
        return fallback

def generate_lexrank(text):
    """Summarise ``text`` with the shared sumy LexRank summarizer.

    The text is parsed with sumy's English tokenizer and the summarizer is
    asked for 4 sentences, which are joined with single spaces.

    Returns:
        "" when ``preprocess_text`` rejects the input; otherwise the summary.
        If the summary is empty or any step raises, the first 300 characters
        of the text followed by "..." are returned instead.

    A failure is reported through ``warnings.warn`` rather than swallowed
    silently: a systematic problem (for example missing tokenizer data) would
    otherwise make every "summary" a truncated transcript with no visible
    error.
    """
    import warnings  # local import keeps this function self-contained

    text = preprocess_text(text)
    if not text:
        return ""
    fallback = text[:300] + "..."
    try:
        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        return ' '.join(str(s) for s in lex_summarizer(parser.document, 4)) or fallback
    except Exception as e:
        warnings.warn(
            f"LexRank summarization failed; using truncated text instead: {e!r}",
            RuntimeWarning,
        )
        return fallback

def generate_bertsum(text):
    """Summarise ``text`` with the transformers summarization pipeline.

    Returns:
        "" when the pipeline is unavailable (``bert_available`` is False) or
        ``preprocess_text`` rejects the input; the pipeline's
        ``summary_text`` on success; the marker "[BERT FAILED]" if the
        pipeline call raises.

    The underlying exception is reported through ``warnings.warn`` so the
    cause of a "[BERT FAILED]" marker is not lost.
    """
    import warnings  # local import keeps this function self-contained

    if not bert_available:
        return ""
    text = preprocess_text(text)
    if not text:
        return ""

    # Keep only the first 800 whitespace-separated words before calling the
    # model; truncation=True additionally lets the pipeline truncate the input.
    words = text.split()[:800]
    text = ' '.join(words)

    try:
        summary = bertsum_pipe(
            text,
            max_length=130,
            min_length=30,
            do_sample=False,
            truncation=True,
            num_beams=4
        )
        return summary[0]['summary_text']
    except Exception as e:
        warnings.warn(f"BERT summarization failed: {e!r}", RuntimeWarning)
        return "[BERT FAILED]"

# ---- Fixed Processing Function ----
def process_row(idx, row):
    """Summarise one row and store the results at position ``idx``.

    Args:
        idx: Slot in ``results_with_meta`` / ``results_without_meta`` to fill.
        row: A row object exposing a ``transcript`` attribute (a named tuple
            from ``DataFrame.itertuples`` in the main loop); a missing
            attribute is treated as an empty transcript.

    Side effects:
        Writes a dict with all row fields plus the three summaries into
        ``results_with_meta[idx]`` and a dict with only the transcript and
        the three summaries into ``results_without_meta[idx]``.
    """
    transcript = getattr(row, 'transcript', "")

    # Run the three summarizers in a fixed order: TextRank, LexRank, BERT.
    summaries = {
        'textrank_summary': generate_textrank(transcript),
        'lexrank_summary': generate_lexrank(transcript),
        'bertsum_summary': generate_bertsum(transcript),
    }

    # Named tuples convert directly; anything else is read column by column.
    if hasattr(row, '_asdict'):
        row_dict = row._asdict()
    else:
        row_dict = {col: getattr(row, col) for col in df.columns}

    results_with_meta[idx] = {**row_dict, **summaries}
    results_without_meta[idx] = {'transcript': transcript, **summaries}

# ---- Main Processing Loop ----
# Summarise every row in order, advancing the progress bar once per row.
with tqdm(total=len(df), desc="Processing rows") as pbar:
    row_stream = enumerate(df.itertuples(index=False))
    for idx, row in row_stream:
        process_row(idx, row)
        pbar.update(1)

# ---- Save Results ----
# Write each result list to its own CSV, skipping any slot that was never filled.
for collected, out_path in (
    (results_with_meta, 'summaries_with_metadata.csv'),
    (results_without_meta, 'summaries_without_metadata.csv'),
):
    filled = [entry for entry in collected if entry is not None]
    pd.DataFrame(filled).to_csv(out_path, index=False)

Device set to use cuda:0
Your max_length is set to 30, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)


Processing rows:   0%|          | 0/500 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [16]:
# Reload the two exported summary tables from disk:
# df1 = all original columns plus the three summary columns,
# df2 = transcript plus the three summary columns only.
df1 = pd.read_csv('summaries_with_metadata.csv')
df2 = pd.read_csv('summaries_without_metadata.csv')

In [17]:
# Display the summaries-with-metadata table (the rendered output elides the middle rows).
df1

Unnamed: 0,transcript,word_count,sentence_count,motion_count,avg_word_len,sentiment,textrank_summary,lexrank_summary,bertsum_summary
0,transcript,1.0,1.0,0.0,10.0,0.00,,,
1,Please refrain from profane or obscene speech....,27456.0,2207.0,28.0,4.3,0.13,Yes President Brooks I move that council bill ...,Please refrain from profane or obscene speech....,Councilwoman Gilmore will you please put Coun...
2,An assessment has called out council bill 161 ...,3107.0,265.0,20.0,4.3,0.12,An assessment has called out council bill 161 ...,An assessment has called out council bill 161 ...,Councilman Lopez makes motion to take bill 16...
3,I Please close the voting. Announce the result...,620.0,50.0,5.0,4.3,0.02,Nine Ice nine Ice Council Bill 153 has been or...,I Please close the voting. Announce the result...,Council Bill 153 has been ordered publish . F...
4,Motion passes. Hey thank you very much. Now we...,989.0,59.0,3.0,4.8,0.09,Communication from Vice Mayor Richardson recom...,Motion passes. Hey thank you very much. Now we...,The committee made some fairly substantive ch...
...,...,...,...,...,...,...,...,...,...
495,. Our last item on the agenda is proposed ordi...,5520.0,364.0,17.0,4.4,0.06,. Our last item on the agenda is proposed ordi...,. Our last item on the agenda is proposed ordi...,Proposed ordinance 2020 20180 proposed to the...
496,We're under ordinances. Item 16. Item 16. Comm...,4185.0,337.0,45.0,4.2,0.14,Read the first time and lead over to the next ...,We're under ordinances. Item 16. Item 16. Comm...,"Councilwoman Price: ""On this issue I think we..."
497,Behavioral health climate and mobility. Arts a...,190.0,11.0,0.0,4.8,0.10,This grant payment is made from the coronaviru...,Behavioral health climate and mobility. Arts a...,The city of Boston is authorized to accept an...
498,13 Eyes. 13 Eyes. Final Consideration of Counc...,228.0,32.0,6.0,4.7,0.05,Final Consideration of Council Bill 20 2-0003 ...,13 Eyes. 13 Eyes. Final Consideration of Counc...,Final Consideration of Council Bill 20 2-0003...


In [18]:
# Display the summaries-without-metadata table (the rendered output elides the middle rows).
df2

Unnamed: 0,transcript,textrank_summary,lexrank_summary,bertsum_summary
0,transcript,,,
1,Please refrain from profane or obscene speech....,Yes President Brooks I move that council bill ...,Please refrain from profane or obscene speech....,Councilwoman Gilmore will you please put Coun...
2,An assessment has called out council bill 161 ...,An assessment has called out council bill 161 ...,An assessment has called out council bill 161 ...,Councilman Lopez makes motion to take bill 16...
3,I Please close the voting. Announce the result...,Nine Ice nine Ice Council Bill 153 has been or...,I Please close the voting. Announce the result...,Council Bill 153 has been ordered publish . F...
4,Motion passes. Hey thank you very much. Now we...,Communication from Vice Mayor Richardson recom...,Motion passes. Hey thank you very much. Now we...,The committee made some fairly substantive ch...
...,...,...,...,...
495,. Our last item on the agenda is proposed ordi...,. Our last item on the agenda is proposed ordi...,. Our last item on the agenda is proposed ordi...,Proposed ordinance 2020 20180 proposed to the...
496,We're under ordinances. Item 16. Item 16. Comm...,Read the first time and lead over to the next ...,We're under ordinances. Item 16. Item 16. Comm...,"Councilwoman Price: ""On this issue I think we..."
497,Behavioral health climate and mobility. Arts a...,This grant payment is made from the coronaviru...,Behavioral health climate and mobility. Arts a...,The city of Boston is authorized to accept an...
498,13 Eyes. 13 Eyes. Final Consideration of Counc...,Final Consideration of Council Bill 20 2-0003 ...,13 Eyes. 13 Eyes. Final Consideration of Counc...,Final Consideration of Council Bill 20 2-0003...
