# Text Summarization EDA

## 01. Install and Import Libraries

In [1]:
!pip install textblob plotly



In [3]:
import re                                                                 # This library allows us to clean text data
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.figure_factory as ff

from scipy.stats import gaussian_kde
from plotly.subplots import make_subplots
from textblob import TextBlob                                             # Fix spelling mistakes in texts
from sklearn.feature_extraction.text import TfidfVectorizer               # Identify most common terms in the corpus

In [4]:
# Notebook-wide settings
template = 'plotly_dark'    # Plotly layout template passed to every figure
colormap = 'cividis'        # Colorscale used by the correlation heatmaps

seed = 42                   # Random seed (not referenced by the cells visible here)

## 02. Data Visualization

In [24]:
def display_feature_list(features, feature_type):
    '''
    Print a header for one feature group followed by its members.

    Parameters:
    features = List of column names belonging to the group
    feature_type = Label of the group, printed in the header line

    The names are joined with ", "; an empty list is printed as 'None'.
    '''

    header = f"\n{feature_type} Features: "
    listing = 'None' if not features else ', '.join(features)

    print(header)
    print(listing)


def describe_df(df):
    """
    Print a basic overview of a DataFrame and publish its feature lists.

    The columns are split into three groups that are stored in the
    module-level globals ``categorical_features``, ``binary_features`` and
    ``continuous_features``:

    * categorical -- columns whose dtype is ``object``
    * binary      -- non-object columns with at most two distinct values
    * continuous  -- every other non-object column

    The function then prints the shape, the per-column missing counts, the
    number of duplicated rows and the dtypes, lists each feature group and
    displays the first three rows.

    Parameters:
    df = The pandas DataFrame to describe
    """

    global categorical_features, continuous_features, binary_features

    categorical_features, binary_features, continuous_features = [], [], []
    for col in df.columns:
        # Test the dtype first so that nunique(), which has to look at every
        # value, is never run on the (potentially very long) text columns.
        if df[col].dtype == 'object':
            categorical_features.append(col)
        elif df[col].nunique() <= 2:
            binary_features.append(col)
        else:
            continuous_features.append(col)

    print(f"\n{type(df).__name__} shape: {df.shape}")
    print(f'Missing Data: \n{df.isnull().sum()}')
    print(f'Duplicates: {df.duplicated().sum()}')
    print(f'Data Types: \n{df.dtypes}')

    display_feature_list(categorical_features, 'Categorical')
    display_feature_list(continuous_features, 'Continuous')
    display_feature_list(binary_features, 'Binary')

    display(df.head(3))

In [13]:
def histogram_boxplot(df, hist_color, box_color, height, width, legend, name):
    '''
    Plot, for every numeric column of df, a Box Plot next to a density curve.

    One figure is built and shown per numeric column: the box plot goes in the
    left subplot and a Gaussian kernel density estimate of the same values in
    the right subplot (the one titled "Histogram").

    Parameters:
    df = DataFrame whose numeric columns are plotted
    hist_color = The color of the density curve
    box_color = The color of the boxplots
    height and width = Image size
    legend = Either to display legend or not
    name = Label placed in front of "Word Count" in the figure title

    If a column cannot be plotted, the error is printed and the loop moves on
    to the next column.
    '''

    features = df.select_dtypes(include=[np.number]).columns.tolist()

    for feat in features:
        try:
            # Estimate the density on the observed values only: missing values
            # would otherwise propagate into the kernel density estimate.
            values = df[feat].dropna()
            density = gaussian_kde(values)
            # Vectorized min/max instead of iterating the Series in Python
            x_vals = np.linspace(values.min(), values.max(), 200)
            density_vals = density(x_vals)

            fig = make_subplots(
                rows=1,
                cols=2,
                subplot_titles=["Box Plot", "Histogram"],
                horizontal_spacing=0.2
            )

            # Right panel: filled density curve
            fig.add_trace(go.Scatter(x=x_vals, y=density_vals, mode='lines',
                                     fill='tozeroy', name="Density", line_color=hist_color), row=1, col=2)
            # Left panel: box plot (boxmean=True also draws the mean)
            fig.add_trace(go.Box(
                y=df[feat], name="Box Plot", boxmean=True, line_color=box_color), row=1, col=1)

            fig.update_layout(title={'text': f'<b>{name} Word Count<br><sup><i>&nbsp;&nbsp;&nbsp;&nbsp;{feat}</i></sup></b>',
                                     'x': .025, 'xanchor': 'left'},
                              margin=dict(t=100),
                              showlegend=legend,
                              template=template,
                              height=height, width=width
                              )

            fig.update_yaxes(title_text="<b>Words</b>",
                             row=1, col=1, showgrid=False)
            fig.update_xaxes(title_text="", row=1, col=1, showgrid=False)

            fig.update_yaxes(title_text="<b>Frequency</b>",
                             row=1, col=2, showgrid=False)
            fig.update_xaxes(title_text="<b>Words</b>",
                             row=1, col=2, showgrid=False)

            fig.show()

        except Exception as e:
            # Best effort: report the problem and continue with the next column
            print(f"An error occurred: {e}")

In [14]:
def plot_correlation(df, title, subtitle, height, width, font_size):
    '''
    This function is responsible for plotting a correlation heatmap among the
    numeric features of the dataset. Only the cells below the diagonal are
    drawn, each annotated with its correlation value.

    Parameters:
    df = DataFrame whose numeric columns are correlated
    title = Text placed in front of "Heatmap" in the figure title
    subtitle = Text shown in italics under the title
    height = Define height
    width = Define width
    font_size = Define the font size for the annotations
    '''
    # Pairwise correlation of the numeric columns, rounded to two decimals
    corr = np.round(df.corr(numeric_only=True), 2)
    # True on the diagonal and above it
    mask = np.triu(np.ones_like(corr, dtype=bool))
    # Keep the values strictly below the diagonal; everything else becomes the
    # sentinel 100, a value no correlation coefficient can take
    c_mask = np.where(~mask, corr, 100)

    # Skip row 0 (it holds only sentinels) and strip the sentinels from the
    # remaining rows, leaving rows of increasing length (1, 2, ...)
    c = []
    for i in c_mask.tolist()[1:]:
        c.append([x for x in i if x != 100])

    # Rows are passed in reverse order, so the y labels are reversed as well.
    # x omits the last label and y the first one: those would only contain
    # masked cells.
    fig = ff.create_annotated_heatmap(z=c[::-1],
                                      x=corr.index.tolist()[:-1],
                                      y=corr.columns.tolist()[1:][::-1],
                                      colorscale=colormap)

    fig.update_layout(title={'text': f"<b>{title} Heatmap<br><sup>&nbsp;&nbsp;&nbsp;&nbsp;<i>{subtitle}</i></sup></b>",
                             'x': .025, 'xanchor': 'left', 'y': .95},
                      margin=dict(t=210, l=110),
                      yaxis=dict(autorange='reversed', showgrid=False),
                      xaxis=dict(showgrid=False),
                      template=template,
                      # plot_bgcolor=bg_color,paper_bgcolor=paper_color,
                      height=height, width=width)

    # Second heatmap trace over the same values with showscale=True
    # (presumably added to obtain a color bar -- confirm). It is created
    # hidden and switched on right after being added.
    fig.add_trace(go.Heatmap(z=c[::-1],
                             colorscale=colormap,
                             showscale=True,
                             visible=False))
    fig.data[1].visible = True

    # Apply the requested font size to every cell annotation
    for i in range(len(fig.layout.annotations)):
        fig.layout.annotations[i].font.size = font_size

    fig.show()

In [15]:
def compute_tfidf(df_column, ngram_range=(1,1), max_features=15):
    """
    Build a dense TF-IDF table for a column of texts.

    Parameters:
    df_column = pandas Series of documents; missing entries are treated as ''
    ngram_range = (min_n, max_n) n-gram sizes passed to TfidfVectorizer
    max_features = Upper bound on the number of terms kept by the vectorizer

    Returns a DataFrame with one row per document and one column per term,
    with English stop words removed.
    """
    documents = df_column.fillna('')
    tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=ngram_range,
        max_features=max_features,
    )
    weights = tfidf.fit_transform(documents)
    terms = tfidf.get_feature_names_out()
    return pd.DataFrame(weights.toarray(), columns=terms)

In [16]:
# Loading the BillSum train / test splits (parquet files read through the hf:// filesystem)
splits = dict(
    train='data/train-00000-of-00001.parquet',
    test='data/test-00000-of-00001.parquet',
)

train = pd.read_parquet(f"hf://datasets/FiscalNote/billsum/{splits['train']}")
test = pd.read_parquet(f"hf://datasets/FiscalNote/billsum/{splits['test']}")

In [25]:
# Extracting info on the training Dataframe.
# Besides printing the overview, describe_df() sets the globals
# categorical_features / continuous_features / binary_features.
describe_df(train)


DataFrame shape: (18949, 3)
Missing Data: 
text       0
summary    0
title      0
dtype: int64
Duplicates: 0
Data Types: 
text       object
summary    object
title      object
dtype: object

Categorical Features: 
text, summary, title

Continuous Features: 
None

Binary Features: 
None


Unnamed: 0,text,summary,title
0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,Shields a business entity from civil liability...,A bill to limit the civil liability of busines...
1,SECTION 1. SHORT TITLE.\n\n This Act may be...,Human Rights Information Act - Requires certai...,Human Rights Information Act
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,Jackie Robinson Commemorative Coin Act - Direc...,Jackie Robinson Commemorative Coin Act


In [26]:
# Sanity check: list the rows whose 'text' is missing
mask = train['text'].isna()
filtered_train = train.loc[mask]
filtered_train

Unnamed: 0,text,summary,title


In [28]:
# Word count of every column listed in categorical_features (set by describe_df)
df_text_lenght = pd.DataFrame({
    feat: train[feat].apply(lambda doc: len(str(doc).split()))
    for feat in categorical_features
})

# Box plot + density curve of the word counts, one figure per column
histogram_boxplot(df_text_lenght, '#89c2e0', '#d500ff', 600, 1000, True, 'Train Dataset')

In [30]:
def vectorize_and_plot(data, ngram_range, title, subtitle):
    """
    Plot the correlation heatmap of the top TF-IDF terms of a text column.

    Parameters:
    data = pandas Series of documents
    ngram_range = (min_n, max_n) n-gram sizes used by the vectorizer
    title = Text placed in front of "Heatmap" in the figure title
    subtitle = Text shown under the title
    """
    # Reuse compute_tfidf instead of repeating the vectorizer set-up (top 15 terms)
    df_tfidfvect = compute_tfidf(data, ngram_range=ngram_range, max_features=15)
    plot_correlation(df_tfidfvect, title, subtitle, 800, 800, 12)

In [33]:
# Correlation among the top unigram TF-IDF terms of the training bill texts
vectorize_and_plot(train['text'], (1, 1), 'Unigrams', 'Train - Text')

In [32]:
# Correlation among the top bigram TF-IDF terms of the training bill texts
vectorize_and_plot(train['text'], (2, 2), 'Bigrams', 'Train - Text')

In [34]:
# Correlation among the top bigram TF-IDF terms of the training summaries
vectorize_and_plot(train['summary'], (2, 2), 'Bigrams', 'Train - Summary')

In [35]:
# Keep the rows whose summary mentions 'revenue code' (case-insensitive; missing summaries count as no match)
filtered_train = train.loc[
    train['summary'].str.contains('revenue code', case=False, na=False)
]
filtered_train.head(2)

Unnamed: 0,text,summary,title
3,SECTION 1. NONRECOGNITION OF GAIN WHERE ROLLOV...,Amends the Internal Revenue Code to provide (t...,To amend the Internal Revenue Code to provide ...
11,SECTION 1. SHORT TITLE.\n\n This Act may be...,Public Safety and Protection Investment Act of...,To amend the Internal Revenue Code of 1986 to ...


In [37]:
# Trigrams of the summaries: the data and n-gram range now match the plot labels
vectorize_and_plot(train['summary'], (3, 3), 'Trigrams', 'Train - Summary')

In [39]:
# Word count of every column listed in categorical_features, for the test split
df_text_lenght = pd.DataFrame()

for feat in categorical_features:
    df_text_lenght[feat] = test[feat].apply(lambda x: len(str(x).split()))

# The frame plotted here is the `test` split, so the figure title names it as such
histogram_boxplot(df_text_lenght,'#89c2e0', '#d500ff', 600, 1000, True, 'Test Dataset')

In [52]:
# Print the raw text of the first training bill to inspect its layout
print(train['text'].iloc[0])

SECTION 1. LIABILITY OF BUSINESS ENTITIES PROVIDING USE OF FACILITIES 
              TO NONPROFIT ORGANIZATIONS.

    (a) Definitions.--In this section:
            (1) Business entity.--The term ``business entity'' means a 
        firm, corporation, association, partnership, consortium, joint 
        venture, or other form of enterprise.
            (2) Facility.--The term ``facility'' means any real 
        property, including any building, improvement, or appurtenance.
            (3) Gross negligence.--The term ``gross negligence'' means 
        voluntary and conscious conduct by a person with knowledge (at 
        the time of the conduct) that the conduct is likely to be 
        harmful to the health or well-being of another person.
            (4) Intentional misconduct.--The term ``intentional 
        misconduct'' means conduct by a person with knowledge (at the 
        time of the conduct) that the conduct is harmful to the health 
        or well-being of another perso

In [55]:
from html import escape
from IPython.display import HTML

# The summary is plain text coming from an external dataset: escape it so that
# characters such as '<' or '&' are shown literally instead of being
# interpreted as markup by the HTML renderer.
html_content = escape(train['summary'].iloc[0])
display(HTML(html_content))

### Count Tokens

In [60]:
import random
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data: it is shipped as 'punkt' for
# older NLTK releases and as 'punkt_tab' for newer ones, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

def count_tokens(text):
    """Return the number of tokens word_tokenize produces for text."""
    return len(word_tokenize(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [64]:
# Token count of every 1000th training summary (rows 0, 1000, ..., 9000)
for row in range(0, 10_000, 1_000):
    n_tokens = count_tokens(train['summary'].iloc[row])
    print(f"Index: {row} | Number of tokens in summary: {n_tokens}")

Index: 0 | Number of tokens in summary: 290
Index: 1000 | Number of tokens in summary: 236
Index: 2000 | Number of tokens in summary: 103
Index: 3000 | Number of tokens in summary: 95
Index: 4000 | Number of tokens in summary: 269
Index: 5000 | Number of tokens in summary: 36
Index: 6000 | Number of tokens in summary: 310
Index: 7000 | Number of tokens in summary: 65
Index: 8000 | Number of tokens in summary: 343
Index: 9000 | Number of tokens in summary: 115
