In [73]:
import logging
import os
import numpy as np
import pickle
from tqdm import tqdm
import pandas as pd
import time
# import re

import spacy
import fitz  # PyMuPDF
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import entropy

import warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
from joblib import dump, load

from agents import getRandomAgent
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import TimeoutException


In [2]:
logging.basicConfig(filename="document_processing_errors.log", level=logging.INFO)


def load_from_pickle(pickle_path):
    with open(pickle_path, "rb") as f:
        df = pickle.load(f)
    return df



# Preprocessing

In [3]:
def pdf_to_text(path):
    try:
        doc = fitz.open(path)
        text = ""
        for page in doc:
            try:
                text += page.get_text()
            except Exception as page_error:
                print(f"Error extracting text from page in {path}: {page_error}")
                continue
                # Optionally, continue to the next page or log the error
        return text
    except Exception as e:
        logging.info(f"Error processing file {path}: {e}")
        return ""


In [4]:
tqdm.pandas()  # Enables progress_apply for pandas


In [5]:
def update_pickle(pickle_df, dataset_path, pickle_path, data_path):
    df = pd.read_csv(dataset_path, header=0)
    df["fname"] = data_path + df["fname"]

    # Filter for new entries not already in pickle_df
    new_entries = df[~df["fname"].isin(pickle_df["fname"])]

    # Process new entries
    if not new_entries.empty:
        new_entries["cleaned_text"] = new_entries["fname"].progress_apply(pdf_to_text)
        new_entries["tokenized_text"] = new_entries["cleaned_text"].progress_apply(
            clean_and_tokenize
        )

        # Append new entries to the original pickle_df
        updated_pickle_df = pd.concat([pickle_df, new_entries], ignore_index=True)
    else:
        updated_pickle_df = pickle_df  # No new entries to add

    # Save updated DataFrame
    updated_pickle_df.to_pickle(pickle_path)

    return updated_pickle_df


In [6]:
nlp = spacy.load("en_core_web_sm")  # Or a larger model as needed


def clean_and_tokenize(text, chunk_size=1000000):
    """
    Tokenizes the text using SpaCy, handling long texts by processing in chunks.

    :param text: The text to be tokenized.
    :param chunk_size: Maximum chunk size in characters.
    :return: A string of the lemmatized tokens.
    """
    # Check if the text length exceeds the chunk size
    if len(text) > chunk_size:
        # Initialize an empty list to store tokens from all chunks
        tokens_all_chunks = []

        # Process the text in chunks
        for start in range(0, len(text), chunk_size):
            end = start + chunk_size
            # Extract a chunk of text
            chunk = text[start:end]
            # Process the chunk
            doc = nlp(chunk)
            # Extract tokens, lemmatize, and filter as before
            tokens = [
                token.lemma_ for token in doc if token.is_alpha and not token.is_stop
            ]
            tokens_all_chunks.extend(tokens)

        # Combine tokens from all chunks and return
        return " ".join(tokens_all_chunks)
    else:
        # If text does not exceed the chunk size, process as before
        doc = nlp(text)
        tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        return " ".join(tokens)


### Alternate batch processing

In [7]:
def clean_and_tokenize_chunk(chunk):
    """
    Tokenizes a single chunk of text.
    """
    doc = nlp(chunk)
    return [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]


def batch_tokenize_texts(texts, batch_size=1000, chunk_size=1000000):
    """
    Tokenize a list of texts in batches, handling long texts by processing in chunks.

    :param texts: The list of texts to be tokenized.
    :param batch_size: Number of texts to process in a single batch.
    :param chunk_size: Maximum chunk size in characters for each text.
    :return: A list of lists, where each sublist contains the tokens of a text.
    """
    processed_texts = []
    for text in texts:
        # If the text is longer than chunk_size, split it into chunks
        if len(text) > chunk_size:
            tokens_all_chunks = []
            for start in range(0, len(text), chunk_size):
                end = start + chunk_size
                chunk = text[start:end]
                # Tokenize the chunk and extend the list of tokens
                tokens_all_chunks.extend(clean_and_tokenize_chunk(chunk))
            processed_texts.append(tokens_all_chunks)
        else:
            # For texts that don't exceed the chunk size, process as usual
            tokens = clean_and_tokenize_chunk(text)
            processed_texts.append(tokens)

    return processed_texts


# Implementation of preprocessing

In [8]:

dataset_path = "/uw/invest-data/classify_presentations/data/dataset.csv"
DATA_PATH = "/dave/presentations/"

dfpickle_path = "/dave/data/df.pkl"
force = False
if not os.path.exists(dfpickle_path) or force:
    print("Warning, is dave mounted?")
else:
    print("Going to load df from pickle file")


Going to load df from pickle file


In [9]:
if os.path.exists(dfpickle_path) and not force:
    df = pd.read_pickle(dfpickle_path)
else:
    df = pd.read_csv(dataset_path, header=0)
    df["fname"] = DATA_PATH + df["fname"]

    tqdm.pandas(desc="Processing documents")
    df["cleaned_text"] = df["fname"].progress_apply(pdf_to_text)

    df["tokenized_text"] = df["cleaned_text"].progress_apply(clean_and_tokenize)

# Only necessary when new data is added to the dataset
# update_pickle(df, dataset_path, dfpickle_path, DATA_PATH)



# Extract Features

In [87]:
def check_aspect_ratio_and_mix_feature(pdf_path):
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logging.info(f"Error opening PDF {pdf_path}: {e}")
        raise ValueError("Error opening PDF")
        return (
            [],
            0,
            False,
        )  # Returning False as the third value for no significant change

    aspect_ratios = []

    page_count = len(doc)

    for page in doc:
        rect = page.rect
        aspect_ratio = rect.width / rect.height
        aspect_ratios.append(aspect_ratio)

    # Detect significant changes in aspect ratio
    change_frequency = calculate_change_frequency(aspect_ratios)

    persistent_changes_raw, persistent_changes_frequency = detect_persistent_changes(
        aspect_ratios, change_frequency
    )
    num_changes, changes_significance = detect_significant_change(aspect_ratios)
    stats = extract_aspect_ratio_features(aspect_ratios)
    categories = categorize_aspect_ratios(aspect_ratios)

    return (
        aspect_ratios,
        page_count,
        persistent_changes_raw,
        persistent_changes_frequency,
        num_changes,
        changes_significance,
        stats,
        categories,
    )


def calculate_change_frequency(aspect_ratios, threshold=0.1):
    """
    Calculate the percentage of pages exceeding a given aspect ratio change threshold.

    :param aspect_ratios: List of aspect ratios for the document's pages.
    :param threshold: Threshold for considering a change in aspect ratio significant.
    :return: Percentage of page transitions that exceed the change threshold.
    """
    if len(aspect_ratios) < 2:
        # If there's only one page or none, there can't be any transitions
        return 0.0

    # Calculate the absolute percentage change in aspect ratio between consecutive pages
    changes = [
        abs(aspect_ratios[i] - aspect_ratios[i - 1]) / aspect_ratios[i - 1]
        for i in range(1, len(aspect_ratios))
    ]

    # Count how many changes exceed the threshold
    significant_changes = sum(change > threshold for change in changes)

    # Calculate the percentage of transitions that are significant
    change_frequency = (significant_changes / (len(aspect_ratios) - 1)) * 100

    return change_frequency


def detect_significant_change(aspect_ratios, change_threshold=0.1):
    """
    Detects the number of significant aspect ratio changes and assesses their significance.

    :param aspect_ratios: List of aspect ratios for each page in the document.
    :param change_threshold: Threshold for considering a change in aspect ratio significant, based on change_frequency.
    :return: A tuple containing the number of significant changes and an aggregate measure of their significance.
    """
    if len(aspect_ratios) < 2:
        # No significant changes can be detected in a single-page document
        return 0, 0.0

    # Calculate percentage changes between consecutive aspect ratios
    changes = np.abs(np.diff(aspect_ratios) / aspect_ratios[:-1])

    # Determine significant changes based on the threshold
    significant_changes_indices = np.where(changes > change_threshold)[0]
    num_significant_changes = len(significant_changes_indices)

    # Calculate the significance of changes as the sum of changes that exceed the threshold, normalized by the number of changes
    if num_significant_changes > 0:
        significance_of_changes = (
            np.sum(changes[significant_changes_indices]) / num_significant_changes
        )
    else:
        significance_of_changes = 0.0

    return num_significant_changes, significance_of_changes


def detect_persistent_changes(aspect_ratios, change_threshold=0.1):
    """
    Detects persistent changes in aspect ratios and adjusts the count based on change frequency.

    :param aspect_ratios: List of aspect ratios for each page in the document.
    :param change_threshold: Threshold for considering a change in aspect ratio significant.
    :return: A tuple containing the raw count of persistent changes and the count adjusted by change frequency.
    """
    if len(aspect_ratios) < 2:
        # If there's only one page or none, there can't be any transitions
        return 0, 0.0

    # Calculate the absolute percentage change in aspect ratio between consecutive pages
    changes = [
        abs(aspect_ratios[i] - aspect_ratios[i - 1]) / aspect_ratios[i - 1]
        for i in range(1, len(aspect_ratios))
    ]

    # Count the number of persistent changes
    persistent_changes_raw = 0
    current_persistence = 0

    for change in changes:
        if change > change_threshold:
            current_persistence += 1
        else:
            if (
                current_persistence > 1
            ):  # Assuming persistence means more than one consecutive change
                persistent_changes_raw += 1
            current_persistence = 0

    # Check if the last sequence of pages ends with persistent changes
    if current_persistence > 1:
        persistent_changes_raw += 1

    # Calculate change_frequency to adjust the persistent_changes_raw
    change_frequency = calculate_change_frequency(aspect_ratios, change_threshold)

    # Adjusting the raw count by change frequency to get a frequency-informed measure
    # This example simply scales the raw count by the change_frequency percentage; other methods could also be applied
    persistent_changes_frequency = persistent_changes_raw * (change_frequency / 100)

    return persistent_changes_raw, persistent_changes_frequency


def extract_aspect_ratio_features(aspect_ratios):
    """
    Extracts statistical features from a list of aspect ratios.

    :param aspect_ratios: List of aspect ratios for the document's pages.
    :return: Dictionary of statistical features.
    """
    if not aspect_ratios:  # Check if the list is empty
        return {"mean": 0, "std": 0, "min": 0, "max": 0}

    aspect_ratios_array = np.array(aspect_ratios)
    return {
        "mean": np.mean(aspect_ratios_array),
        "std": np.std(aspect_ratios_array),
        "min": np.min(aspect_ratios_array),
        "max": np.max(aspect_ratios_array),
    }


def categorize_aspect_ratios(aspect_ratios):
    """
    Categorizes aspect ratios as portrait, landscape, or square.

    :param aspect_ratios: List of aspect ratios for the document's pages.
    :return: List of categories corresponding to each aspect ratio.
    """
    categories = []
    for ar in aspect_ratios:
        if ar < 0.95:
            categories.append("portrait")
        elif ar > 1.05:
            categories.append("landscape")
        else:
            categories.append("square")
    return categories


def calculate_clustering_features(aspect_ratios, n_clusters=3):
    """
    Performs KMeans clustering on aspect ratios and calculates clustering-based features.

    :param aspect_ratios: List of aspect ratios for the document's pages.
    :param n_clusters: Number of clusters to form, default is 3.
    :return: A dictionary with the calculated features: CDC, CTI, CDS, and MCP.
    """
    # Ensure aspect_ratios is a 2D array for KMeans
    aspect_ratios = np.array(aspect_ratios).reshape(-1, 1)

    # Perform KMeans clustering
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ConvergenceWarning)
        
        # Perform KMeans clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(aspect_ratios)
    
    labels = kmeans.labels_

    # Calculate Cluster Diversity Count (CDC)
    unique_clusters = len(set(labels))

    # Calculate Cluster Transition Indicator (CTI)
    transitions = sum(1 for i in range(1, len(labels)) if labels[i] != labels[i - 1])

    # Calculate Cluster Distribution Spread (CDS) using entropy
    cluster_counts = np.bincount(labels, minlength=n_clusters)
    proportions = cluster_counts / np.sum(cluster_counts)
    spread = entropy(proportions)

    # Calculate Majority Cluster Proportion (MCP)
    majority_cluster_proportion = max(cluster_counts) / np.sum(cluster_counts)

    # Return the calculated features
    return {
        "CDC": unique_clusters,
        "CTI": transitions,
        "CDS": spread,
        "MCP": majority_cluster_proportion,
    }


def detect_outliers_z_score(aspect_ratios, threshold=2):
    aspect_ratios = np.array(aspect_ratios).flatten()  # Ensures aspect_ratios is 1D
    mean_ar = np.mean(aspect_ratios)
    std_ar = np.std(aspect_ratios)
    outliers = [
        i
        for i, ar in enumerate(aspect_ratios)
        if abs((ar - mean_ar) / std_ar) > threshold
    ]

    return outliers


np.seterr(divide="ignore", invalid="ignore")


def calculate_text_density(pdf_path):
    """
    Calculates text density (words per page) for a PDF document.

    :param pdf_path: Path to the PDF document.
    :return: List of text densities for each page.
    """
    doc = fitz.open(pdf_path)
    text_densities = []
    for page in doc:
        text = page.get_text("text")
        word_count = len(text.split())
        area = page.rect.width * page.rect.height
        text_density = word_count / area if area else 0
        text_densities.append(text_density)
    return text_densities


def correlate_text_density_aspect_ratio(aspect_ratios, text_densities):
    """
    Calculates the Pearson correlation coefficient between aspect ratios and text densities of a document's pages.

    :param aspect_ratios: List of aspect ratios for each page.
    :param text_densities: List of text densities for each page.
    :return: Pearson correlation coefficient, or NaN if the calculation is not possible.
    """
    if len(aspect_ratios) != len(text_densities) or len(aspect_ratios) < 2:
        return np.nan  # Ensures there are enough data points and both lists match

    return np.corrcoef(aspect_ratios, text_densities)[0, 1]


def calculate_text_density_variability(text_densities):
    return np.std(text_densities)


def calculate_text_density_by_position(text_densities):
    """
    Calculates the average text density for the beginning, middle, and end sections of a document.

    :param text_densities: List of text densities for each page.
    :return: A tuple with average text densities for the beginning, middle, and end of the document.
    """
    third = len(text_densities) // 3
    if third == 0:
        return (0, 0, 0)  # Avoid division by zero for very short documents

    beginning = np.mean(text_densities[:third])
    middle = np.mean(text_densities[third : 2 * third])
    end = np.mean(text_densities[2 * third :])

    return beginning, middle, end


# Function to check for keyword presence
def check_keywords(text, keyword_list):
    text = text.lower()
    return int(any(keyword in text for keyword in keyword_list))


def combine_tfidf_keyword(df):
    # Step 2: TF-IDF Calculation
    vectorizer = TfidfVectorizer(
        max_features=5000
    )  # Adjust number of features as needed
    tfidf_matrix = vectorizer.fit_transform(df["tokenized_text"])

    # Step 3. Combine keyword and tfidf features into a single matrix
    # Convert binary keyword matches to a matrix
    keyword_features = df[[col for col in df.columns if "_keyword" in col]].to_numpy()
    # Combine TF-IDF features with keyword binary indicators
    combined_features = np.hstack((tfidf_matrix.toarray(), keyword_features))

    # Now `combined_features` is ready for model training, and should be aligned with your labels.
    return combined_features


In [88]:
def combine_tfidf_keyword_additional_features(df, vectorizer=None):
    # Step 2: TF-IDF Calculation
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=5000)    
        tfidf_matrix = vectorizer.fit_transform(df["tokenized_text"])
    else:
        tfidf_matrix = vectorizer.transform(df["tokenized_text"]) 
    
    # Convert binary keyword matches to a matrix
    keyword_features = df[[col for col in df.columns if "_keyword" in col]].to_numpy()
    
    # Assuming new features are already in df and are numeric
    additional_features_columns = [
        "aspect_ratio_means", "aspect_ratio_std", "aspect_ratio_min", "aspect_ratio_max",
        "page_counts", "persistent_changes_raw", "persistent_changes_frequency", "num_changes",
        "changes_significance", "text_density_means", "text_density_correlations", "text_density_variability",
        "text_density_beginning", "text_density_middle", "text_density_end", "outliers_counts",
        "unique_cluster_lists", "cluster_transitions_lists", "cluster_spreads_lists",
        "majority_cluster_proportions", "portrait_count", "landscape_count", "square_count"
    ]
    additional_features = df[additional_features_columns].to_numpy()
    
    # Combine TF-IDF features with keyword binary indicators and the additional features
    combined_features = np.hstack((tfidf_matrix.toarray(), keyword_features, additional_features))
    
    return combined_features, vectorizer



def append_data_or_nan(a_list, data):
    try:
        a_list.append(data)
    except Exception as e:
        logging.info(e)
        a_list.append(np.nan)


def extract_specific_features(df):
    # Initialize empty lists to store your new features
    aspect_ratio_means = []
    aspect_ratio_std = []
    aspect_ratio_min = []
    aspect_ratio_max = []
    page_counts = []
    persistent_changes_raw_list = []
    persistent_changes_frequency_list = []
    num_changes_list = []
    changes_significance_list = []
    text_density_means = []
    text_density_correlations = []
    text_density_variability_lists = []
    text_density_beginning_lists = []
    text_density_middle_lists = []
    text_density_end_lists = []
    categories_lists = []
    portrait_counts = []
    landscape_counts = []
    square_counts = []
    outliers_counts = []
    unique_cluster_lists = []
    cluster_transitions_lists = []
    cluster_spreads_lists = []
    majority_cluster_proportions = []

    for pdf_path in tqdm(df["fname"], desc="Processing PDFs"):
        try:
            logging.info(f"Processing aspect stuff for {pdf_path}")
            # Run your feature extraction functions
            (
                aspect_ratios,
                page_count,
                persistent_changes_raw,
                persistent_changes_frequency,
                num_changes,
                changes_significance,
                stats,
                categories,
            ) = check_aspect_ratio_and_mix_feature(pdf_path)

            text_densities = calculate_text_density(pdf_path)
            correlation = correlate_text_density_aspect_ratio(
                aspect_ratios, text_densities
            )
            text_density_variability = calculate_text_density_variability(
                text_densities
            )
            (text_density_beginning, text_density_middle, text_density_end) = (
                calculate_text_density_by_position(text_densities)
            )

            outliers = detect_outliers_z_score(aspect_ratios)
            clusters = calculate_clustering_features(aspect_ratios)

        except Exception as e:
            logging.error(f"Error processing {pdf_path}: {e}")

            # For simplicity, let's just use some of the features as examples
        append_data_or_nan(aspect_ratio_means, stats["mean"])
        append_data_or_nan(aspect_ratio_std, stats["std"])
        append_data_or_nan(aspect_ratio_min, stats["min"])
        append_data_or_nan(aspect_ratio_max, stats["max"])
        append_data_or_nan(page_counts, page_count)
        append_data_or_nan(persistent_changes_raw_list, persistent_changes_raw)
        append_data_or_nan(
            persistent_changes_frequency_list, persistent_changes_frequency
        )
        append_data_or_nan(num_changes_list, num_changes)
        append_data_or_nan(changes_significance_list, changes_significance)

        append_data_or_nan(text_density_means, np.mean(text_densities))
        append_data_or_nan(text_density_correlations, correlation)
        append_data_or_nan(text_density_variability_lists, text_density_variability)
        append_data_or_nan(text_density_beginning_lists, text_density_beginning)
        append_data_or_nan(text_density_middle_lists, text_density_middle)
        append_data_or_nan(text_density_end_lists, text_density_end)

        append_data_or_nan(
            categories_lists, categories
        )  # This one is a bit tricky as it's a list. Might aggregate or process further.

        append_data_or_nan(
            outliers_counts, len(outliers)
        )  # Assuming aspect ratios are recalculated within the function
        append_data_or_nan(unique_cluster_lists, clusters["CDC"])
        append_data_or_nan(cluster_transitions_lists, clusters["CTI"])
        append_data_or_nan(cluster_spreads_lists, clusters["CDS"])
        append_data_or_nan(majority_cluster_proportions, clusters["MCP"])

    portrait_counts = [
        cats.count("portrait") if isinstance(cats, list) else 0
        for cats in categories_lists
    ]
    landscape_counts = [
        cats.count("landscape") if isinstance(cats, list) else 0
        for cats in categories_lists
    ]
    square_counts = [
        cats.count("square") if isinstance(cats, list) else 0
        for cats in categories_lists
    ]

    # Now add these lists as columns to your DataFrame
    df["aspect_ratio_means"] = aspect_ratio_means
    df["aspect_ratio_std"] = aspect_ratio_std
    df["aspect_ratio_min"] = aspect_ratio_min
    df["aspect_ratio_max"] = aspect_ratio_max
    df["page_counts"] = page_counts
    df["persistent_changes_raw"] = persistent_changes_raw_list
    df["persistent_changes_frequency"] = persistent_changes_frequency_list
    df["num_changes"] = num_changes_list
    df["changes_significance"] = changes_significance_list
    df["text_density_means"] = text_density_means
    df["text_density_correlations"] = text_density_correlations
    df["text_density_variability"] = text_density_variability_lists
    df["text_density_beginning"] = text_density_beginning_lists
    df["text_density_middle"] = text_density_middle_lists
    df["text_density_end"] = text_density_end_lists
    df["outliers_counts"] = outliers_counts
    df["unique_cluster_lists"] = unique_cluster_lists
    df["cluster_transitions_lists"] = cluster_transitions_lists
    df["cluster_spreads_lists"] = cluster_spreads_lists
    df["majority_cluster_proportions"] = majority_cluster_proportions
    df["portrait_count"] = portrait_counts
    df["landscape_count"] = landscape_counts
    df["square_count"] = square_counts

    return df


keywords = {
    "financial_terms": [
        "financial",
        "investment",
        "share price",
        "financial metrics",
        "investment strategy",
    ],
    "legal_statements": [
        "confidentiality statement",
        "legal disclaimer",
        "disclosure statement",
        "proprietary information",
        "intellectual property",
    ],
    "company_info": [
        "company overview",
        "company analysis",
        "business model",
        "company performance",
    ],
    "presentation_content": [
        "visual aids",
        "data charts",
        "case studies",
        "comparative analysis",
    ],
    "company_targets": ["sales targets", "company targets", "performance targets"],
    "financial_discussions": [
        "financial figures",
        "financial projections",
        "financial results",
        "financial language",
    ],
    "regulatory_references": [
        "SEC filings",
        "regulatory filings",
        "external entities",
        "lawsuits",
    ],
    "detail_descriptions": [
        "loan details",
        "product details",
        "research and development",
        "financial details",
    ],
    "company_specific": [
        "company specific",
        "industry specific",
        "company-specific analysis",
        "specific company focus",
    ],
    # "Other Clusters" category is omitted since it's broad and without specific keywords
}


def load_df_from_pickle(path):
    df = pd.read_pickle(path)
    return df


def load_np_array_from_pickle(path):
    np_array = np.load(path)
    return np_array

def load_vectorizer(path):
    vectorizer = pickle.load(open(path, 'rb'))
    return vectorizer

# Implementation of Feature Extraction

In [12]:
dfpickle_path = "/dave/data/df.pkl"
if not os.path.exists(dfpickle_path):
    print("Go back and do the preprocessing step above before running this cell for feature extraction")
else:
    print("loading PreProcessing df from disk")
    df = load_from_pickle(dfpickle_path)


loading PreProcessing df from disk


In [13]:
dff_pickle_path = "/dave/data/df_features.pkl"
features_path = "/dave/data/features_array.pkl.npy"
features_array_ppath = '/dave/data/features_array.pkl'
tdif_vectorizer_pickle_path = '/dave/data/tdif_vectorizer.pkl'

force = False
if os.path.exists(features_path) and os.path.exists(dff_pickle_path) and os.path.exists(tdif_vectorizer_pickle_path) and not force:
    print("loading dataframe with features,features numpy array, and tdif vectorizer from disk)")
    features = load_np_array_from_pickle(features_path)
    df = load_df_from_pickle(dff_pickle_path)
    tdif_vectorizer = load_vectorizer(tdif_vectorizer_pickle_path)
else:
    df = extract_specific_features(df)
    # Apply keyword matching
    for category, keyword_list in keywords.items():
        df[category + "_keyword"] = df["tokenized_text"].apply(
            check_keywords, args=(keyword_list,)
        )
    features, tdif_vectorizer = combine_tfidf_keyword_additional_features(df)
    df.to_pickle(dff_pickle_path)
    np.save(features_array_ppath, features)
    pickle.dump(tdif_vectorizer, open(tdif_vectorizer_pickle_path, 'wb'))

Processing PDFs:   1%|▏         | 40/3070 [00:04<03:35, 14.06it/s]MuPDF error: format error: No default Layer config
Processing PDFs:   4%|▍         | 126/3070 [00:10<09:32,  5.14it/s]MuPDF error: format error: No default Layer config
Processing PDFs:   4%|▍         | 138/3070 [00:11<03:32, 13.82it/s]MuPDF error: format error: No default Layer config
Processing PDFs:   8%|▊         | 231/3070 [00:15<02:53, 16.35it/s]MuPDF error: format error: No default Layer config
Processing PDFs:  10%|▉         | 299/3070 [00:20<11:45,  3.93it/s]MuPDF error: syntax error: syntax error in array
MuPDF error: syntax error: syntax error in content stream
MuPDF error: syntax error: syntax error in array
MuPDF error: syntax error: syntax error in content stream
MuPDF error: syntax error: syntax error in content stream
MuPDF error: syntax error: syntax error in array
MuPDF error: syntax error: syntax error in content stream
MuPDF error: syntax error: syntax error in content stream
Processing PDFs:  10%|▉  

# Train the model

In [89]:
def split_data(features, labels, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


def train_model_random_forest(X_train, y_train, X_test, y_test):
    # Initialize the RandomForest Classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the model
    print("Evaluating random forest model...")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    return rf_classifier


def train_model_HistGradientBoosting(X_train, y_train, X_test, y_test):
    """
    Trains a HistGradientBoostingClassifier model and evaluates it.

    :param X_train: Training feature matrix
    :param y_train: Training target array
    :param X_test: Test feature matrix
    :param y_test: Test target array
    """

    # Initialize the HistGradientBoostingClassifier
    hgb_classifier = HistGradientBoostingClassifier(random_state=42)

    # Train the model
    hgb_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = hgb_classifier.predict(X_test)

    # Evaluate the model
    print("Evaluating HistGradientBoostingClassifier model...")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    return hgb_classifier


def train_catboost(X_train, y_train, X_test, y_test):
    # Initialize the CatBoost Classifier
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        verbose=200,  # It prints the training log every 200 iterations
        random_state=42,
        eval_metric="Accuracy",  # You can change this to other metrics relevant to your task
    )

    # Train the model
    model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    print("Evaluating CatBoostClassifier model...")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    return model


def train_xgboost(X_train, y_train, X_test, y_test):
    
    # Adjust labels if they start from 1 instead of 0
    y_train = y_train - 1
    y_test = y_test - 1

    # Convert the datasets to DMatrix, which is a high-performance XGBoost data structure
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Set up the parameters for XGBoost
    params = {
        "max_depth": 6,
        "eta": 0.3,
        "objective": "multi:softmax",  # Use softmax for multi-class classification
        "num_class": 3,  # Specify the number of unique classes
        "eval_metric": "mlogloss",  # Multiclass logloss for evaluation
    }
    num_rounds = 100

    # Train the model
    eval_set = [(dtrain, "train"), (dtest, "test")]
    bst = xgb.train(
        params, dtrain, num_rounds, evals=eval_set, early_stopping_rounds=10
    )

    # Make predictions
    y_pred = bst.predict(dtest)

    # Evaluate the model
    print("Evaluating XGBoost model...")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    return bst


def store_model(model, path, type):
    if type == "forest" or type == "hgb":
        dump(model, path)
    if type == "catboost":
        model.save_model(path)
    elif type == "xgboost":
        model.save_model(path)
        
def load_model(path, type):
    if type == "forest" or type == "hgb":
        model = load(path)
    elif type == "catboost":
        model = CatBoostClassifier()
        model.load_model(path)
    elif type == "xgboost":
        model = xgb.Booster()
        model.load_model(path)
    return model
        


## Implement Model Training

In [16]:
dff_pickle_path = "/dave/data/df_features.pkl"
features_path = "/dave/data/features_array.pkl.npy"
features = load_np_array_from_pickle(features_path)
df = load_df_from_pickle(dff_pickle_path)

X_train, X_test, y_train, y_test = split_data(features, df["presentation"])
model1 = train_model_random_forest(X_train, y_train, X_test, y_test)
model2 = train_model_HistGradientBoosting(X_train, y_train, X_test, y_test)
model3 = train_catboost(X_train, y_train, X_test, y_test)
model4 = train_xgboost(X_train, y_train, X_test, y_test)


Evaluating random forest model...
Accuracy: 0.9478827361563518

Classification Report:
               precision    recall  f1-score   support

           1       0.95      0.90      0.92       135
           2       0.93      0.99      0.96       345
           3       0.99      0.88      0.93       134

    accuracy                           0.95       614
   macro avg       0.96      0.92      0.94       614
weighted avg       0.95      0.95      0.95       614

Evaluating HistGradientBoostingClassifier model...
Accuracy: 0.9560260586319218

Classification Report:
               precision    recall  f1-score   support

           1       0.95      0.87      0.91       135
           2       0.95      0.99      0.97       345
           3       0.98      0.96      0.97       134

    accuracy                           0.96       614
   macro avg       0.96      0.94      0.95       614
weighted avg       0.96      0.96      0.96       614

0:	learn: 0.8717427	test: 0.8648208	best: 0.8

In [62]:
models = [(model1, "/dave/data/model1", "forest"), 
          (model2, "/dave/data/model2", "hgb"), 
          (model3, "/dave/data/model3", "catboost"), 
          (model4, "/dave/data/model4",  "xgboost")]
for model, path, type in models:
    store_model(model, path, type)




# Process and predict a single document

In [90]:
def prepare_single_document(filepath, tfidf_vectorizer):
    # Step 1: Convert PDF document to text
    document_text = pdf_to_text(filepath)
    
    # Step 2: Clean and tokenize the text
    tokenized_text = clean_and_tokenize(document_text)
    
    # Create a DataFrame to hold the document text
    df = pd.DataFrame({'tokenized_text': [tokenized_text]})
    df['fname'] = filepath
    
    # Step 3: Extract specific features (if this applies directly to the text)
    df = extract_specific_features(df)
    
    # Step 4: Apply keyword matching
    for category, keyword_list in keywords.items():
        df[category + "_keyword"] = df["tokenized_text"].apply(
            lambda x: check_keywords(x, keyword_list)
        )
    
    # Step 5: Combine TF-IDF with keyword features and any additional features
    combined_features, _ = combine_tfidf_keyword_additional_features(df, tfidf_vectorizer)
    
    return combined_features


In [18]:
fn = '/dave/presentations/WBA-3Q-2023-Presentation.pdf'
os.path.exists(fn)


True

In [92]:
import time
# For a single prediction from each model
start = time.time()
doc_feature_array = prepare_single_document(fn, tdif_vectorizer)
doc_feature_array.shape
single_prediction_model1 = model1.predict(doc_feature_array)
print("Random Forest Prediction:", single_prediction_model1)
print("Time taken for single prediction:", time.time() - start)

Processing PDFs: 100%|██████████| 1/1 [00:00<00:00,  7.17it/s]

Random Forest Prediction: [1]
Time taken for single prediction: 3.1576430797576904





In [91]:

start = time.time()
single_prediction_model2 = model2.predict(doc_feature_array)
print("HistGradientBoosting Prediction:", single_prediction_model2)
print("Time taken:", time.time() - start)


HistGradientBoosting Prediction: [1]
Time taken: 0.005476951599121094


In [93]:
start = time.time()
single_prediction_model3 = model3.predict(doc_feature_array)
print("CatBoost Prediction:", single_prediction_model3)
print("Time taken:", time.time() - start)

CatBoost Prediction: [[1]]
Time taken: 0.024025678634643555


In [94]:
start = time.time()
dtest = xgb.DMatrix(doc_feature_array)

single_prediction_model4 = model4.predict(dtest)
print("XGBoost Prediction:", single_prediction_model4)

print("Time taken:", time.time() - start)


XGBoost Prediction: [0.]
Time taken: 0.03545713424682617


# Working on processing links in memory here

In [105]:
def handle_download(initial_files, download_dir, url):
    # Wait for the download to start and finish
    # Adjust the time as needed based on your expected download time
    # time.sleep(2)  # Initial sleep to wait for the download to start

    # Now wait for a new file to appear in the directory
    new_file = None
    timeout = 10  # Max time to wait for a download to finish
    start_time = time.time()

    while True:
        current_files = set(os.listdir(download_dir))
        new_files = current_files - initial_files
        if new_files:
            new_file = new_files.pop()
            break
        elif time.time() - start_time > timeout:
            print(f"Timeout waiting for download to complete for {link}")
            break
        else:
            time.sleep(1)  # Check every second for a new file

    if new_file:
        new_file =  os.path.join(download_dir, new_file)
        doc_feature_array = prepare_single_document(new_file, tdif_vectorizer)
                    
        single_prediction_model2 = model2.predict(doc_feature_array)
        # remove the file from the download directory
        os.remove(os.path.join(download_dir, new_file))
        return  single_prediction_model2
    else:
        logging.info(f"Failed to download {url}")
        return None

def download_file(url, download_path):
    # Set up Firefox profile to handle downloads automatically
    gecko_driver_path = "/snap/bin/geckodriver"

    # Set up Firefox options
    firefox_options = Options()
    firefox_options.add_argument("--headless")
    firefox_options.set_preference("general.useragent.override", getRandomAgent())

    # Create a Firefox Profile
    # firefox_profile = webdriver.FirefoxProfile()
    firefox_options.set_preference("browser.download.folderList", 2)
    firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
    firefox_options.set_preference("browser.download.dir", download_path)
    firefox_options.set_preference("browser.download.useDownloadDir", True)
    firefox_options.set_preference(
        "browser.helperApps.neverAsk.saveToDisk", "application/pdf"
    )
    firefox_options.enable_downloads = True

    firefox_options.set_preference(
        "pdfjs.disabled", True
    )  # Disable Firefox's built-in PDF viewer
    
    # Initialize the driver with Service
    service = Service(executable_path=gecko_driver_path)
    driver = webdriver.Firefox(service=service, options=firefox_options)
    driver.set_page_load_timeout(5)
    #
    # Navigate to URL and initiate download
    initial_files = set(os.listdir(download_path ))
    try:
        driver.get(url)
        handle_download(initial_files,  download_path, url)
        print(f"Downloading {url} to {download_path}???")
    except TimeoutException:
        prediction = handle_download(initial_files, download_path, url)

        driver.quit()
        return prediction
    except Exception as e:
        print(f"Failed to download {url}: {str(e)}")
        driver.quit()
        return ""

    # Close the driver
    driver.quit()


# Open dataset.csv as dataframe
def open_dataset(path):
    df = pd.read_csv(path)
    return df


In [107]:
download_path = "/home/mike/Downloads"
dataset_path = "/uw/invest-data/classify_presentations/data/dataset.csv"
df = open_dataset(dataset_path)
# iterate through the first 100 rows of the dataset
predictions = []
start = time.time()
for i in range(100):
    # print(df.iloc[i]["link"])
    result = download_file(df.iloc[i]["link"], download_path)
    predictions.append([df.iloc[i]['link'], result])
    print(f"{i} / 100 completed type {result}")
print(time.time() - start)


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 28.57it/s]


0 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 46.98it/s]


1 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 60.41it/s]


2 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 109.62it/s]


3 / 100 completed type [1]
Failed to download https://www.sec.gov/files/amac-kitces-presentation-070721.pdf: name 'link' is not defined
4 / 100 completed type 


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 16.35it/s]


5 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 17.88it/s]


6 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00,  5.02it/s]


7 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 53.62it/s]


8 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 20.29it/s]


9 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 37.91it/s]


10 / 100 completed type [2]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 12.56it/s]


11 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 36.33it/s]


12 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 87.61it/s]


13 / 100 completed type [2]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 24.22it/s]


14 / 100 completed type [1]


Processing PDFs: 100%|██████████| 1/1 [00:00<00:00, 107.28it/s]


15 / 100 completed type [1]


Processing PDFs:   0%|          | 0/1 [00:00<?, ?it/s]


UnboundLocalError: local variable 'clusters' referenced before assignment

In [108]:
predictions

[['https://www.sec.gov/Archives/edgar/data/1345471/000090266423000183/p23-0016_exhibit1.pdf',
  array([1])],
 ['https://www.sec.gov/Archives/edgar/data/1299709/000129970924000011/jan2024deck.pdf',
  array([1])],
 ['https://www.sec.gov/files/hamilton-lane-presentation-092020.pdf',
  array([1])],
 ['https://www.sec.gov/files/amac-emoney-presentation-070721.pdf', array([1])],
 ['https://www.sec.gov/files/amac-kitces-presentation-070721.pdf', ''],
 ['https://www.sec.gov/Archives/edgar/data/939767/000119312523144216/d510591ddefa14a1.pdf',
  array([1])],
 ['https://www.sec.gov/Archives/edgar/data/1299709/000129970922000016/investorfeb22update.pdf',
  array([1])],
 ['https://www.sec.gov/Archives/edgar/data/1372612/000092189521001941/ex1todfan14a06297267_080621.pdf',
  array([1])],
 ['https://www.sec.gov/Archives/edgar/data/1846832/000095017023022881/dlo-ex99_4.pdf',
  array([1])],
 ['https://www.sec.gov/Archives/edgar/data/876883/000119380521000825/ex992to13da110123011_060921.pdf',
  array([1