# Data preparation of classification using additional covariates

This file takes the output of btm_approach/00-estc_btm_prep.ipynb for further processing.

In [41]:
# Import libraries
import pandas as pd
import numpy as np
import dropbox
import io
import torch
from sentence_transformers import SentenceTransformer

# Connect to dropbox for raw data DO NOT COMMIT TOKEN
access_token = 'XXXX'
dbx = dropbox.Dropbox(access_token)

## Load processed data

In [76]:
# Load processed data
md, response = dbx.files_download("/ClassifyingESTC/intermediate_output/estc_btm_prepped.csv")
filtered_data = pd.read_csv(io.BytesIO(response.content), encoding = "ISO-8859-1")
filtered_data = filtered_data.sort_values("system_number")

filtered_data.head()

Unnamed: 0,system_number,date,pages,format,title,clean_title
60166,6006897,1678.0,156.0,12mo,"Art's treasury of rareties, and curious invent...",art treasuri rareti curiou invent two part
82409,6013641,1692.0,1.0,half_sheet,Reasons humbly offered against the passing of ...,reason humbl offer pass bill sole use convexli...
91628,6013665,1698.0,1.0,broadside,Reasons humbly offered for passing the bill fo...,reason humbl offer pass bill rendr law effectu...
88500,6014314,1696.0,1.0,half_sheet,A representation of great evils arising by the...,represent great evil aris export wooll humbl o...
93218,6015102,1699.0,4.0,folio,"Some considerations, humbly offered to the Hon...",consider humbl offer honour hous common concer...


## Embedding text

In [3]:
# Model choice for embeddings
# Models can be found here: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
# Better models exist, but take much longer to encode
# This one still takes ~4hours
# all-mpnet-base-v2 takes ~32hours
model = SentenceTransformer("all-MiniLM-L6-v2")

# Scary, allow model to use unlimited memory
torch.mps.set_per_process_memory_fraction(0.0)

# Create embeddings
embeddings_title = model.encode(filtered_data.title,
                                output_value = "sentence_embedding",
                                show_progress_bar = True,
                                batch_size=32)

embeddings_clean_title = model.encode(filtered_data.clean_title,
                                      output_value = "sentence_embedding",
                                      show_progress_bar = True,
                                      batch_size=32)

Batches:   0%|          | 0/2960 [00:00<?, ?it/s]

Batches:   0%|          | 0/2960 [00:00<?, ?it/s]

In [11]:
# I'll be repeating these to track how the vector is forming
embeddings_title[0]

array([-9.01262276e-03, -2.54522115e-02,  6.06522523e-02, -7.74590075e-02,
       -5.29957935e-02,  3.65858525e-02,  5.13880737e-02, -1.06075309e-01,
       -5.72221205e-02,  8.24946761e-02,  1.44217731e-02, -1.92364268e-02,
        1.65596213e-02, -3.33328508e-02, -2.10373960e-02,  1.08000852e-01,
       -3.19029577e-02, -9.35262069e-03, -5.07031344e-02,  5.33827879e-02,
        9.03201755e-03,  7.62023078e-03, -2.91096177e-02, -4.67273314e-03,
       -4.50176740e-04,  1.58876495e-03,  2.35229116e-02,  2.24165954e-02,
        4.69742939e-02,  2.82749552e-02,  2.19025780e-02, -3.97498384e-02,
        3.69866169e-03,  1.28560849e-02, -5.59397787e-02,  3.32304202e-02,
        1.15490220e-01, -4.08037566e-02,  5.10789491e-02,  9.18175280e-03,
       -5.52255958e-02,  4.00929786e-02, -2.17728261e-02, -1.14935655e-02,
       -7.24142492e-02, -2.38872599e-02,  2.10224353e-02,  4.39102277e-02,
       -1.07716238e-02,  3.14554875e-03, -1.30925635e-02, -6.57068286e-03,
       -4.62571979e-02, -

## Other preparation 

In [93]:
# Add log pages
pages_feature = np.log(filtered_data["pages"])
features = np.column_stack((embeddings_title, pages_feature))

features[1]

array([-1.89171210e-02,  1.87031757e-02, -1.45315472e-02, -3.07348892e-02,
        3.29026743e-03,  8.43884796e-02,  7.76196644e-03,  1.62816551e-02,
       -4.00291681e-02,  6.43679574e-02, -4.19794768e-03,  3.16341780e-02,
        2.80838907e-02, -5.78374602e-02, -1.68797821e-02,  2.19400506e-02,
       -7.60562271e-02,  3.40041555e-02, -6.86389906e-03, -5.96276969e-02,
       -1.16820373e-02, -6.38233125e-03,  9.57775582e-03,  4.46064845e-02,
       -8.63676593e-02, -7.76008815e-02,  2.22403090e-02, -7.91972205e-02,
       -6.68545291e-02,  4.66312142e-03, -2.62684803e-02, -2.68599554e-03,
        5.26589900e-02,  3.67099345e-02,  1.34861860e-02,  4.54639010e-02,
        1.10126004e-01,  6.50118813e-02,  6.41018599e-02,  5.45842871e-02,
        1.51808253e-02, -8.96672681e-02,  1.31717902e-02, -9.81934019e-04,
        5.03027719e-03,  5.44506609e-02, -2.76488662e-02, -6.01341873e-02,
       -1.19349090e-02,  3.94300604e-03, -5.14975637e-02,  1.23804035e-02,
       -4.30750176e-02, -

In [97]:
# Create feature for format 
format_feature = filtered_data[["system_number", "format"]]
format_feature["x"] = 1
format_feature = format_feature.pivot(index = "system_number",
                                      columns = "format")
format_feature = format_feature.fillna(0)

format_feature.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  format_feature["x"] = 1


Unnamed: 0_level_0,x,x,x,x,x,x,x,x,x,x,x,x,x,x
format,10mo,12mo,16mo,24mo,32mo,broadside,eighth_sheet,folio,half_sheet,octavo,quarter_sheet,quarto,sixes,unknown
system_number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
6006897,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6013641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6013665,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6014314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6015102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
# Add to vector
format_feature = np.array(format_feature)
features = np.column_stack((features, format_feature))
features[0]

array([-9.01262276e-03, -2.54522115e-02,  6.06522523e-02, -7.74590075e-02,
       -5.29957935e-02,  3.65858525e-02,  5.13880737e-02, -1.06075309e-01,
       -5.72221205e-02,  8.24946761e-02,  1.44217731e-02, -1.92364268e-02,
        1.65596213e-02, -3.33328508e-02, -2.10373960e-02,  1.08000852e-01,
       -3.19029577e-02, -9.35262069e-03, -5.07031344e-02,  5.33827879e-02,
        9.03201755e-03,  7.62023078e-03, -2.91096177e-02, -4.67273314e-03,
       -4.50176740e-04,  1.58876495e-03,  2.35229116e-02,  2.24165954e-02,
        4.69742939e-02,  2.82749552e-02,  2.19025780e-02, -3.97498384e-02,
        3.69866169e-03,  1.28560849e-02, -5.59397787e-02,  3.32304202e-02,
        1.15490220e-01, -4.08037566e-02,  5.10789491e-02,  9.18175280e-03,
       -5.52255958e-02,  4.00929786e-02, -2.17728261e-02, -1.14935655e-02,
       -7.24142492e-02, -2.38872599e-02,  2.10224353e-02,  4.39102277e-02,
       -1.07716238e-02,  3.14554875e-03, -1.30925635e-02, -6.57068286e-03,
       -4.62571979e-02, -