# Data preparation of classification using additional covariates

This file takes the output of btm_approach/00-estc_btm_prep.ipynb for further processing.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import dropbox
import io
import torch
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


## Load processed data

In [14]:
# Load processed data
filtered_data = pd.read_csv("/Users/jamiesanders/Dropbox/ClassifyingESTC/intermediate_output/estc_btm_prepped.csv", encoding = "ISO-8859-1")
filtered_data = filtered_data.sort_values("system_number")

filtered_data.head()

Unnamed: 0,system_number,date,pages,format,title,clean_title
60166,6006897,1678.0,156.0,12mo,"Art's treasury of rareties, and curious invent...",art treasuri rareti curiou invent two part
82409,6013641,1692.0,1.0,half_sheet,Reasons humbly offered against the passing of ...,reason humbl offer pass bill sole use convexli...
91628,6013665,1698.0,1.0,broadside,Reasons humbly offered for passing the bill fo...,reason humbl offer pass bill rendr law effectu...
88500,6014314,1696.0,1.0,half_sheet,A representation of great evils arising by the...,represent great evil aris export wooll humbl o...
93218,6015102,1699.0,4.0,folio,"Some considerations, humbly offered to the Hon...",consider humbl offer honour hous common concer...


## Embedding text

In [15]:
# Model choice for embeddings
# Models can be found here: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
# Better models exist, but take much longer to encode
# This one still takes ~4hours
# all-mpnet-base-v2 takes ~32hours
model = SentenceTransformer("all-MiniLM-L6-v2")

# Allow model to use unlimited memory
torch.mps.set_per_process_memory_fraction(0.0)

# Create embeddings
# Embeddings should be on original texts
embeddings_title = model.encode(filtered_data.title,
                                output_value = "sentence_embedding",
                                show_progress_bar = True,
                                batch_size=32)

Batches:   0%|          | 0/2960 [00:00<?, ?it/s]

In [16]:
# I'll be repeating these to track how the vector is forming
embeddings_title[0]

array([-9.01262276e-03, -2.54522115e-02,  6.06522523e-02, -7.74590075e-02,
       -5.29957935e-02,  3.65858525e-02,  5.13880737e-02, -1.06075309e-01,
       -5.72221205e-02,  8.24946761e-02,  1.44217731e-02, -1.92364268e-02,
        1.65596213e-02, -3.33328508e-02, -2.10373960e-02,  1.08000852e-01,
       -3.19029577e-02, -9.35262069e-03, -5.07031344e-02,  5.33827879e-02,
        9.03201755e-03,  7.62023078e-03, -2.91096177e-02, -4.67273314e-03,
       -4.50176740e-04,  1.58876495e-03,  2.35229116e-02,  2.24165954e-02,
        4.69742939e-02,  2.82749552e-02,  2.19025780e-02, -3.97498384e-02,
        3.69866169e-03,  1.28560849e-02, -5.59397787e-02,  3.32304202e-02,
        1.15490220e-01, -4.08037566e-02,  5.10789491e-02,  9.18175280e-03,
       -5.52255958e-02,  4.00929786e-02, -2.17728261e-02, -1.14935655e-02,
       -7.24142492e-02, -2.38872599e-02,  2.10224353e-02,  4.39102277e-02,
       -1.07716238e-02,  3.14554875e-03, -1.30925635e-02, -6.57068286e-03,
       -4.62571979e-02, -

## Prepare vectors for other covariates

In [17]:
# Add log pages
covariate_features = np.log(filtered_data["pages"])

covariate_features.iloc[0]

5.049856007249537

In [18]:
# Create feature for format 
format_feature = filtered_data[["system_number", "format"]]
format_feature["x"] = 1
format_feature = format_feature.pivot(index = "system_number",
                                      columns = "format")
format_feature = format_feature.fillna(0)

format_feature.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  format_feature["x"] = 1


Unnamed: 0_level_0,x,x,x,x,x,x,x,x,x,x,x,x,x,x
format,10mo,12mo,16mo,24mo,32mo,broadside,eighth_sheet,folio,half_sheet,octavo,quarter_sheet,quarto,sixes,unknown
system_number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
6006897,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6013641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6013665,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6014314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6015102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# add feature
format_feature = np.array(format_feature)
covariate_features = np.column_stack((covariate_features, format_feature))
covariate_features[0]

array([5.04985601, 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

## Weight features and combine

### No embedding dim reduction 

In [21]:
features_full = np.column_stack((embeddings_title, covariate_features))
np.save("/Users/jamiesanders/Dropbox/ClassifyingESTC/intermediate_output/features_full", features_full)

features_full[0]

array([-9.01262276e-03, -2.54522115e-02,  6.06522523e-02, -7.74590075e-02,
       -5.29957935e-02,  3.65858525e-02,  5.13880737e-02, -1.06075309e-01,
       -5.72221205e-02,  8.24946761e-02,  1.44217731e-02, -1.92364268e-02,
        1.65596213e-02, -3.33328508e-02, -2.10373960e-02,  1.08000852e-01,
       -3.19029577e-02, -9.35262069e-03, -5.07031344e-02,  5.33827879e-02,
        9.03201755e-03,  7.62023078e-03, -2.91096177e-02, -4.67273314e-03,
       -4.50176740e-04,  1.58876495e-03,  2.35229116e-02,  2.24165954e-02,
        4.69742939e-02,  2.82749552e-02,  2.19025780e-02, -3.97498384e-02,
        3.69866169e-03,  1.28560849e-02, -5.59397787e-02,  3.32304202e-02,
        1.15490220e-01, -4.08037566e-02,  5.10789491e-02,  9.18175280e-03,
       -5.52255958e-02,  4.00929786e-02, -2.17728261e-02, -1.14935655e-02,
       -7.24142492e-02, -2.38872599e-02,  2.10224353e-02,  4.39102277e-02,
       -1.07716238e-02,  3.14554875e-03, -1.30925635e-02, -6.57068286e-03,
       -4.62571979e-02, -

### Reduce dims
Run PCA to lower dims and then save

In [22]:
features_100 = np.column_stack((PCA(100).fit_transform(embeddings_title), covariate_features))
np.save("/Users/jamiesanders/Dropbox/ClassifyingESTC/intermediate_output/features_100", features_100)

features_100[0]

array([-2.78507322e-01,  1.40984640e-01, -2.86638290e-02, -1.53006539e-01,
        8.44496489e-03,  5.31298742e-02,  1.11001939e-01, -9.51588824e-02,
       -7.92620927e-02, -1.21324241e-01,  5.63577898e-02,  1.52155504e-01,
        8.34959671e-02,  1.06895506e-01,  3.31639647e-02,  4.07431573e-02,
       -1.06603488e-01, -1.50761217e-01, -1.83301896e-01, -1.14719242e-01,
       -1.59025416e-02, -8.27423930e-02, -2.60801278e-02,  1.28782541e-03,
        4.16102409e-02,  9.53634605e-02,  1.82779543e-02,  1.44507065e-02,
        4.93258163e-02,  5.90623878e-02,  2.77873874e-03,  7.75520653e-02,
        1.20484196e-01,  4.86522587e-03,  5.91818541e-02, -8.35765153e-02,
       -7.66809657e-02,  4.14965525e-02,  1.95530709e-02, -1.54720992e-02,
        4.93888482e-02, -7.45728612e-02,  7.98505694e-02, -4.63380106e-03,
       -8.11053533e-03,  1.00478582e-01, -2.45004352e-02, -8.42556283e-02,
       -4.46604080e-02, -2.85337996e-02,  1.06824152e-02,  6.79470971e-02,
        2.42191181e-03, -

In [23]:
features_75 = np.column_stack((PCA(75).fit_transform(embeddings_title), covariate_features))
np.save("/Users/jamiesanders/Dropbox/ClassifyingESTC/intermediate_output/features_75", features_75)


features_75[0]

array([-2.78507322e-01,  1.40984625e-01, -2.86638290e-02, -1.53006524e-01,
        8.44496489e-03,  5.31298816e-02,  1.11001946e-01, -9.51588824e-02,
       -7.92620853e-02, -1.21324226e-01,  5.63577749e-02,  1.52155474e-01,
        8.34959745e-02,  1.06895447e-01,  3.31639685e-02,  4.07431684e-02,
       -1.06603540e-01, -1.50761202e-01, -1.83301896e-01, -1.14719190e-01,
       -1.59025714e-02, -8.27423632e-02, -2.60801427e-02,  1.28784031e-03,
        4.16102484e-02,  9.53634605e-02,  1.82779524e-02,  1.44507065e-02,
        4.93258163e-02,  5.90623878e-02,  2.77875364e-03,  7.75520578e-02,
        1.20484196e-01,  4.86523705e-03,  5.91818467e-02, -8.35765153e-02,
       -7.66809583e-02,  4.14965674e-02,  1.95530597e-02, -1.54720955e-02,
        4.93888296e-02, -7.45728835e-02,  7.98505470e-02, -4.63379920e-03,
       -8.11055116e-03,  1.00478642e-01, -2.45004352e-02, -8.42556283e-02,
       -4.46603969e-02, -2.85337977e-02,  1.06824152e-02,  6.79470971e-02,
        2.42191181e-03, -

In [24]:
features_50 = np.column_stack((PCA(50).fit_transform(embeddings_title), covariate_features))
np.save("/Users/jamiesanders/Dropbox/ClassifyingESTC/intermediate_output/features_50", features_50)

features_50[0]

array([-2.78507262e-01,  1.40984669e-01, -2.86637917e-02, -1.53006583e-01,
        8.44496489e-03,  5.31298965e-02,  1.11001864e-01, -9.51588973e-02,
       -7.92620927e-02, -1.21324249e-01,  5.63577898e-02,  1.52155504e-01,
        8.34959596e-02,  1.06895491e-01,  3.31639610e-02,  4.07431573e-02,
       -1.06603473e-01, -1.50761202e-01, -1.83301896e-01, -1.14719242e-01,
       -1.59025490e-02, -8.27423930e-02, -2.60801241e-02,  1.28782541e-03,
        4.16102484e-02,  9.53634679e-02,  1.82779450e-02,  1.44507214e-02,
        4.93258052e-02,  5.90623990e-02,  2.77876109e-03,  7.75520504e-02,
        1.20484166e-01,  4.86522634e-03,  5.91818541e-02, -8.35765079e-02,
       -7.66809657e-02,  4.14965525e-02,  1.95530709e-02, -1.54720992e-02,
        4.93888520e-02, -7.45728612e-02,  7.98505694e-02, -4.63380106e-03,
       -8.11053161e-03,  1.00478575e-01, -2.45004352e-02, -8.42556357e-02,
       -4.46604118e-02, -2.85337958e-02,  5.04985601e+00,  0.00000000e+00,
        1.00000000e+00,  

In [25]:
features_25 = np.column_stack((PCA(25).fit_transform(embeddings_title), covariate_features))
np.save("/Users/jamiesanders/Dropbox/ClassifyingESTC/intermediate_output/features_25", features_25)

features_25[0]

array([-2.78507262e-01,  1.40984669e-01, -2.86637917e-02, -1.53006583e-01,
        8.44486058e-03,  5.31299189e-02,  1.11001879e-01, -9.51588973e-02,
       -7.92620927e-02, -1.21324249e-01,  5.63577898e-02,  1.52155429e-01,
        8.34959596e-02,  1.06895491e-01,  3.31639610e-02,  4.07431573e-02,
       -1.06603473e-01, -1.50761202e-01, -1.83301896e-01, -1.14719242e-01,
       -1.59025490e-02, -8.27423930e-02, -2.60801241e-02,  1.28784776e-03,
        4.16102409e-02,  5.04985601e+00,  0.00000000e+00,  1.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [26]:
features_10 = np.column_stack((PCA(10).fit_transform(embeddings_title), covariate_features))
np.save("/Users/jamiesanders/Dropbox/ClassifyingESTC/intermediate_output/features_10", features_10)

features_10[0]

array([-0.27850726,  0.14098467, -0.02866379, -0.15300658,  0.00844496,
        0.0531299 ,  0.11100186, -0.0951589 , -0.0792621 , -0.12132425,
        5.04985601,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])