In [3]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer
import numpy as np

# Step 1: Read the train and test book tables from CSV.
# NOTE(review): the train file name ends in 'fina1' (digit one) — possibly a
# typo for 'final'; confirm against the actual file on disk before changing.
file_path_train = 'data/filtered_books_fina1.csv'
file_path_test = 'data/filtered_books_final_test.csv'
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test)
)

# Show the test frame as the cell output.
df_test


Unnamed: 0,book_id,title,authors,description,similar_book
0,13500133,Legend Phoenix : Rise Crescent,3954554,"thousands years , legendary clan known Phoenix...",13512962
1,18869972,House Strand,2001717,Dick Young lent house Cornwall friend Professo...,1293687
2,18074420,Mes amis,667170,Le romancier donne voir la complexite des sent...,545437
3,3556999,Black Elk Speaks,20072,"famous life story Lakota healer visionary , Ni...",844522
4,15713961,"Lessons Survivors ( Cambridge Fellows , # 9 )",2727135,"Cambridge , September 1919 Orlando Coppersmith...",18517813
...,...,...,...,...,...
6030,26114135,Ugly Wonderful Things,4183574,New York Times Bestseller \n Goodreads Runner...,23430487
6031,17166550,Dressmaker,5052587,"Torn dreams truth , faced impossible choice .....",13064827
6032,18045695,"Emma , Mr. Knightley , Chili - Slaw Dogs ( Jan...",6888370,Caroline Ashley journalist rise Washington Pos...,25426363
6033,9286210,Spinner,1893900,"Verlagstext \n "" Ich habe keine Angst vor der ...",15990986


In [5]:
# Step 2: Report per-column null counts, then drop rows that lack any of the
# fields used downstream (description, title, similar_book).
print("Missing values in train set:", df_train.isna().sum())
print("Missing values in test set:", df_test.isna().sum())

required_columns = ['description', 'title', 'similar_book']
for frame in (df_train, df_test):
    # inplace=True mutates the existing frame objects, so no rebinding is needed.
    frame.dropna(subset=required_columns, inplace=True)

# Show the test frame after the drop.
df_test

Missing values in train set: book_id         0
title           0
authors         0
description     0
similar_book    0
dtype: int64
Missing values in test set: book_id         0
title           0
authors         0
description     0
similar_book    0
dtype: int64


Unnamed: 0,book_id,title,authors,description,similar_book
0,13500133,Legend Phoenix : Rise Crescent,3954554,"thousands years , legendary clan known Phoenix...",13512962
1,18869972,House Strand,2001717,Dick Young lent house Cornwall friend Professo...,1293687
2,18074420,Mes amis,667170,Le romancier donne voir la complexite des sent...,545437
3,3556999,Black Elk Speaks,20072,"famous life story Lakota healer visionary , Ni...",844522
4,15713961,"Lessons Survivors ( Cambridge Fellows , # 9 )",2727135,"Cambridge , September 1919 Orlando Coppersmith...",18517813
...,...,...,...,...,...
6030,26114135,Ugly Wonderful Things,4183574,New York Times Bestseller \n Goodreads Runner...,23430487
6031,17166550,Dressmaker,5052587,"Torn dreams truth , faced impossible choice .....",13064827
6032,18045695,"Emma , Mr. Knightley , Chili - Slaw Dogs ( Jan...",6888370,Caroline Ashley journalist rise Washington Pos...,25426363
6033,9286210,Spinner,1893900,"Verlagstext \n "" Ich habe keine Angst vor der ...",15990986


In [6]:
# Step 3: Initialize the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def _encode_text(text):
    """Encode one string into a list of token ids, truncated/padded to 512."""
    return tokenizer.encode(text, truncation=True, padding='max_length', max_length=512)


# Tokenize descriptions and titles for both splits; each result is stored in a
# new '<column>_tokens' column on the same frame.
# TODO(review): the original author flagged the test-description tokenization
# with "change?" — it is unclear from here what change was intended.
for frame in (df_train, df_test):
    for column in ('description', 'title'):
        frame[f'{column}_tokens'] = frame[column].apply(_encode_text)

df_test

Unnamed: 0,book_id,title,authors,description,similar_book,description_tokens,title_tokens
0,13500133,Legend Phoenix : Rise Crescent,3954554,"thousands years , legendary clan known Phoenix...",13512962,"[101, 5190, 2086, 1010, 8987, 6338, 2124, 6708...","[101, 5722, 6708, 1024, 4125, 13152, 102, 0, 0..."
1,18869972,House Strand,2001717,Dick Young lent house Cornwall friend Professo...,1293687,"[101, 5980, 2402, 15307, 2160, 10387, 2767, 29...","[101, 2160, 11226, 102, 0, 0, 0, 0, 0, 0, 0, 0..."
2,18074420,Mes amis,667170,Le romancier donne voir la complexite des sent...,545437,"[101, 3393, 3142, 19562, 2123, 2638, 29536, 43...","[101, 2033, 2015, 26445, 2015, 102, 0, 0, 0, 0..."
3,3556999,Black Elk Speaks,20072,"famous life story Lakota healer visionary , Ni...",844522,"[101, 3297, 2166, 2466, 2474, 27380, 19783, 28...","[101, 2304, 18995, 8847, 102, 0, 0, 0, 0, 0, 0..."
4,15713961,"Lessons Survivors ( Cambridge Fellows , # 9 )",2727135,"Cambridge , September 1919 Orlando Coppersmith...",18517813,"[101, 4729, 1010, 2244, 4529, 10108, 6967, 214...","[101, 8220, 8643, 1006, 4729, 13572, 1010, 100..."
...,...,...,...,...,...,...,...
6030,26114135,Ugly Wonderful Things,4183574,New York Times Bestseller \n Goodreads Runner...,23430487,"[101, 2047, 2259, 2335, 24304, 2204, 16416, 51...","[101, 9200, 6919, 2477, 102, 0, 0, 0, 0, 0, 0,..."
6031,17166550,Dressmaker,5052587,"Torn dreams truth , faced impossible choice .....",13064827,"[101, 7950, 5544, 3606, 1010, 4320, 5263, 3601...","[101, 4377, 8571, 102, 0, 0, 0, 0, 0, 0, 0, 0,..."
6032,18045695,"Emma , Mr. Knightley , Chili - Slaw Dogs ( Jan...",6888370,Caroline Ashley journalist rise Washington Pos...,25426363,"[101, 7981, 9321, 4988, 4125, 2899, 2695, 5573...","[101, 5616, 1010, 2720, 1012, 5000, 3051, 1010..."
6033,9286210,Spinner,1893900,"Verlagstext \n "" Ich habe keine Angst vor der ...",15990986,"[101, 14552, 13473, 18413, 1000, 22564, 5292, ...","[101, 6714, 3678, 102, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [16]:
# Step 4: Pull the 'similar_book' column out of each split as the target array
# (y_train for training, y_test for testing).
y_train, y_test = (
    frame['similar_book'].values for frame in (df_train, df_test)
)

# Show how many test targets there are.
y_test.shape

(6035,)

In [12]:
# Step 5: Turn the token-id columns and the book ids into torch tensors.
token_columns = ('description_tokens', 'title_tokens')

# Training split: token lists become 2-D tensors, book ids a 1-D tensor.
X_train_description, X_train_title = (
    torch.tensor(df_train[column].tolist()) for column in token_columns
)
X_train_book_id = torch.tensor(df_train['book_id'].values)

# Test split: same conversion.
X_test_description, X_test_title = (
    torch.tensor(df_test[column].tolist()) for column in token_columns
)
X_test_book_id = torch.tensor(df_test['book_id'].values)

# Inspect the test tensors.
X_test_description.shape, X_test_title, X_test_book_id

(torch.Size([6035, 512]),
 tensor([[  101,  5722,  6708,  ...,     0,     0,     0],
         [  101,  2160, 11226,  ...,     0,     0,     0],
         [  101,  2033,  2015,  ...,     0,     0,     0],
         ...,
         [  101,  5616,  1010,  ...,     0,     0,     0],
         [  101,  6714,  3678,  ...,     0,     0,     0],
         [  101,  3118,   102,  ...,     0,     0,     0]]),
 tensor([13500133, 18869972, 18074420,  ..., 18045695,  9286210,   836042]))

In [13]:
# Step 6: Reshape the book-id tensors into column vectors of shape (N, 1) so
# they can be concatenated with the 2-D token tensors along dim=1.
# reshape(-1, 1) is used instead of unsqueeze(1) because this cell rebinds the
# same names: with unsqueeze, running the cell a second time would add another
# dimension ((N, 1, 1)) and break the later torch.cat. reshape(-1, 1) yields
# the same (N, 1) result on the first run and is a no-op on any re-run.
X_train_book_id = X_train_book_id.reshape(-1, 1)
X_test_book_id = X_test_book_id.reshape(-1, 1)
X_test_book_id

tensor([[13500133],
        [18869972],
        [18074420],
        ...,
        [18045695],
        [ 9286210],
        [  836042]])

In [15]:

# Step 7: Join description tokens, title tokens and the book-id column
# side by side (along dim=1) into one input tensor per split.
train_parts = (X_train_description, X_train_title, X_train_book_id)
test_parts = (X_test_description, X_test_title, X_test_book_id)

X_train_combined = torch.cat(train_parts, dim=1)
X_test_combined = torch.cat(test_parts, dim=1)

# Show the shape of the combined test tensor.
X_test_combined.shape

torch.Size([6035, 1025])

In [None]:
# Step 8: Persist the combined inputs and the targets in a single .npz archive.
arrays_to_save = {
    'X_train': X_train_combined.numpy(),
    'X_test': X_test_combined.numpy(),
    'y_train': y_train,
    'y_test': y_test,
}
np.savez('data/tokenized_books.npz', **arrays_to_save)

# Report how many examples each split contains.
print(f'Train size: {len(y_train)}, Test size: {len(y_test)}')