In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("data.csv")
print(df.isna().sum())

df.head(7)

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
dtype: int64


Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0
5,9780006280934,6280935,The Problem of Pain,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0
6,9780006353287,6353282,An Autobiography,,Agatha Christie,"Authors, English",http://books.google.com/books/content?id=c49GQ...,Donation.,1977.0,4.27,560.0,3975.0


In [3]:
text_features = [
    "title",
    "authors",
    "categories",
    "description",
]

numeric_features = [
    "published_year",
    "num_pages",
    "average_rating",
]


df = df[text_features + numeric_features].copy()


df[text_features] = df[text_features].fillna("")
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].mean())

scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])


df[numeric_features] = df[numeric_features].astype("str")


df["text_corpus"] = df[numeric_features + text_features].apply(
    lambda x: " ".join(x), axis=1
)

# Drop the original columns
df = df.drop(text_features + numeric_features, axis=1)

df.head(7)

Unnamed: 0,text_corpus
0,0.5124251800029966 -0.4188086279152424 -0.2521...
1,0.1307045368125744 -0.44364383544010716 -0.312...
2,-1.5870383575443254 0.5414860630461928 0.11116...
3,-0.5373065887706644 0.6780797044329486 -0.0099...
4,0.32156485840778554 -0.7375271244843394 0.6561...
5,0.32156485840778554 -0.7126919169594748 0.4744...
6,-2.064189161532353 0.8767613646318663 1.019489...


In [4]:
vectorizer = TfidfVectorizer(stop_words="english")

# Fit and transform the data
feature_fectors = vectorizer.fit_transform(df["text_corpus"])
print(feature_fectors)

  (0, 2561)	0.07165313895405617
  (0, 9019)	0.09507852176464482
  (0, 27911)	0.047753814331858775
  (0, 29922)	0.06000406232782798
  (0, 23381)	0.05848416546754737
  (0, 23789)	0.08594320809182328
  (0, 28976)	0.07276208880589467
  (0, 12390)	0.07752566758702129
  (0, 5127)	0.08180070465004997
  (0, 27013)	0.07877864971158875
  (0, 30571)	0.05008967403036792
  (0, 14988)	0.05182621656502792
  (0, 22245)	0.060257224169904276
  (0, 34061)	0.0319382429213672
  (0, 34092)	0.07276208880589467
  (0, 1958)	0.07877864971158875
  (0, 6435)	0.0739922437131871
  (0, 28963)	0.07032628621809751
  (0, 9601)	0.06662561304724286
  (0, 28746)	0.09507852176464482
  (0, 31883)	0.05568956733349351
  (0, 15785)	0.07201079527516446
  (0, 4392)	0.05881715211037998
  (0, 12681)	0.04779608540536186
  (0, 33178)	0.06575862938168674
  :	:
  (6808, 33199)	0.04594584439270172
  (6808, 16160)	0.05217860003279785
  (6808, 27613)	0.0711117768099556
  (6809, 14335)	0.3105793041341009
  (6809, 1684)	0.25943205897726807

In [5]:
features_similarity = cosine_similarity(feature_fectors)
print(features_similarity)

[[1.         0.00175973 0.0010948  ... 0.00674933 0.00507073 0.        ]
 [0.00175973 1.         0.0111502  ... 0.         0.00308058 0.02610709]
 [0.0010948  0.0111502  1.         ... 0.         0.00622864 0.        ]
 ...
 [0.00674933 0.         0.         ... 1.         0.         0.        ]
 [0.00507073 0.00308058 0.00622864 ... 0.         1.         0.        ]
 [0.         0.02610709 0.         ... 0.         0.         1.        ]]
