In [1]:
import pandas as pd
import numpy as np

In [2]:
#Explicit dtype for every column that is read from the sheet.
columnDtypes = {
    'Title': str,
    'Original Publication Year': object,
    'Author': str,
    'Summary': str,
    'Keywords': str,
    'Additional Keywords': str,
    'Average Rating': float,
    'Number of Pages': float,
    'Publisher': str,
    'ISBN': str,
    'ISBN13': str,
    'Themes': str,
}

#sheet_name=1 selects the sheet by zero-indexed position, i.e. the second sheet of the workbook.
df = pd.read_excel("Psychology Classic Books.xlsx", sheet_name=1, dtype=columnDtypes)

In [3]:
#Preview the first rows to confirm the sheet loaded with the expected columns.
df.head()

Unnamed: 0,Title,Original Publication Year,Author,Summary,Keywords,Additional Keywords,Average Rating,Number of Pages,Publisher,ISBN,ISBN13,Themes
0,Understanding Human Nature,1927,Alfred Adler,Originally published in 1927 this book attempt...,"inferiority complex, character","development, goal",3.96,224.0,Fawcett,449308332,9780449308332,developmental
1,The Nature of Prejudice,1954,Gordon W. Allport,With profound insight into the complexities of...,"prejudice, race, discrimination","violence, rationality",4.24,575.0,Basic Books,201001799,9780201001792,social
2,Self-Efficacy: The Exercise of Control,1997,Albert Bandura,With over 20 years of research by renowned psy...,"self-system, self-esteem, self-efficacy","goal, self",4.12,604.0,Worth Publishers,716728508,9780716728504,behavioural
3,The Gift of Fear: Survival Signals That Protec...,1997,Gavin de Becker,"A date won't take ""no"" for an answer. The new ...",fear,"violence, intuition, crime",4.18,352.0,Bloomsbury Publishing PLC,747538352,9780747538356,social
4,Games People Play,1964,Eric Berne,"Forty years ago, Games People Play revolutioni...","stroke, transactional analysis, games",social,3.74,192.0,Penguin,345032799,9780345032799,social


In [4]:
#Convert number of pages to int.
#Use Int64 for NaN support: https://stackoverflow.com/a/70548802
#The column is read as float, so it is cast here to pandas' nullable integer dtype 'Int64'.
#NOTE(review): `math` is imported but not referenced in this cell - confirm whether it is still needed.
import math
df["Number of Pages"] = df["Number of Pages"].astype('Int64')

In [5]:
#Check the resulting column dtypes after the Int64 conversion of "Number of Pages".
df.dtypes

Title                         object
Original Publication Year     object
Author                        object
Summary                       object
Keywords                      object
Additional Keywords           object
Average Rating               float64
Number of Pages                Int64
Publisher                     object
ISBN                          object
ISBN13                        object
Themes                        object
dtype: object

Transforming the additional keywords column into a connection.

Create a function that applies to each row of the dataframe. For the additional keywords column, if other rows also contain the same keywords, then increase the strength of the connection.


In [6]:
#Split the additional keywords into a list. Make sure each element is lowercase
#Missing cells are treated as an empty string first, otherwise the lambda would receive NaN (a float) and fail.
#Blank tokens (empty cell, trailing or doubled commas) are dropped so that '' never becomes a keyword.
df["Additional Keywords List"] = (
    df["Additional Keywords"]
    .fillna("")
    .str.split(",")
    .apply(lambda parts: [part.strip().lower() for part in parts if part.strip()])
)
df.head()

Unnamed: 0,Title,Original Publication Year,Author,Summary,Keywords,Additional Keywords,Average Rating,Number of Pages,Publisher,ISBN,ISBN13,Themes,Additional Keywords List
0,Understanding Human Nature,1927,Alfred Adler,Originally published in 1927 this book attempt...,"inferiority complex, character","development, goal",3.96,224,Fawcett,449308332,9780449308332,developmental,"[development, goal]"
1,The Nature of Prejudice,1954,Gordon W. Allport,With profound insight into the complexities of...,"prejudice, race, discrimination","violence, rationality",4.24,575,Basic Books,201001799,9780201001792,social,"[violence, rationality]"
2,Self-Efficacy: The Exercise of Control,1997,Albert Bandura,With over 20 years of research by renowned psy...,"self-system, self-esteem, self-efficacy","goal, self",4.12,604,Worth Publishers,716728508,9780716728504,behavioural,"[goal, self]"
3,The Gift of Fear: Survival Signals That Protec...,1997,Gavin de Becker,"A date won't take ""no"" for an answer. The new ...",fear,"violence, intuition, crime",4.18,352,Bloomsbury Publishing PLC,747538352,9780747538356,social,"[violence, intuition, crime]"
4,Games People Play,1964,Eric Berne,"Forty years ago, Games People Play revolutioni...","stroke, transactional analysis, games",social,3.74,192,Penguin,345032799,9780345032799,social,[social]


In [7]:
#Get the unique values in additional keywords list

#Flatten every row's keyword list into one set; the set discards repeats.
setKeywords = {
    keyword
    for keywordList in df["Additional Keywords List"]
    for keyword in keywordList
}

#Sorted alphabetically so the keyword order is the same on every run.
listKeywords = sorted(setKeywords)
print(listKeywords)


['biology', 'compliance', 'crime', 'determinism', 'development', 'disorder', 'free will', 'goal', 'happiness', 'intelligence', 'intuition', 'marriage', 'nature', 'neuroticism', 'personality', 'prejudice', 'psychoanalysis', 'rationality', 'repression', 'self', 'social', 'success', 'therapy', 'thought', 'thoughts', 'violence']


In [8]:
#Count the occurrence of keywords for each row
#Outer list has one entry per observation; inner list has one boolean per entry of listKeywords.
#occurrence[r][k] is True when the keyword list of row r contains listKeywords[k].
#Rows are appended by position instead of being assigned through the index label,
#so the matrix stays aligned with the row order even when df's index is not 0..n-1.
occurrence = [
    [keyword in keywordList for keyword in listKeywords]
    for keywordList in df["Additional Keywords List"]
]
print(listKeywords)
# for i in occurrence:
#     print(i)






['biology', 'compliance', 'crime', 'determinism', 'development', 'disorder', 'free will', 'goal', 'happiness', 'intelligence', 'intuition', 'marriage', 'nature', 'neuroticism', 'personality', 'prejudice', 'psychoanalysis', 'rationality', 'repression', 'self', 'social', 'success', 'therapy', 'thought', 'thoughts', 'violence']
[False, False, False, False, True, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, True]
[False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False]
[False, False, True, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False

In [9]:
#This option will create wasted space since the matrix will be symmetrical about the diagonal. 
#Outer array to contain the strength of connection between different rows. 
# strengthMatrix = [None] * df.shape[0]


# for idx, occurRow in enumerate(occurrence):

#     #This will contain the strength of the row with other rows. 
#     newStrengthRow = [None] * df.shape[0] 
#     # Look for the other occurRows
#     for idx1, occurRow1 in enumerate(occurrence):

#         #Only look at other rows. Only fill top half of the matrix (it is symmetrical on the diagonal)
#         if idx != idx1 and idx1 > idx:
#             strength = sum(np.array(occurRow) & np.array(occurRow1))
#             newStrengthRow[idx1] = strength
        
#     strengthMatrix[idx] = newStrengthRow


# for i in strengthMatrix:
#     print(i)






In [10]:
#Outer list to contain the strength of connection between different rows.
#The strength between two rows is the number of keywords that both rows contain.
#Since the matrix is symmetrical across the diagonal, only one side is stored:
#strengthMatrix[idx][k] is the strength between row idx and row idx + k + 1.

#Convert the boolean lists once, instead of rebuilding numpy arrays for every pair of rows.
occurrenceArray = np.array(occurrence, dtype=bool)
numRows = len(occurrence)

strengthMatrix = [
    [
        #Use the & operator to check for both true. Count the Trues to get the number of shared keywords.
        int(np.count_nonzero(occurrenceArray[idx] & occurrenceArray[idx1]))
        #Only look at the rows after idx; earlier pairs are already covered by symmetry.
        for idx1 in range(idx + 1, numRows)
    ]
    for idx in range(numRows)
]


# for i in strengthMatrix:
#     print(i)


[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 3, 1, 0, 0, 1,

In [13]:
#Sanity check of the idea: count the positions where both lists hold True.

test1 = [True, True, False]
test2 = [True, True, False]

firstMask = np.array(test1)
secondMask = np.array(test2)
bothTrue = firstMask & secondMask
print(sum(bothTrue))

2


In [14]:
#Count how many books fall under each theme.
df["Themes"].value_counts()

Themes
clinical         8
psychodynamic    8
self-help        7
social           6
behavioural      6
cognitive        6
developmental    3
personality      3
biological       3
Name: count, dtype: int64

In [15]:
#Creating a color list for different themes

#One hex color per theme name; a theme that is absent from this mapping maps to NaN.
themeColors = {
    "clinical": "#03A9F4",
    "psychodynamic": "#7B1FA2",
    "self-help": "#7CB342",
    "social": "#FFB300",
    "behavioural": "#6D4C41",
    "cognitive": "#546E7A",
    "developmental": "#3F51B5",
    "personality": "#D32F2F",
    "biological": "#33691E",
}
df["ThemeColor"] = df["Themes"].map(themeColors)

In [16]:
#Preview the frame again to confirm the new ThemeColor column.
df.head()

Unnamed: 0,Title,Original Publication Year,Author,Summary,Keywords,Additional Keywords,Average Rating,Number of Pages,Publisher,ISBN,ISBN13,Themes,Additional Keywords List,ThemeColor
0,Understanding Human Nature,1927,Alfred Adler,Originally published in 1927 this book attempt...,"inferiority complex, character","development, goal",3.96,224,Fawcett,449308332,9780449308332,developmental,"[development, goal]",#3F51B5
1,The Nature of Prejudice,1954,Gordon W. Allport,With profound insight into the complexities of...,"prejudice, race, discrimination","violence, rationality",4.24,575,Basic Books,201001799,9780201001792,social,"[violence, rationality]",#FFB300
2,Self-Efficacy: The Exercise of Control,1997,Albert Bandura,With over 20 years of research by renowned psy...,"self-system, self-esteem, self-efficacy","goal, self",4.12,604,Worth Publishers,716728508,9780716728504,behavioural,"[goal, self]",#6D4C41
3,The Gift of Fear: Survival Signals That Protec...,1997,Gavin de Becker,"A date won't take ""no"" for an answer. The new ...",fear,"violence, intuition, crime",4.18,352,Bloomsbury Publishing PLC,747538352,9780747538356,social,"[violence, intuition, crime]",#FFB300
4,Games People Play,1964,Eric Berne,"Forty years ago, Games People Play revolutioni...","stroke, transactional analysis, games",social,3.74,192,Penguin,345032799,9780345032799,social,[social],#FFB300


In [17]:
#Possible viz library: https://pyvis.readthedocs.io/en/latest/documentation.html, https://github.com/imohitmayank/jaal

In [18]:
from pyvis import network
import networkx as nx

In [19]:
#Formatting the title (popup)
    
import textwrap

def formatTitle(title, year, author, summary, keywords, averageRating):
    """Build the multi-line text used as a node's title (popup).

    Parameters
    ----------
    title, year, author, keywords, averageRating
        Values interpolated as-is into their labelled lines.
    summary
        Summary text, wrapped to lines of at most 80 characters. A missing
        summary (None or float NaN) is rendered as an empty summary instead
        of raising inside textwrap.

    Returns
    -------
    str
        One labelled line per field, separated by newlines, with no leading
        indentation and no trailing whitespace line.
    """
    #A missing value arrives as None or float NaN (NaN is the only float that differs from itself);
    #textwrap.fill only accepts str, so normalise before wrapping.
    if summary is None or (isinstance(summary, float) and summary != summary):
        summary = ""
    wrapped_summary = textwrap.fill(str(summary), 80)

    #Join explicit lines instead of using an indented triple-quoted string,
    #so that the indentation of the source code does not leak into the popup text.
    return "\n".join([
        f"Title: {title}",
        f"Year: {year}",
        f"Author: {author}",
        f"Average Rating (Goodreads): {averageRating}",
        f"Keywords: {keywords}",
        f"Summary: {wrapped_summary}",
    ])

    


In [359]:
#Graph options as a JSON string; it is handed to net.set_options() after the nodes and edges are added.
#It sets the physics parameters (velocity, timestep, barnesHut spring/gravity values),
#a fixed layout randomSeed, and turns on the navigation buttons.
options = '''
    {
        "autoResize": false,
        "physics": {
            "maxVelocity" : 5,
            "timestep" : 0.25,
            "barnesHut" : {
                "springLength": 30,
                "springConstant": 0.01,
                "centralGravity" : 0.5
            }
        },
        "layout" : {
            "randomSeed" : 62,
            "clusterThreshold" : 50
        }, 
        "interaction": {
            "navigationButtons" : true
        }
    }
'''



In [360]:
net = network.Network(notebook=True, select_menu=True, filter_menu=True, neighborhood_highlight=True)

#Need to loop, so we can set more properties for nodes
#Node ids are the row POSITION (0..n-1), not the index label: the edges below are built from
#strengthMatrix, which is positional, so both must use the same numbering.
for nodeId, (_, row) in enumerate(df.iterrows()):

    popupText = formatTitle(
        row['Title'],
        row['Original Publication Year'],
        row['Author'],
        row['Summary'],
        row['Keywords'],
        row['Average Rating'],
    )
    net.add_node(
        n_id=nodeId,
        label=row["Title"],
        color=row["ThemeColor"],
        title=popupText,
        font={"face": "tahoma", "size": 15},
        widthConstraint=200,
        physics=True,
    )

#Add edges
#For each source node
for sourceId, strengthRow in enumerate(strengthMatrix):

    #Add a link to a destination node if its strength is not 0.
    for offset, val in enumerate(strengthRow):

        if val == 0:
            continue

        #strengthMatrix only stores the entries after the diagonal, so position `offset`
        #in the row of `sourceId` refers to the node sourceId + offset + 1.
        destinationId = sourceId + offset + 1
        net.add_edge(
            int(sourceId),
            int(destinationId),
            value=int(val),
            color={"color": '#607D8B', "highlight": "#000000"},
            physics=True,
            scaling={"max": 5},
            length=300,
            dashes=True,
            chosen={"edge.dashes": False},
        )


net.set_options(options)



In [361]:
#To add legend: https://github.com/WestHealth/pyvis/issues/50
#The legend is a vertical column of square nodes, one per theme, excluded from the physics simulation.
x = -2000
y = -250
step = 100
legendLabels = ["clinical","psychodynamic" ,"self-help", "social" , "behavioural", "cognitive" , "developmental", "personality", "biological"]
legendColor = ["#03A9F4","#7B1FA2", "#7CB342", "#FFB300", "#6D4C41", "#546E7A", "#3F51B5",  "#D32F2F", "#33691E"]
#Size the legend from the legend lists themselves: counting the themes in the data could
#exceed the length of these lists (IndexError) or silently cut the legend short.
numOfColors = min(len(legendLabels), len(legendColor))
xCoords = [x] * numOfColors
yCoords = [y + (i * step) for i in range(numOfColors)]
for i in range(numOfColors):
    #The keys of `fixed` must be the strings "x" and "y"; the bare names x and y would
    #evaluate to the coordinates above (-2000 and -250) and produce meaningless keys.
    net.add_node(
        legendLabels[i],
        label=legendLabels[i],
        color=legendColor[i],
        x=xCoords[i],
        y=yCoords[i],
        physics=False,
        fixed={"x": False, "y": False},
        shape='square',
    )

In [362]:
#net.toggle_physics(True)
#Render the network through pyvis' show(), using index.html as the output name.
#NOTE(review): the saved output of this cell shows PsychGraph.html, so it appears to predate this file name - re-run to refresh.
net.show("index.html")

PsychGraph.html


In [363]:
#Commented out this part because it makes the center node jittery. The physics does not work well because the node keeps moving straight back to the center position upon any interaction with other nodes. 
#Lets adjust the center of the graph to a node which has the most connections.
# import math
# adjList = net.get_adj_list()
# max = -math.inf
# maxNode = None
# for key in adjList:
#     if len(adjList[key]) > max:
#         max = len(adjList[key])
#         maxNode = key

# print(f'The node id with the highest degree of {max} is {maxNode}')

In [364]:
# net = network.Network(notebook= True, select_menu= True, filter_menu= True, neighborhood_highlight= True)

# #Need to loop, so we can set more properties for nodes
# for idx, row in df.iterrows():

#     #if the node is the highest degree:
#     if idx == maxNode:
#         print(idx)
#         print(row["Title"])
#         net.add_node(n_id = idx, x = 0, y = 0,label = row["Title"], color = row["ThemeColor"], title = formatTitle(row['Title'], row['Original Publication Year'], row['Author'],row['Summary'], row['Keywords'], row['Average Rating']), font = {"face" : "tahoma", "size" : 15},  widthConstraint = 200, physics = True)

#     else:
#         net.add_node(n_id = idx, label = row["Title"], color = row["ThemeColor"], title = formatTitle(row['Title'], row['Original Publication Year'], row['Author'],row['Summary'], row['Keywords'], row['Average Rating']), font = {"face" : "tahoma", "size" : 15},  widthConstraint = 200, physics = True)

    

# #Add edges
# #For each source node
# for idx, row in enumerate(strengthMatrix):
    
#     #Add a link to a destination node if its not 0. 
#     for idx1, val in enumerate(row):
        
#         #For the destination node, since the strength matrix is only filled to the diagonal, the indexing must offset the start position is actually the source.
#         if val != 0:

#             #print(f" idx/source: {idx}, idx1: {idx1}, destination: {idx+idx1+1}")
#             net.add_edge(int(idx), int(idx+idx1+1), value = int(val), color = { "color": '#607D8B', "highlight" : "#000000" } , physics = True, scaling = {"max" : 5}, length = 300, dashes = True, chosen = {"edge.dashes" : False})


# #Change options for the graph
# net.set_options(options)

In [365]:
# #To add legend: https://github.com/WestHealth/pyvis/issues/50
# x = -2000
# y = -250
# step = 100
# numOfColors = len(list(df["Themes"].unique()))
# legendLabels = ["clinical","psychodynamic" ,"self-help", "social" , "behavioural", "cognitive" , "developmental", "personality", "biological"]
# legendColor = ["#03A9F4","#7B1FA2", "#7CB342", "#FFB300", "#6D4C41", "#546E7A", "#3F51B5",  "#D32F2F", "#33691E"]
# xCoords = [x] * numOfColors
# yCoords = [ y + (i * step) for i in range(numOfColors)]
# for i in range(numOfColors):
#     net.add_node(legendLabels[i], label = legendLabels[i], color = legendColor[i], x = xCoords[i], y = yCoords[i], physics = False, fixed = {x : False, y : False}, shape = 'square' )

In [366]:
# #net.toggle_physics(True)
# net.show("PsychGraph.html")