In [1]:
import pandas as pd
import spacy
from collections import Counter

# Load the small English spaCy pipeline used by every later cell.
nlp = spacy.load("en_core_web_sm")

# Read the whole book into memory as one UTF-8 string.
book_path = "01 Harry Potter and the Sorcerers Stone.txt"
with open(book_path, mode="r", encoding="utf8") as book_file:
    content = book_file.read()

# Run the pipeline over the full text; `doc` is the parsed document that the
# rest of the notebook reads entities, tokens and sentences from.
doc = nlp(content)

In [2]:
from itertools import islice

from spacy import displacy

# Preview the entity annotations on the first 15 sentences of the book.
# islice takes just those sentences from `doc.sents` instead of building a
# list of every sentence in the document only to discard all but 15.
preview_sents = list(islice(doc.sents, 15))
displacy.render(preview_sents, style="ent")




In [3]:
# Surface text of every entity labelled PERSON, in document order.
# Repeats are kept on purpose so that mentions can be counted below.
person_ents = (ent for ent in doc.ents if ent.label_ == "PERSON")
characters = [ent.text for ent in person_ents]

# Tally mentions per name.
character_counts = Counter()
character_counts.update(characters)

# Show the ten most frequently mentioned names with their counts.
print(character_counts.most_common(10))

[('Harry', 1217), ('Ron', 396), ('Dudley', 131), ('Dumbledore', 123), ('McGonagall', 98), ('Uncle Vernon', 77), ('Malfoy', 72), ('Quirrell', 68), ('Dursley', 50), ('Gryffindor', 42)]


In [4]:
# Build one row per token that is an adjective or part of a PERSON entity:
# (text, start offset, end offset, vector, entity type).
# NOTE(review): "en_core_web_sm" presumably has no static word vectors, so
# token.vector is likely a context-dependent tensor row — confirm in spaCy docs.
A = []
for token in doc:
    if token.pos_ != "ADJ" and token.ent_type_ != "PERSON":
        continue  # neither an adjective nor part of a PERSON entity
    start = token.idx
    A.append((token.text, start, start + len(token.text), token.vector, token.ent_type_))

token_columns = ['text', 'start', 'end', 'vector', 'ent_type']
df = pd.DataFrame(A, columns=token_columns)
df.head()

Unnamed: 0,text,start,end,vector,ent_type
0,Dursley,14,21,"[-1.1005309, -0.6832987, -1.3269522, 0.0094849...",PERSON
1,proud,58,63,"[0.55462456, -0.5869921, -1.2396122, -1.544119...",
2,normal,96,102,"[1.6234715, -0.45702282, 0.39489457, 0.0211604...",
3,last,139,143,"[0.24862953, -1.7796347, -0.79658234, 1.183322...",
4,strange,191,198,"[1.8782678, 0.02183574, -0.85974884, -0.689292...",


In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Filter out only the tokens that belong to PERSON entities
persons = df[df['ent_type'] == 'PERSON']
person_names = persons['text'].unique()

# Group adjective vectors by the text of the token each adjective attaches to
# (its dependency head). One pass over the document replaces a full re-scan of
# the document for every distinct person name.
# Matching is by head *text*, so any head token whose text equals a person name
# contributes, whether or not that particular occurrence was tagged PERSON.
wanted_heads = set(person_names)
adj_vectors_by_head = {}
for token in doc:
    if token.pos_ == 'ADJ':
        head_text = token.head.text
        if head_text in wanted_heads:
            adj_vectors_by_head.setdefault(head_text, []).append(token.vector)

# Keep only persons with at least one adjective, in first-appearance order
person_adj_vectors = {
    person: adj_vectors_by_head[person]
    for person in person_names
    if person in adj_vectors_by_head
}

# Compute the average adjective vector for each person
average_vectors = {person: sum(vectors) / len(vectors) for person, vectors in person_adj_vectors.items()}

# Prepare data for clustering (names[i] labels data[i])
data = list(average_vectors.values())
names = list(average_vectors.keys())

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Apply K-Means clustering.
# - n_clusters is capped at the number of persons, since KMeans rejects more
#   clusters than samples.
# - n_init is pinned because scikit-learn changed its default (10 -> 'auto'),
#   which would otherwise make the labels depend on the installed version.
n_clusters = min(5, len(names))
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(data_scaled)
clusters = kmeans.labels_

# Create a DataFrame with the results
results = pd.DataFrame({'person': names, 'cluster': clusters})

# Display the results
results

Unnamed: 0,person,cluster
0,Dursley,2
1,Dudley,2
2,Potter,2
3,Harry,2
4,Vernon,1
5,Petunia,1
6,Figg,2
7,’s,1
8,felt,1
9,one,2


In [6]:
import re

# Function to extract sentences containing character names
def extract_sentences(doc, characters):
    """Pair each sentence that mentions a character with the first matching name.

    Args:
        doc: Object exposing ``sents``, an iterable of sentence objects that
            each have a ``text`` string (for example a spaCy ``Doc``).
        characters: Iterable of character-name strings. Names are tried in
            this order and the first hit wins. Duplicates and empty strings
            are ignored.

    Returns:
        A list of ``(sentence_text, character)`` tuples in sentence order.
        Sentences that mention none of the names are omitted.
    """
    # De-duplicate while keeping first-seen order: the result is the same as
    # scanning the raw list, but each sentence is tested once per distinct
    # name rather than once per mention. Empty names are dropped because
    # "" is a substring of every sentence.
    unique_characters = [name for name in dict.fromkeys(characters) if name]

    sentences = []
    for sent in doc.sents:
        text = sent.text
        # Plain substring test, so "Dursley" also matches "Dursleys".
        match = next((name for name in unique_characters if name in text), None)
        if match is not None:
            sentences.append((text, match))
    return sentences

# Collect (sentence text, character) pairs for sentences that mention a character
sentences = extract_sentences(doc, characters)

# Tabulate the pairs; note this rebinds `df` to the sentence-level frame
sentence_columns = ["sentence", "character"]
df = pd.DataFrame(data=sentences, columns=sentence_columns)

In [7]:
# Peek at the first five sentence/character rows
df.head(n=5)

Unnamed: 0,sentence,character
0,"M r. and Mrs. Dursley, of number four, Privet ...",Dursley
1,Mr. Dursley was the director of a firm called ...,Dursley
2,Mrs. Dursley was thin and blonde and had nearl...,Dursley
3,The Dursleys had a small son called Dudley and...,Dursley
4,"The Dursleys had everything they wanted, but t...",Dursley


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the extracted sentences into a TF-IDF matrix (one row per sentence)
sentence_texts = df["sentence"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentence_texts)


In [9]:
from sklearn.cluster import KMeans

# Number of clusters (houses and muggles)
num_clusters = 5

# Cluster the TF-IDF sentence vectors and store each sentence's cluster id.
# n_init is pinned because scikit-learn changed its default (10 -> 'auto'),
# which would otherwise make the labels depend on the installed version.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

In [10]:
# Peek at the first five rows, now including the assigned cluster
df.head(n=5)

Unnamed: 0,sentence,character,Cluster
0,"M r. and Mrs. Dursley, of number four, Privet ...",Dursley,0
1,Mr. Dursley was the director of a firm called ...,Dursley,2
2,Mrs. Dursley was thin and blonde and had nearl...,Dursley,2
3,The Dursleys had a small son called Dudley and...,Dursley,2
4,"The Dursleys had everything they wanted, but t...",Dursley,1
