# ToGitOrNotToGit 💀

## EDA 🎭 `creators` dataset

1. Import libraries   
2. Load dataset   
3. Quick overview  
    - Distribution of Years (Birth & Death)   
    - Periods & Genres  
    - Iconic Creatures & Visual Motifs  
    - English Roots  
    - Major Works   

---

🎭 `creators.md` → who writes  
✨ `creatures.md` → who acts  
🗣️ `dark_stage.md` → where transgression unfolds  

In [None]:
# ------------------------------------------------------------------
# 1. Import libraries
# ------------------------------------------------------------------
import pandas as pd                 # === CORE EDA ===
import numpy as np

import matplotlib.pyplot as plt     # === VISUALIZATION ===
import seaborn as sns
import plotly.express as px

import textwrap                     # === TEXT / Light NLP ===
import re
from collections import Counter
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer    # === MACHINE LEARNING ===
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from tabulate import tabulate       # === ENHANCED DISPLAY ===
from rich import print as rprint

# === DISPLAY SETTING ===
# Reset matplotlib to its stock style, then layer seaborn's default theme on top.
plt.style.use("default")
sns.set_theme()


# ------------------------------------------------------------------
# 2. Load dataset
# ------------------------------------------------------------------
# Raw creators table, read from a path relative to the working directory.
df_creators = pd.read_csv(
    filepath_or_buffer="../../data/raw/creators_raw_dataset.csv",
)

In [None]:
# ------------------------------------------------------------------
# 3. Quick overview
# ------------------------------------------------------------------
# Dimensions of the loaded table as a (rows, columns) tuple.
dataset_shape = df_creators.shape
print("SHAPE :", dataset_shape)

In [None]:
# Print pandas' structural summary of the creators DataFrame.
df_creators.info()

In [None]:
# Show floats with two decimal places in all subsequent pandas output.
pd.options.display.float_format = lambda value: f"{value:.2f}"

# Summary statistics across every column, numeric or not.
df_creators.describe(include="all")

In [None]:
# Preview the first rows of the dataset (pandas' default row count).
df_creators.head()

In [None]:
# ------------------------------------------------------------------
# Distribution of Years (Birth & Death)
# ------------------------------------------------------------------
# === Birth and Death Year Distribution ===
# Two side-by-side histograms (10 bins, KDE overlay): births on the left,
# deaths on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    ("Birth", "skyblue", "BIRTH YEAR DISTRIBUTION"),
    ("Death", "salmon", "DEATH YEAR DISTRIBUTION"),
)

for ax, (column, shade, heading) in zip(axes, panels):
    sns.histplot(df_creators[column], bins=10, kde=True, color=shade, ax=ax)
    ax.set_title(heading)
    ax.set_xlabel("Year")
    ax.set_ylabel("Count")

fig.tight_layout()
plt.show()

In [None]:
# === Calculate Age at Death ===
# Lifespan of every author, drawn in order of birth year so that
# generational patterns are easy to spot.

# Persist the lifespan on the main frame: Death year minus Birth year.
df_creators['Age'] = df_creators['Death'].sub(df_creators['Birth'])

# Chronological order by birth year.
df_age_sorted = df_creators.sort_values(by='Birth')

# One viridis shade per bar, spread evenly over the colormap.
bar_colors = plt.cm.viridis(np.linspace(0, 1, len(df_age_sorted)))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(df_age_sorted['Author'], df_age_sorted['Age'], color=bar_colors)
ax.set_xlabel("Age")
ax.set_ylabel("Author")
ax.set_title("AUTHORS' AGE at DEATH (sorted by Birth Year)")
ax.invert_yaxis()  # First row (earliest birth year) at the top
plt.show()

In [None]:
# === Create sorted list with lifespan ===
# Author / Birth / Death plus the derived Age, ordered by birth year.
df_age_sorted = (
    df_creators[['Author', 'Birth', 'Death']]
    .assign(Age=lambda frame: frame['Death'] - frame['Birth'])
    .sort_values(by='Birth')
)

display(df_age_sorted)

In [None]:
# The three rows with the smallest Age.
print("=== 3 YOUNGEST at DEATH ===")
youngest_three = df_age_sorted.nsmallest(n=3, columns='Age')
display(youngest_three)

In [None]:
# The three rows with the largest Age.
print("=== 3 OLDEST at DEATH ===")
oldest_three = df_age_sorted.nlargest(n=3, columns='Age')
display(oldest_three)

In [None]:
# === Dramatic, literary print ===
# One line per author: "Name (birth-death) / age / fate".
# The fate label depends on the age at death: under 30, over 70, or in between.
fate_columns = df_age_sorted[['Author', 'Birth', 'Death', 'Age']]

for author, birth, death, age in fate_columns.itertuples(index=False, name=None):
    if pd.isna(age):
        # A missing birth or death year gives a NaN age; NaN fails both
        # comparisons below and would otherwise be labelled a "measured life".
        fate = "❓ Lifespan unknown"
    elif age < 30:
        fate = "💀 Cut tragically short"
    elif age > 70:
        fate = "🌹 Blessed with longevity"
    else:
        fate = "🕯️ Lived a measured life"

    print(f"{author} ({birth}-{death}) / {age} / {fate}")

In [None]:
# === Sort authors by Birth and reset index ===
# After the reset, the positional index 0..n-1 doubles as the y coordinate
# of each author's row in the plot.
df_timeline = df_creators[['Author', 'Birth', 'Death']].copy()
df_timeline['Age'] = df_timeline['Death'] - df_timeline['Birth']
df_timeline = df_timeline.sort_values('Birth').reset_index(drop=True)

# === Plot Timeline ===
# One horizontal segment per author from birth year to death year, drawn in
# a single vectorised call instead of one plot/scatter call per row.
fig, ax = plt.subplots(figsize=(14, 8))
row_positions = df_timeline.index

ax.hlines(
    y=row_positions,
    xmin=df_timeline['Birth'],
    xmax=df_timeline['Death'],
    color='darkred',
    linewidth=4,
)

# zorder=3 keeps the end markers above the life lines; scatter's default
# zorder is lower than a line's, so the lines would otherwise cover them.
ax.scatter(df_timeline['Birth'], row_positions, color='green', s=50, zorder=3, label='Birth')
ax.scatter(df_timeline['Death'], row_positions, color='black', s=50, zorder=3, label='Death')

ax.set_yticks(range(len(df_timeline)))
ax.set_yticklabels(df_timeline['Author'])
ax.set_xlabel("Year")
ax.set_title("AUTHORS TIMELINE : Birth → Death")
ax.legend()        # Explain the green / black markers
ax.invert_yaxis()  # Earliest birth on top
plt.show()


In [None]:
# ------------------------------------------------------------------
# Periods & Genres
# ------------------------------------------------------------------
# === Split Periods and Genres into lists ===
# Each cell holds comma-separated labels; split on a comma followed by
# optional whitespace. The pattern is a raw string because '\s' is not a
# valid escape in a normal string literal (SyntaxWarning on Python 3.12+).
df_creators['Period_list'] = df_creators['Period'].str.split(r',\s*')
df_creators['Genre_list'] = df_creators['Genre'].str.split(r',\s*')

In [None]:
from collections import Counter               # Count occurrences

import matplotlib as mpl

# Emoji-capable font first (ships with Windows), then matplotlib's bundled
# default so other platforms still have a usable family to fall back on.
mpl.rcParams['font.family'] = ['Segoe UI Emoji', 'DejaVu Sans']

# === Flatten all periods & genres for counting ===
# explode() yields one row per list element; dropna() skips authors whose
# Period / Genre is missing (iterating over a NaN would raise TypeError).
all_periods = df_creators['Period_list'].explode().dropna().tolist()
all_genres = df_creators['Genre_list'].explode().dropna().tolist()

period_counts = Counter(all_periods)
genre_counts = Counter(all_genres)

# === Periods Bar Chart ===
# Bars appear in first-encounter order of the periods in the dataset.
plt.figure(figsize=(10,5))
colors = plt.cm.magma(np.linspace(0,1,len(period_counts)))
plt.bar(list(period_counts.keys()), list(period_counts.values()), color=colors)
plt.title("👑 AUTHORS by PERIOD (Elizabethan → Jacobean → Caroline)", fontsize=14)
plt.ylabel("Nber of Authors")
plt.xlabel("Period")
plt.show()

In [None]:
# Flatten all genres into a single list; dropna() skips authors without a
# genre (iterating over a NaN would raise TypeError).
all_genres = df_creators['Genre_list'].explode().dropna().tolist()

unique_genres = sorted(set(all_genres))       # Alphabetical list of distinct genres

print("🎭 GENRES :")
for genre in unique_genres:
    print("-", genre)

In [None]:
# === Genres Bar Chart ===
# One bar per genre, in first-encounter order, shaded along the cividis colormap.
plt.figure(figsize=(12,5))
colors = plt.cm.cividis(np.linspace(0,1,len(genre_counts)))
plt.bar(list(genre_counts.keys()), list(genre_counts.values()), color=colors)
plt.title("🎭 AUTHORS by GENRE", fontsize=14)
plt.ylabel("Count")
plt.xlabel("Genre")
plt.xticks(rotation=45, ha='right')   # Anchor rotated labels at their end so they sit under their bar
plt.tight_layout()                    # Keep the rotated labels inside the figure
plt.show()

In [None]:
# ------------------------------------------------------------------
# Iconic Creatures & Visual Motifs
# ------------------------------------------------------------------
# Split each comma-separated cell into a list (comma followed by optional
# whitespace). Raw strings are required: '\s' is not a valid escape in a
# normal string literal (SyntaxWarning on Python 3.12+).
df_creators['IconicCreatures_list'] = df_creators['Iconic Creatures'].str.split(r',\s*')
df_creators['VisualMotifs_list'] = df_creators['Visual Motifs'].str.split(r',\s*')

# Flatten to one entry per creature / motif; dropna() skips authors with a
# missing cell (iterating over a NaN would raise TypeError).
all_creatures = df_creators['IconicCreatures_list'].explode().dropna().tolist()
all_motifs = df_creators['VisualMotifs_list'].explode().dropna().tolist()

creature_counts = Counter(all_creatures)      # Frequency of each creature
motif_counts = Counter(all_motifs)            # Frequency of each motif

In [None]:
# === WordCloud for Creatures ===
# Word size follows the frequencies in creature_counts.
wc_creatures = WordCloud(width=800, height=400, background_color='black', colormap='Reds')
wc_creatures.generate_from_frequencies(creature_counts)

plt.figure(figsize=(12,6))
plt.imshow(wc_creatures, interpolation='bilinear')
plt.axis('off')
plt.title("✨ ICONIC CREATURES", fontsize=16)
plt.show()

In [None]:
# ------------------------------------------------------------------
# English Roots
# ------------------------------------------------------------------
# Number of authors per birthplace, most frequent first.
birthplace_counts = df_creators['Birthplace'].value_counts()

print("🏰 AUTHORS per BIRTHPLACE :")
print(birthplace_counts)

In [None]:
# Authors whose birthplace is exactly 'London', shown with period and genre.
born_in_london = df_creators['Birthplace'].eq('London')
london_authors = df_creators.loc[born_in_london]
london_authors.loc[:, ['Author', 'Period', 'Genre']]

In [None]:
# Bar chart of authors per birthplace, in the order of birthplace_counts.
plt.figure(figsize=(12,3))
colors = plt.cm.viridis(np.linspace(0,1,len(birthplace_counts)))
bars = plt.bar(birthplace_counts.index, birthplace_counts.values, color=colors)
plt.title("🏰 AUTHORS per BIRTHPLACE", fontsize=16)
plt.ylabel("Nber of Authors")
plt.xlabel("Birthplace")
plt.xticks(rotation=45, ha='right')   # Anchor rotated labels at their end so they sit under their bar
plt.show()

In [None]:
# ------------------------------------------------------------------
# Major Works
# ------------------------------------------------------------------
# Strip the asterisks from every title cell, then join all cells into one
# space-separated string for the word cloud.
titles_without_asterisks = df_creators['Major Works'].str.replace('*', '', regex=False)
titles_text = ' '.join(titles_without_asterisks)

wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate(titles_text)

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("MAJOR WORKS TITLES", fontsize=16)
plt.show()

In [None]:
# All title cells, asterisks removed, joined into one string.
titles_without_asterisks = df_creators['Major Works'].str.replace('*', '', regex=False)
titles_text = ' '.join(titles_without_asterisks)

# Lower-case the text and keep only runs of ASCII letters as tokens.
word_pattern = re.compile(r'\b[a-zA-Z]+\b')
tokens = word_pattern.findall(titles_text.lower())

# Function words ignored in the count ('s' and 'd' are what the tokenizer
# leaves behind from contractions such as "'s").
stopwords = {'a', 'and', 'd', 'of', 's', 'the'}

filtered_tokens = list(filter(lambda token: token not in stopwords, tokens))

# Frequency table of the remaining words and its ten most common entries.
filtered_counts = Counter(filtered_tokens)
top_filtered = filtered_counts.most_common(10)

print("Top thematic words in titles (stopwords removed):")
for word, count in top_filtered:
    print(f"{word}: {count}")

In [None]:
# Words that occur exactly twice across all titles.
words_2x = [
    token
    for token, occurrences in filtered_counts.items()
    if occurrences == 2
]

def has_2x_word(title):
    """Return True if *title* contains at least one word from ``words_2x``.

    Asterisks are removed and the title is lower-cased before it is
    tokenized into runs of ASCII letters; matching is on whole tokens.
    ``words_2x`` is read from the module-level namespace.
    """
    normalized = title.replace('*', '').lower()
    title_words = set(re.findall(r'\b[a-zA-Z]+\b', normalized))
    return not title_words.isdisjoint(words_2x)

# Keep the 'Major Works' cells for which has_2x_word returns True.
contains_2x_word = df_creators['Major Works'].apply(has_2x_word)
titles_with_2x = df_creators.loc[contains_2x_word, 'Major Works'].tolist()

print("TITLES CONTAINING WORDS WITH 2 OCCURRENCES :")
for matching_titles in titles_with_2x:
    print(matching_titles)

In [None]:
# Clean titles : remove asterisks
df_creators['Clean_Titles'] = df_creators['Major Works'].str.replace('*','',regex=False)

# Split titles into a list
df_creators['Titles_List'] = df_creators['Clean_Titles'].str.split(',')


def _mean_title_length(titles):
    """Return the mean stripped length of the strings in *titles*.

    Returns NaN when *titles* is not a non-empty list (e.g. the NaN left by
    a missing 'Major Works' cell) instead of raising.
    """
    if not isinstance(titles, list) or not titles:
        return np.nan
    return sum(len(title.strip()) for title in titles) / len(titles)


# Compute average length per title
df_creators['Avg_Title_Length'] = df_creators['Titles_List'].apply(_mean_title_length)

# Sort a view for plotting only, so that "longest on top" actually holds;
# df_creators keeps its original row order.
ranked_by_length = df_creators.sort_values('Avg_Title_Length', ascending=False)

plt.figure(figsize=(12,6))
plt.barh(ranked_by_length['Author'], ranked_by_length['Avg_Title_Length'],
         color=plt.cm.plasma(np.linspace(0,1,len(ranked_by_length))))
plt.xlabel("Avg Length of Major Works Titles")
plt.title("📜 AVERAGE TITLES LENGTH per AUTHOR")
plt.gca().invert_yaxis()  # longest on top
plt.show()

In [None]:
# Clean titles (remove asterisks) and split each cell on commas
df_creators['Clean_Titles'] = df_creators['Major Works'].str.replace('*','',regex=False)
df_creators['Titles_List'] = df_creators['Clean_Titles'].str.split(',')

# Flattened DataFrame with one row per individual title
titles_expanded = df_creators[['Author','Titles_List']].explode('Titles_List')
titles_expanded['Title'] = titles_expanded['Titles_List'].str.strip()

# Authors without any title leave a NaN here; drop them so they cannot
# appear among the "shortest" rows.
titles_expanded = titles_expanded.dropna(subset=['Title'])
titles_expanded['Title_Length'] = titles_expanded['Title'].str.len()

# Longest first
titles_sorted = titles_expanded.sort_values('Title_Length', ascending=False)

print("TOP 3 LONGEST TITLES :")
print(titles_sorted.head(3)[['Author','Title','Title_Length']])

print("\nTOP 3 SHORTEST TITLES :")
print(titles_sorted.tail(3)[['Author','Title','Title_Length']])

In [None]:
# Mean of the per-author average title length, grouped by birthplace and
# ordered from the longest average to the shortest.
grouped_avg = df_creators.groupby('Birthplace')['Avg_Title_Length'].mean().sort_values(ascending=False)

# Pass the axes explicitly instead of relying on pandas reusing the
# current matplotlib figure.
fig, ax = plt.subplots(figsize=(12,6))
grouped_avg.plot(kind='bar', ax=ax, color=plt.cm.cividis(np.linspace(0,1,len(grouped_avg))))
ax.set_ylabel("Avg Title Length per Title")
ax.set_title("🌍 BIRTHPLACE vs AVERAGE TITLE LENGTH")
plt.show()