In [40]:
pip install vaderSentiment textstat nltk


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [41]:
import pandas as pd
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat


In [42]:
df=pd.read_csv("quotes.csv")

In [43]:
df.head()

Unnamed: 0,quotetext,character,title,author,year
0,Maybe 'okay' will be our 'always.,,'The Fault in Our Stars','John Green',2012
1,I wonder what Piglet is doing. I wish I were t...,Pooh,Winnie-the-Pooh,A.A. Milne,1926
2,Be sure thy sin will find thee out.,,And Then There Were None,Agatha Christie,1939
3,Best of an island is once you get there - you ...,,And Then There Were None,Agatha Christie,1939
4,"But no artist, I now realize, can be satisfied...",,And Then There Were None,Agatha Christie,1939


### Extract Metadata

#### Numeric Features

In [44]:
df['quotelength'] = df['quotetext'].apply(lambda x: len(x))
df['quote_words'] = df['quotetext'].apply(lambda x: len(x.split()))
df['quote_sentences'] = df['quotetext'].apply(lambda x: x.count('.')+x.count('?')+x.count('!'))

In [45]:
df["characterlength"] = df["character"].fillna("").apply(lambda x: len(x))
df["character_words"] = df["character"].fillna("").apply(lambda x: len(x.split()))

In [46]:
df["quote_exclemation"] = df["quotetext"].apply(lambda x: x.count('!'))
df["quote_question"] = df["quotetext"].apply(lambda x: x.count('?'))

In [47]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question
1752,"Life is only precious because it ends, kid.",Leo Valdez,The Blood of Olympus,Rick Riordan,2014,43,8,1,10,2,0,0
1793,"Man is not truly one, but truly two.",,The Strange Case of Dr. Jekyll and Mr. Hyde,Robert Louis Stevenson,1886,36,8,1,0,0,0,0
973,It will happen tonight. The Dark Lord lies alo...,Professor Trelawney,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,1999,102,17,2,19,2,0,0
66,"Words can be like X-rays, if you use them prop...",Helmholtz Watson,Brave New World,Aldous Huxley,1932,110,20,2,16,2,0,0
767,Beautiful people spend time discovering what t...,,Love Warrior,Glennon Doyle,2016,242,43,2,0,0,0,0
379,Suffering has been stronger than all other tea...,Pip,Great Expectations,Charles Dickens,1861,166,31,2,3,1,0,0


Sentiment analysis

In [48]:
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment compound score
def get_sentiment_compound(text):
    score = analyzer.polarity_scores(text)
    return score['compound']

# Apply the function to each row
df['sentiment_compound'] = df['quotetext'].apply(get_sentiment_compound)

df.sample()

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound
139,You can be lonely even when you are loved by m...,Anne Frank,The Diary of a Young Girl,Anne Frank,1947,105,21,1,10,2,0,0,0.34


Flesch reading ease

In [49]:
df['readability_score'] = df['quotetext'].apply(textstat.flesch_reading_ease)


In [50]:
def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words) if words else 0


df['lexical_diversity'] = df['quotetext'].apply(lexical_diversity)

In [51]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound,readability_score,lexical_diversity
1572,"We dream in our waking moments, and walk in ou...",,The Scarlet Letter,Nathaniel Hawthorne,1850,54,11,1,0,0,0,0,0.25,94.15,0.818182
1864,Carlisle Cullen. Looking at him without that h...,Jacob Black,Breaking Dawn,Stephenie Meyer,2008,188,33,6,11,2,0,0,0.9464,88.43,0.878788
1278,All I know is this: nobody's very big in the f...,,One Flew Over the Cuckoo's Nest,Ken Kesey,1962,144,27,1,0,0,0,0,0.3612,77.91,0.962963
1725,People are more difficult to work with than ma...,Hephaestus,The Battle of the Labyrinth,Rick Riordan,2008,101,19,2,10,1,0,0,-0.4201,87.21,1.0
1069,"He was not handsome, and his manners required ...",,Sense and Sensibility,Jane Austen,1811,235,39,2,0,0,0,0,0.9191,51.68,0.820513
594,It's silly not to hope. It's a sin.,Santiago,The Old Man and the Sea,Ernest Hemingway,1952,35,8,2,8,1,0,0,-0.7101,109.72,0.875


In [52]:
df.to_csv("quotes_with_features.csv", index=False)