In [73]:
pip install vaderSentiment textstat nltk


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [74]:
import pandas as pd
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat


In [75]:
# Load the quotes dataset into the working DataFrame.
df = pd.read_csv("quotes.csv")

In [76]:
# Preview the first five rows of the loaded data.
df.head(5)

Unnamed: 0,quotetext,character,title,author,year
0,Maybe 'okay' will be our 'always.,,'The Fault in Our Stars','John Green',2012
1,I wonder what Piglet is doing. I wish I were t...,Pooh,Winnie-the-Pooh,A.A. Milne,1926
2,"Anarchy means 'without leaders', not 'without ...",V,V for Vendetta,Alan Moore,1982
3,Everybody is special. Everybody. Everybody is ...,V,V for Vendetta,Alan Moore,1982
4,"Happiness is a prison, Evey. Happiness is the ...",V,V for Vendetta,Alan Moore,1982


### Extract Metadata

#### Numeric Features

In [77]:
# Size features of each quote, computed with pandas' vectorized string methods
# rather than per-row lambdas. A missing quote yields NaN instead of raising.

# Number of characters in the quote.
df['quotelength'] = df['quotetext'].str.len()
# Number of whitespace-separated tokens in the quote.
df['quote_words'] = df['quotetext'].str.split().str.len()
# Number of '.', '?' and '!' characters. Every mark is counted individually,
# so an ellipsis "..." contributes three to this rough sentence count.
df['quote_sentences'] = df['quotetext'].str.count(r'[.?!]')

In [78]:
# Length features of the speaking character's name; a missing name is treated
# as the empty string, so it scores 0 characters and 0 words.
df["characterlength"] = df["character"].fillna("").map(len)
df["character_words"] = df["character"].fillna("").map(lambda name: len(name.split()))

In [79]:
# Count exclamation and question marks per quote with the vectorized
# `.str.count` (the pattern is a regex, hence the escaped '?').
# NOTE(review): "quote_exclemation" is misspelled, but the column name is kept
# as-is because it ends up in the exported file's schema.
df["quote_exclemation"] = df["quotetext"].str.count('!')
df["quote_question"] = df["quotetext"].str.count(r'\?')

In [80]:
# Spot-check six random rows; the fixed seed keeps the displayed rows the same
# on every Restart & Run All.
df.sample(6, random_state=42)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question
722,That’s the thing about books. They let you tra...,,The Namesake,Jhumpa Lahiri,2003,75,13,2,0,0,0,0
217,To know who you are without any delusions or s...,Brom,Eragon,Christopher Paolini,2003,114,20,1,4,1,0,0
184,I am not so old now as I was then. Every morni...,Ramandu,The Voyage of the Dawn Treader,C. S. Lewis,1952,107,24,2,7,1,0,0
813,"Well! I've often seen a cat without a grin, bu...",Alice,Alice's Adventures in Wonderland,Lewis Carroll,1865,120,26,3,5,1,3,0
739,Some infinities are bigger than other infiniti...,Hazel Grace,The Fault in Our Stars,John Green,2012,88,14,2,11,2,0,0
1029,Confidence is slow in reposing itself in under...,,The Heroides,Ovid,1989,71,11,1,0,0,0,0


#### Sentiment analysis

In [81]:
# Shared analyzer instance, built once and reused for every quote.
analyzer = SentimentIntensityAnalyzer()


def get_sentiment_compound(text):
    """Return the 'compound' entry of the analyzer's polarity scores for `text`."""
    return analyzer.polarity_scores(text)['compound']

# Score every quote and store the compound sentiment in a new column.
df['sentiment_compound'] = df['quotetext'].apply(get_sentiment_compound)

# Spot-check one random row; the fixed seed keeps the displayed row the same
# on every Restart & Run All.
df.sample(random_state=42)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound
168,I sometimes think we must be all mad and that ...,,Dracula,Bram Stoker,1897,91,17,1,0,0,0,0,-0.4939


#### Flesch reading ease

In [82]:
# Score each quote with textstat's flesch_reading_ease.
df['readability_score'] = df['quotetext'].map(textstat.flesch_reading_ease)


In [83]:
def lexical_diversity(text, normalize=False):
    """Return the type-token ratio of ``text``: distinct words / total words.

    Args:
        text: The string to measure. Anything that is not a ``str`` (for
            example the NaN pandas uses for a missing value) is treated as
            having no words instead of raising.
        normalize: When True, lowercase each word and drop its
            non-alphanumeric characters before comparing, so that "The" and
            "the," count as the same word. Defaults to False, which compares
            the raw whitespace-separated tokens.

    Returns:
        A float between 0.0 and 1.0; 0.0 when there are no words.
    """
    if not isinstance(text, str):
        return 0.0
    words = text.split()
    if normalize:
        cleaned = ("".join(ch for ch in word.lower() if ch.isalnum()) for word in words)
        # Tokens made only of punctuation collapse to "" and are not words.
        words = [word for word in cleaned if word]
    if not words:
        return 0.0
    return len(set(words)) / len(words)


# Ratio of distinct to total words for every quote.
df['lexical_diversity'] = df['quotetext'].map(lexical_diversity)

In [84]:
# Spot-check six random rows with all derived features; the fixed seed keeps
# the displayed rows the same on every Restart & Run All.
df.sample(6, random_state=42)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound,readability_score,lexical_diversity
142,Human beings need loyalty. It does not necessa...,,Being Mortal,Atul Gawande,2014.0,186,31,2,0,0,0,0,0.5168,64.2,0.935484
617,"There is some good in this world, and it’s wor...",,The Two Towers,J.R.R. Tolkien,1954.0,62,12,1,0,0,0,0,0.3182,101.6,1.0
842,The chief difficulty Alice found at first was ...,Alice,Alice's Adventures in Wonderland,Lewis Carroll,,71,12,1,5,1,0,0,-0.34,59.3,1.0
859,She longed to feel something momentous. Someti...,,The Husband's Secret,Liane Moriarty,2013.0,76,12,2,0,0,0,0,0.0,82.31,1.0
339,No amount of fire or freshness can challenge w...,,The Great Gatsby,F. Scott Fitzgerald,1925.0,91,18,1,0,0,0,0,0.2263,87.05,1.0
168,I sometimes think we must be all mad and that ...,,Dracula,Bram Stoker,1897.0,91,17,1,0,0,0,0,-0.4939,79.6,0.941176


In [85]:
# Write the enriched table to disk, leaving the DataFrame index out of the file.
df.to_csv(path_or_buf="quotes_with_features.csv", index=False)