In [60]:
pip install vaderSentiment textstat nltk


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [61]:
import pandas as pd
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat


In [62]:
df=pd.read_csv("quotes.csv")

In [63]:
df.head()

Unnamed: 0,quotetext,character,title,author,year
0,Maybe 'okay' will be our 'always.,,'The Fault in Our Stars','John Green',2012
1,I wonder what Piglet is doing. I wish I were t...,Pooh,Winnie-the-Pooh,A.A. Milne,1926
2,"Anarchy means 'without leaders', not 'without ...",V,V for Vendetta,Alan Moore,1982
3,Everybody is special. Everybody. Everybody is ...,V,V for Vendetta,Alan Moore,1982
4,"Happiness is a prison, Evey. Happiness is the ...",V,V for Vendetta,Alan Moore,1982


### Extract Metadata

#### Numeric Features

In [64]:
df['quotelength'] = df['quotetext'].apply(lambda x: len(x))
df['quote_words'] = df['quotetext'].apply(lambda x: len(x.split()))
df['quote_sentences'] = df['quotetext'].apply(lambda x: x.count('.')+x.count('?')+x.count('!'))

In [65]:
df["characterlength"] = df["character"].fillna("").apply(lambda x: len(x))
df["character_words"] = df["character"].fillna("").apply(lambda x: len(x.split()))

In [66]:
df["quote_exclemation"] = df["quotetext"].apply(lambda x: x.count('!'))
df["quote_question"] = df["quotetext"].apply(lambda x: x.count('?'))

In [67]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question
1216,I guess this is a bad time to mention I hung a...,Katniss,Catching Fire,Suzanne Collins,2009,93,20,3,7,1,0,0
242,Who says stories cannot bring things to life?,,How to Seize a Dragon's Jewel,Cressida Cowell,2012,45,8,1,0,0,0,1
880,"'Tis strange, — but true; for truth is always ...",Narrator,Don Juan,Lord Byron,1823,77,13,1,8,1,0,0
663,But I can assure you that Lizzy does not lose ...,Mrs. Bennet,Pride and Prejudice,Jane Austen,1813,119,24,1,11,2,0,0
491,The praise of the praiseworthy is above all re...,,The Two Towers,J. R. R. Tolkien,1954,52,9,1,0,0,0,0
348,A person doesn't die when he should but when h...,Colonel Aureliano Buendía,One Hundred Years of Solitude,Gabriel García Márquez,1967,52,11,1,25,3,0,0


Sentiment analysis

In [68]:
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment scores
def get_sentiment_scores(text):
    scores = analyzer.polarity_scores(text)
    return scores['pos'], scores['neg'], scores['neu']

# Apply the function to each row
df['sentiment_positive'], df['sentiment_negative'], df['sentiment_neutral'] = zip(*df['quotetext'].apply(get_sentiment_scores))

df.sample()

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_positive,sentiment_negative,sentiment_neutral
35,Happiness has got to be paid for.,The Controller,Brave New World,Aldous Huxley,1932,33,7,1,14,2,0,0,0.375,0.0,0.625


Flesch reading ease

In [69]:
df['readability_score'] = df['quotetext'].apply(textstat.flesch_reading_ease)


In [70]:
def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words) if words else 0


df['lexical_diversity'] = df['quotetext'].apply(lexical_diversity)

In [71]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_positive,sentiment_negative,sentiment_neutral,readability_score,lexical_diversity
339,No amount of fire or freshness can challenge w...,,The Great Gatsby,F. Scott Fitzgerald,1925,91,18,1,0,0,0,0,0.228,0.191,0.581,87.05,1.0
277,"I had the epiphany that laughter was light, an...",,The Goldfinch,Donna Tartt,2013,113,21,1,0,0,0,0,0.252,0.0,0.748,84.0,0.714286
1170,You are exactly my brand of heroin.,Edward Cullen,Twilight,Stephenie Meyer,2005,35,7,1,13,2,0,0,0.0,0.348,0.652,81.29,1.0
1220,I'm going to stay right here and cause all kin...,Katniss,Catching Fire,Suzanne Collins,2009,69,14,2,7,1,0,0,0.0,0.172,0.828,99.57,1.0
265,The most dangerous enemy is that which no one ...,,Angels & Demons,Dan Brown,2000,52,10,1,0,0,1,0,0.13,0.537,0.333,78.25,1.0
0,Maybe 'okay' will be our 'always.,,'The Fault in Our Stars','John Green',2012,33,6,1,0,0,0,0,0.275,0.0,0.725,99.23,1.0
