In [27]:
pip install vaderSentiment textstat nltk


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [28]:
import pandas as pd
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat


In [29]:
# Load the quotes dataset (path is relative to the notebook's working directory).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a
# previously saved index -- confirm whether it should be read with index_col=0.
df = pd.read_csv(filepath_or_buffer="quotes.csv")

In [30]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,quotetext,character,title,author,year
0,Maybe 'okay' will be our 'always.,,'The Fault in Our Stars','John Green',2012
1,I wonder what Piglet is doing. I wish I were t...,Pooh,Winnie-the-Pooh,A.A. Milne,1926
2,Be sure thy sin will find thee out.,,And Then There Were None,Agatha Christie,1939
3,Best of an island is once you get there - you ...,,And Then There Were None,Agatha Christie,1939
4,"But no artist, I now realize, can be satisfied...",,And Then There Were None,Agatha Christie,1939


### Extract Metadata

#### Numeric Features

In [31]:
# Treat a missing quote as empty text (the same convention the notebook uses
# for the `character` column) so the counts are 0 instead of raising on NaN.
quote_text = df['quotetext'].fillna('')

# Vectorized string accessors replace the per-row Python lambdas.
df['quotelength'] = quote_text.str.len()              # number of characters
df['quote_words'] = quote_text.str.split().str.len()  # whitespace-separated tokens
# Rough sentence count: every '.', '?' or '!' counts as one terminator,
# so an ellipsis ('...') contributes three.
df['quote_sentences'] = quote_text.str.count(r'[.?!]')

In [32]:
# Quotes without an attributed character are NaN; replace them with "" so both
# features come out as 0 for those rows.
df["characterlength"] = df["character"].fillna("").apply(len)
df["character_words"] = df["character"].fillna("").apply(lambda name: len(name.split()))

In [33]:
# Treat a missing quote as empty text so it yields a count of 0 instead of raising.
quote_text = df["quotetext"].fillna("")

# NOTE: "quote_exclemation" (sic) -- the spelling is kept because the column
# name is written to the exported CSV at the end of the notebook.
df["quote_exclemation"] = quote_text.str.count("!")
# str.count takes a regular expression, so the '?' metacharacter must be escaped.
df["quote_question"] = quote_text.str.count(r"\?")

In [34]:
# Spot-check six random rows with the new numeric features
df.sample(n=6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question
411,No hunter of the sky should end his days as pr...,Saphira,Eragon,Christopher Paolini,2003,102,22,2,7,1,0,0
806,You never really understand a person until you...,,To Kill a Mockingbird,Harper Lee,1960,145,28,1,0,0,0,0
582,Perhaps as you went along you did learn someth...,,The Sun Also Rises,Ernest Hemingway,1926,88,18,2,0,0,0,0
524,"The island is ours. Here, in some way, we are ...",,We Were Liars,E. Lockhart,2014,60,12,2,0,0,0,0
253,"Reality is an absolute, existence is an absolu...",,Atlas Shrugged,Ayn Rand,1957,116,20,2,0,0,0,0
795,This has ever been the fate of energy in secur...,,The Time Machine,H.G. Wells,1895,118,23,1,0,0,0,0


#### Sentiment Analysis

In [35]:
analyzer = SentimentIntensityAnalyzer()


def get_sentiment_compound(text):
    """Return the 'compound' entry of the VADER polarity scores for ``text``."""
    return analyzer.polarity_scores(text)['compound']


# Score every quote and keep the compound value as a new column
df['sentiment_compound'] = df['quotetext'].apply(get_sentiment_compound)

df.sample(n=1)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound
1759,And then Edward and I continued blissfully int...,Bella Cullen,Breaking Dawn,Stephenie Meyer,2008,92,16,1,12,2,0,0,0.7227


#### Flesch Reading Ease

In [36]:
# Flesch reading-ease score of each quote, as computed by textstat
df["readability_score"] = df["quotetext"].apply(textstat.flesch_reading_ease)


In [37]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Tokens are produced by ``str.split()`` (whitespace only), so they are
    case- and punctuation-sensitive: ``"Love"``, ``"love"`` and ``"love."``
    count as three different tokens.

    Args:
        text: The text to measure. Non-string values (``None``, or the float
            ``NaN`` that stands in for a missing cell) are treated as empty
            text instead of raising ``AttributeError``.

    Returns:
        float: ``unique tokens / total tokens``, or ``0.0`` when there are
        no tokens.
    """
    if not isinstance(text, str):
        # Missing value: there is nothing to tokenise.
        return 0.0
    tokens = text.split()
    if not tokens:
        # Empty or whitespace-only text; avoid dividing by zero.
        return 0.0
    return len(set(tokens)) / len(tokens)


# Distinct-token ratio of every quote
df["lexical_diversity"] = df["quotetext"].apply(lexical_diversity)

In [38]:
# Spot-check six random rows with all engineered features
df.sample(n=6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound,readability_score,lexical_diversity
1862,Love is or it ain’t. Thin love ain’t love at all.,,Beloved,Toni Morrison,1987,49,11,2,0,0,0,0,0.9274,116.65,0.909091
744,A lot of people lacked that gift: knowing when...,,Gone Girl,Gillian Flynn,2012,59,12,1,0,0,0,0,-0.1531,93.14,1.0
117,"Everyone thinks I'm showing off when I talk, r...",Anne Frank,The Diary of a Young Girl,Anne Frank,1947,198,37,1,10,2,0,0,-0.7964,67.76,0.702703
1606,"We have everything we need to be happy, but we...",Guy Montag,Fahrenheit 451,Ray Bradbury,1953,81,14,2,10,2,0,0,-0.0811,65.73,0.928571
1407,"Life, with its rules, its obligations, and its...",,A Wrinkle in Time,Madeleine L’Engle,1962,140,25,1,0,0,0,0,0.3291,71.48,0.88
325,One of the most cowardly things ordinary peopl...,,The Voyage of the Dawn Treader,C.S. Lewis,1952,82,16,1,0,0,0,0,-0.4391,80.62,0.9375


In [39]:
# Persist the enriched frame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="quotes_with_features.csv", index=False)