In [14]:
pip install vaderSentiment textstat nltk


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
import math
import re

import pandas as pd
import textstat
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [16]:
# Load the raw quotes table.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests
# the CSV's first column has no header (possibly a saved index) -- confirm
# whether it should be kept as a feature column.
df = pd.read_csv(filepath_or_buffer="quotes.csv")

In [17]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,quotetext,character,title,author,year
0,Maybe 'okay' will be our 'always.,,'The Fault in Our Stars','John Green',2012
1,I wonder what Piglet is doing. I wish I were t...,Pooh,Winnie-the-Pooh,A.A. Milne,1926
2,Be sure thy sin will find thee out.,,And Then There Were None,Agatha Christie,1939
3,Best of an island is once you get there - you ...,,And Then There Were None,Agatha Christie,1939
4,"But no artist, I now realize, can be satisfied...",,And Then There Were None,Agatha Christie,1939


### Extract Metadata

#### Numeric Features

In [18]:
# Size features for each quote, computed with pandas' vectorised string
# accessors (a missing quote yields NaN instead of raising inside a lambda).

# Number of characters in the quote.
df['quotelength'] = df['quotetext'].str.len()

# Number of whitespace-separated tokens in the quote.
df['quote_words'] = df['quotetext'].str.split().str.len()

# Approximate sentence count: one per *run* of terminal punctuation, so an
# ellipsis ("...") or a combined "?!" closes a single sentence instead of being
# counted once per character. Still a heuristic: abbreviations and decimal
# points are counted too.
df['quote_sentences'] = df['quotetext'].str.count(r'[.?!]+')

In [19]:
# Speaker-name features. Rows with no recorded character are treated as an
# empty name, so both features are 0 for them.
df["characterlength"] = df["character"].fillna("").map(len)
df["character_words"] = df["character"].fillna("").map(lambda name: len(name.split()))

In [20]:
# Punctuation tallies: number of '!' and '?' characters in each quote.
df["quote_exclemation"] = df["quotetext"].map(lambda quote: quote.count('!'))
df["quote_question"] = df["quotetext"].map(lambda quote: quote.count('?'))

In [21]:
# Spot-check six randomly chosen rows with the new numeric columns.
df.sample(n=6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question
113,"Anyhow, I've learned one thing now. You only r...",Anne Frank,The Diary of a Young Girl,Anne Frank,1947,114,22,2,10,2,0,0
569,One learns from books and example only that ce...,,Children of Dune,Frank Herbert,1976,122,21,2,0,0,0,0
533,I shall live forever and ever and ever. I shal...,,The Secret Garden,Frances Hodgson Burnett,1911,91,17,2,0,0,0,0
1220,You’ve been here before. It won’t kill you. It...,,The Husband's Secret,Liane Moriarty,2013,108,19,3,0,0,0,0
85,What a fool I was not to tear my heart out on ...,Edmond,The Count of Monte Cristo,Alexandre Dumas,1844,87,20,1,6,1,1,0
1030,You are so busy being YOU that you have no ide...,Augustus Waters,The Fault in Our Stars,John Green,2012,82,16,1,15,2,0,0


#### Sentiment Analysis (VADER compound score)

In [22]:
# One analyzer instance, created once and shared by every call below.
analyzer = SentimentIntensityAnalyzer()


def get_sentiment_compound(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``.

    Args:
        text: The text passed to ``analyzer.polarity_scores``.

    Returns:
        The value stored under the ``'compound'`` key of the returned scores.
    """
    return analyzer.polarity_scores(text)['compound']

# Score every quote and store the result as a new column.
df['sentiment_compound'] = df['quotetext'].map(get_sentiment_compound)

# Show one random row to sanity-check the new column.
df.sample(n=1)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound
139,You can be lonely even when you are loved by m...,Anne Frank,The Diary of a Young Girl,Anne Frank,1947,105,21,1,10,2,0,0,0.34


#### Readability (Flesch Reading Ease)

In [23]:
# Readability of each quote, as computed by textstat's flesch_reading_ease.
df['readability_score'] = df['quotetext'].map(textstat.flesch_reading_ease)


In [24]:
def lexical_diversity(text):
    """Return the type-token ratio of ``text``: distinct words / total words.

    Words are the whitespace-separated tokens of ``text``, case-folded and
    stripped of leading/trailing punctuation before they are compared, so
    ``"The"``, ``"the"`` and ``"the,"`` count as a single word type. Punctuation
    inside a token (``don't``, ``well-known``) is kept. Tokens that consist of
    punctuation only are ignored.

    Args:
        text: The string to analyse.

    Returns:
        A float between 0 and 1; ``0.0`` when ``text`` contains no words.
    """
    # \W is Unicode-aware for str patterns, so curly quotes and dashes are
    # trimmed from token edges as well as ASCII punctuation.
    trimmed = (re.sub(r'^\W+|\W+$', '', token) for token in text.split())
    words = [token.casefold() for token in trimmed if token]
    return len(set(words)) / len(words) if words else 0.0


# Compute the lexical-diversity ratio for every quote.
df['lexical_diversity'] = df['quotetext'].map(lexical_diversity)

In [25]:
# Spot-check six randomly chosen rows with all engineered columns.
df.sample(n=6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound,readability_score,lexical_diversity
844,Your parents gave their lives to keep you aliv...,Professor Lupin,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,1999,81,16,2,15,2,0,0,-0.128,97.2,0.9375
503,You're an expatriate. You've lost touch with t...,,The Sun Also Rises,Ernest Hemingway,1926,200,33,6,0,0,0,0,-0.9201,82.81,0.909091
1352,All endings are also beginnings. We just don’t...,,The Five People You Meet In Heaven,Mitch Albom,2003,67,13,2,0,0,0,0,0.0,90.26,1.0
490,"I mistrust all frank and simple people, especi...",,The Sun Also Rises,Ernest Hemingway,1926,84,13,1,0,0,0,0,0.0,49.82,1.0
393,I'm the best and cleanest witch in Ingary.,Sophie,Howl's Moving Castle,Diana Wynne Jones,1986,42,8,1,6,1,0,0,0.4019,88.74,1.0
1684,Fear doesn't shut you down; it wakes you up.,Four,Divergent,Veronica Roth,2011,44,9,1,4,1,0,0,-0.4939,104.64,0.888889


In [26]:
# Save the enriched table; the DataFrame's own index is not written out.
df.to_csv(path_or_buf="quotes_with_features.csv", index=False)