In [1]:
pip install vaderSentiment textstat nltk


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat


In [3]:
df=pd.read_csv("quotes.csv")

In [4]:
df.head()

Unnamed: 0,quotetext,character,title,author,year
0,Maybe 'okay' will be our 'always.,,'The Fault in Our Stars','John Green',2012
1,I wonder what Piglet is doing. I wish I were t...,Pooh,Winnie-the-Pooh,A.A. Milne,1926
2,"Anarchy means 'without leaders', not 'without ...",V,V for Vendetta,Alan Moore,1982
3,Everybody is special. Everybody. Everybody is ...,V,V for Vendetta,Alan Moore,1982
4,"Happiness is a prison, Evey. Happiness is the ...",V,V for Vendetta,Alan Moore,1982


### Extract Metadata

#### Numeric Features

In [5]:
df['quotelength'] = df['quotetext'].apply(lambda x: len(x))
df['quote_words'] = df['quotetext'].apply(lambda x: len(x.split()))
df['quote_sentences'] = df['quotetext'].apply(lambda x: x.count('.')+x.count('?')+x.count('!'))

In [6]:
df["characterlength"] = df["character"].fillna("").apply(lambda x: len(x))
df["character_words"] = df["character"].fillna("").apply(lambda x: len(x.split()))

In [7]:
df["quote_exclemation"] = df["quotetext"].apply(lambda x: x.count('!'))
df["quote_question"] = df["quotetext"].apply(lambda x: x.count('?'))

In [8]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question
901,You don't get to choose if you get hurt in thi...,Hazel Grace,The Fault in Our Stars,John Green,2012,117,25,5,11,2,0,0
1276,How did you die? - We er... drowned in a batht...,Percy Jackson and the Stoll brothers,The Titan's Curse,Rick Riordan,2007,93,22,7,36,6,0,2
908,Maybe everbody in the whole damn world is scar...,George,Of Mice and Men,John Steinbeck,1937,63,12,1,6,1,0,0
1241,Every man the image of every other; then all a...,Faber,Fahrenheit 451,Ray Bradbury,1953,102,20,1,5,1,0,0
1079,You can only be jealous of someone who has som...,,The Handmaid's Tale,Margaret Atwood,1985,90,17,1,0,0,0,0
496,The world was reduced to the surface of her sk...,,One Hundred Years of Solitude,Gabriel García Márquez,1967,97,19,1,0,0,0,0


Sentiment analysis

In [9]:
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment compound score
def get_sentiment_compound(text):
    score = analyzer.polarity_scores(text)
    return score['compound']

# Apply the function to each row
df['sentiment_compound'] = df['quotetext'].apply(get_sentiment_compound)

df.sample()

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound
1207,"If you wish to marry suitably, marry your equal.",,The Heroides,Ovid,1989,48,9,1,0,0,0,0,0.4019


Flesch reading ease

In [10]:
df['readability_score'] = df['quotetext'].apply(textstat.flesch_reading_ease)


In [11]:
def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words) if words else 0


df['lexical_diversity'] = df['quotetext'].apply(lexical_diversity)

In [12]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound,readability_score,lexical_diversity
1311,"When you can’t find someone to follow, you hav...",,Bad Feminist,Roxane Gay,2014,81,17,1,0,0,0,0,0.0,88.06,0.764706
1433,The silence depressed me. It wasn't the silenc...,,The Bell Jar,Sylvia Plath,1963,82,15,3,0,0,0,0,-0.5106,83.32,0.8
881,Because you are beautiful. I enjoy looking at ...,Augustus Waters,The Fault in Our Stars,John Green,2012,144,25,2,15,2,0,0,0.9426,58.79,0.96
1039,The drying up a single tear has more Of honest...,Narrator,Don Juan,Lord Byron,1823,79,16,1,8,1,0,0,0.7745,80.62,1.0
170,Sometimes I think the things I remember are mo...,,Memoirs of a Geisha,Arthur Golden,1997,76,15,1,0,0,0,0,0.0,81.63,0.733333
1423,"You know, you could live a thousand lifetimes ...",,Catching Fire,Suzanne Collins,2009,66,12,1,0,0,0,0,0.0,84.68,1.0


In [13]:
df.to_csv("quotes_with_features.csv", index=False)