In [1]:
pip install vaderSentiment textstat nltk


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat


In [3]:
df=pd.read_csv("quotes.csv")

In [4]:
df.head()

Unnamed: 0,quotetext,character,title,author,year
0,"Behind every exquisite thing that existed, the...",,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890
1,Children begin by loving their parents; as the...,,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890
2,Every portrait that is painted with feeling is...,,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890
3,Experience is merely the name men gave to thei...,,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890
4,Mere words! How terrible they were! One could ...,,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890


In [5]:
import time

start_time = time.time()

### Extract Metadata

#### Numeric Features

In [6]:
df['quotelength'] = df['quotetext'].apply(lambda x: len(x))
df['quote_words'] = df['quotetext'].apply(lambda x: len(x.split()))
df['quote_sentences'] = df['quotetext'].apply(lambda x: x.count('.')+x.count('?')+x.count('!'))

In [7]:
df["characterlength"] = df["character"].fillna("").apply(lambda x: len(x))
df["character_words"] = df["character"].fillna("").apply(lambda x: len(x.split()))

In [8]:
df["quote_exclemation"] = df["quotetext"].apply(lambda x: x.count('!'))
df["quote_question"] = df["quotetext"].apply(lambda x: x.count('?'))

In [9]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question
1506,Oh! do not attack me with your watch. A watch ...,,Mansfield Park,Jane Austen,1814,113,25,4,0,0,1,0
2319,Experience is what you get when you didn't get...,Randy Pausch,The Last Lecture,Randy Pausch,2008,130,24,2,12,2,0,0
1132,"Yet so vain is man, and so blinded by his vanity",,The War of the Worlds,H.G. Wells,1897,48,11,0,0,0,0,0
3051,It is nothing to die; it is dreadful not to live.,,Les Misérables,Victor Hugo,1862,49,11,1,0,0,0,0
1617,When however small a measure of jealousy is mi...,,A Prayer for Owen Meany,John Irving,1989,109,18,1,0,0,0,0
944,One uses power by grasping it lightly. To gras...,,Children of Dune,Frank Herbert,1976,121,24,2,0,0,0,0


Sentiment analysis

In [10]:
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment compound score
def get_sentiment_compound(text):
    score = analyzer.polarity_scores(text)
    return score['compound']

# Apply the function to each row
df['sentiment_compound'] = df['quotetext'].apply(get_sentiment_compound)

df.sample()

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound
1788,So it goes.,,Slaughterhouse-Five,Kurt Vonnegut,1969,11,3,1,0,0,0,0,0.0


Flesch reading ease

In [11]:
df['readability_score'] = df['quotetext'].apply(textstat.flesch_reading_ease)


In [12]:
def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words) if words else 0


df['lexical_diversity'] = df['quotetext'].apply(lexical_diversity)

In [13]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound,readability_score,lexical_diversity
372,Where there is no experience the wise man is s...,,Dreams from My Father: A Story of Race and Inh...,Barack Obama,1995.0,52,10,1,0,0,0,0,0.2263,86.71,0.9
2475,"Sometimes, on a very clear night, if I is swig...",BFG,The BFG,Roald Dahl,1982.0,154,29,1,3,1,0,0,0.4391,67.42,0.827586
5,The basis of optimism is sheer terror.,,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890.0,38,7,1,0,0,0,0,0.0258,64.37,1.0
2512,"If I am the chief of sinners, I am the chief o...",,The Strange Case of Dr. Jekyll and Mr. Hyde,Robert Louis Stevenson,1886.0,63,14,1,0,0,0,0,-0.5267,82.65,0.642857
1146,"Great poetry, whether written in Greek or in E...",,The Story of My Life,Helen Keller,,105,17,1,0,0,0,0,0.8625,62.68,0.941176
1684,"He is taller by almost the breadth of my nail,...",Gulliver,Gulliver's Travels,Jonathan Swift,1726.0,128,26,1,8,1,0,0,-0.3612,78.93,0.884615


In [14]:
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 0.4616219997406006 seconds


In [15]:
df.to_csv("quotes_with_features.csv", index=False)