In [1]:
pip install vaderSentiment textstat nltk


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat


In [3]:
df=pd.read_csv("quotes.csv")

In [4]:
df.head()

Unnamed: 0,quotetext,character,title,author,year
0,"Behind every exquisite thing that existed, the...",,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890
1,Children begin by loving their parents; as the...,,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890
2,Every portrait that is painted with feeling is...,,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890
3,Experience is merely the name men gave to thei...,,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890
4,Mere words! How terrible they were! One could ...,,"""The Picture of Dorian Gray""","""Oscar Wilde""",1890


In [5]:
import time

start_time = time.time()

### Extract Metadata

#### Numeric Features

In [6]:
df['quotelength'] = df['quotetext'].apply(lambda x: len(x))
df['quote_words'] = df['quotetext'].apply(lambda x: len(x.split()))
df['quote_sentences'] = df['quotetext'].apply(lambda x: x.count('.')+x.count('?')+x.count('!'))

In [7]:
df["characterlength"] = df["character"].fillna("").apply(lambda x: len(x))
df["character_words"] = df["character"].fillna("").apply(lambda x: len(x.split()))

In [8]:
df["quote_exclemation"] = df["quotetext"].apply(lambda x: x.count('!'))
df["quote_question"] = df["quotetext"].apply(lambda x: x.count('?'))

In [9]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question
2116,"Stay gold, Ponyboy, stay gold.",,The Outsiders,S. E. Hinton,1967,30,5,1,0,0,0,0
2119,It was fun to see them playing or sunning them...,Karana,Island of the Blue Dolphins,Scott O'Dell,1960,134,27,2,6,1,0,0
1628,Then he kissed her so deeply and so completely...,,The Husband's Secret,Liane Moriarty,2013,82,16,1,0,0,0,0
2111,"Man is not truly one, but truly two.",,The Strange Case of Dr. Jekyll and Mr. Hyde,Robert Louis Stevenson,1886,36,8,1,0,0,0,0
1489,"Memories, even your most precious ones, fade s...",,Never Let Me Go,Kazuo Ishiguro,2005,155,27,3,0,0,0,0
1468,"Just as the habit does not make the monk, the ...",,Blindness,José Saramago,1995,77,16,1,0,0,0,0


Sentiment analysis

In [10]:
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment compound score
def get_sentiment_compound(text):
    score = analyzer.polarity_scores(text)
    return score['compound']

# Apply the function to each row
df['sentiment_compound'] = df['quotetext'].apply(get_sentiment_compound)

df.sample()

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound
1333,"It was evident that the gentleman, (completely...",Narrator,Persuasion,Jane Austen,1818,358,62,3,8,1,0,0,0.7506


Flesch reading ease

In [11]:
df['readability_score'] = df['quotetext'].apply(textstat.flesch_reading_ease)


In [12]:
def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words) if words else 0


df['lexical_diversity'] = df['quotetext'].apply(lexical_diversity)

In [13]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound,readability_score,lexical_diversity
1716,Why should I be afraid now? Strange men have c...,Don Vito Corleone,The Godfather,Mario Puzo,1969,94,20,1,17,3,0,1,-0.7579,103.63,0.95
2656,What light through yonder window breaks.,Romeo,Romeo and Juliet,William Shakespeare,1597,40,6,1,5,1,0,0,0.0,90.77,1.0
1809,It is as natural to die as it is to be born.,,Preludes & Nocturnes,Neil Gaiman,1989,44,12,1,0,0,0,0,-0.34,93.14,0.75
929,"People love talking, and I have never been a h...",,Gone Girl,Gillian Flynn,2012,129,25,2,0,0,0,0,0.2519,84.17,0.96
669,They who dream by day are cognizant of many th...,,The Tell-Tale Heart and Other Writings,Edgar Allan Poe,1843,94,18,1,0,0,0,0,0.5719,87.05,0.833333
1461,What is a country? A country is a piece of lan...,,Catch-22,Joseph Heller,1961,105,19,2,0,0,0,1,0.0,70.29,0.894737


In [14]:
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 0.4277019500732422 seconds


In [15]:
df.to_csv("quotes_with_features.csv", index=False)