In [66]:
pip install vaderSentiment textstat nltk


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [67]:
import pandas as pd
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textstat


In [68]:
df=pd.read_csv("quotes.csv")

In [69]:
df.head()

Unnamed: 0,quotetext,character,title,author,year
0,Maybe 'okay' will be our 'always.,,'The Fault in Our Stars','John Green',2012
1,I wonder what Piglet is doing. I wish I were t...,Pooh,Winnie-the-Pooh,A.A. Milne,1926
2,Be sure thy sin will find thee out.,,And Then There Were None,Agatha Christie,1939
3,Best of an island is once you get there - you ...,,And Then There Were None,Agatha Christie,1939
4,"But no artist, I now realize, can be satisfied...",,And Then There Were None,Agatha Christie,1939


In [70]:
import time

start_time = time.time()

### Extract Metadata

#### Numeric Features

In [71]:
df['quotelength'] = df['quotetext'].apply(lambda x: len(x))
df['quote_words'] = df['quotetext'].apply(lambda x: len(x.split()))
df['quote_sentences'] = df['quotetext'].apply(lambda x: x.count('.')+x.count('?')+x.count('!'))

In [72]:
df["characterlength"] = df["character"].fillna("").apply(lambda x: len(x))
df["character_words"] = df["character"].fillna("").apply(lambda x: len(x.split()))

In [73]:
df["quote_exclemation"] = df["quotetext"].apply(lambda x: x.count('!'))
df["quote_question"] = df["quotetext"].apply(lambda x: x.count('?'))

In [74]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question
1754,"Be cunning, and full of tricks, and your peopl...",,Watership Down,Richard Adams,1972,72,13,1,0,0,0,0
297,"She was made up of more, too. She was the book...",Francie,A Tree Grows in Brooklyn,Betty Smith,1943,72,16,2,7,1,0,0
431,It’s overwhelming. I feel as if I am living in...,Eragon,Eragon,Christopher Paolini,2003,98,19,2,6,1,0,0
1342,He knows that there's no better way in the wor...,,One Flew Over the Cuckoo's Nest,Ken Kesey,1962,148,28,1,0,0,0,0
2010,"But then... it used to be so simple, once upon...",,Witches Abroad,Terry Pratchett,1991,54,12,4,0,0,0,0
1163,"Depend upon it, you see but half. You see the ...",,Mansfield Park,Jane Austen,1814,479,84,3,0,0,0,0


Sentiment analysis

In [75]:
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment compound score
def get_sentiment_compound(text):
    score = analyzer.polarity_scores(text)
    return score['compound']

# Apply the function to each row
df['sentiment_compound'] = df['quotetext'].apply(get_sentiment_compound)

df.sample()

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound
1306,It echoed loudly within him because he was hol...,,Heart of Darkness,Joseph Conrad,1899,62,12,1,0,0,0,0,0.0


Flesch reading ease

In [76]:
df['readability_score'] = df['quotetext'].apply(textstat.flesch_reading_ease)


In [77]:
def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words) if words else 0


df['lexical_diversity'] = df['quotetext'].apply(lexical_diversity)

In [78]:
df.sample(6)

Unnamed: 0,quotetext,character,title,author,year,quotelength,quote_words,quote_sentences,characterlength,character_words,quote_exclemation,quote_question,sentiment_compound,readability_score,lexical_diversity
1252,When however small a measure of jealousy is mi...,,A Prayer for Owen Meany,John Irving,1989,109,18,1,0,0,0,0,-0.7783,53.21,0.944444
2041,"The walls of the cell fell away, the sky came ...",,In Cold Blood,Truman Capote,1966,78,17,1,0,0,0,0,0.0,96.52,0.882353
220,"Life, woman, life is God's most precious gift;...",,The Crucible,Arthur Miller,1953,108,18,1,0,0,0,0,0.8723,53.21,1.0
1086,It's the job that's never started as takes lon...,Sam,The Fellowship of the Ring,J.R.R. Tolkien,1954,61,11,1,3,1,0,0,0.0,77.23,1.0
982,"If a girl looks swell when she meets you, who ...",Holden Caulfield,The Catcher in the Rye,J.D. Salinger,1951,73,16,1,16,2,0,1,-0.4019,106.0,0.9375
1866,He was small for one who had lived so many sun...,Karana,Island of the Blue Dolphins,Scott O'Dell,1960,82,18,1,6,1,0,0,0.0,95.51,1.0


In [79]:
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 0.2808828353881836 seconds


In [80]:
df.to_csv("quotes_with_features.csv", index=False)