In [81]:
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

import re
import nltk
import string
import spacy
import heapq

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from IPython.core.display import HTML
import matplotlib as plt
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [82]:
pip install pdfminer.six



In [83]:
text = StringIO()
with open('/content/Think Yourself Rich_.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)

    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, text, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

print(text.getvalue())

******ebook converter DEMO Watermarks*******

******ebook converter DEMO Watermarks*******

PRENTICE HALL PRESS
Published by the Penguin Group
Penguin Group (USA) Inc.
375 Hudson Street, New York, New York 10014, USA
Penguin Group (Canada), 90 Eglinton Avenue East, Suite 700, Toronto, Ontario M4P 2Y3, Canada (a division of
Pearson Penguin Canada Inc.)
Penguin Books Ltd., 80 Strand, London WC2R 0RL, England
Penguin Group Ireland, 25 St. Stephen’s Green, Dublin 2, Ireland (a division of Penguin Books Ltd.)
Penguin Group (Australia), 250 Camberwell Road, Camberwell, Victoria 3124, Australia (a division of Pearson
Australia Group Pty. Ltd.)
Penguin Books India Pvt. Ltd., 11 Community Centre, Panchsheel Park, New Delhi—110 017, India
Penguin Group (NZ), 67 Apollo Drive, Rosedale, North Shore 0632, New Zealand (a division of Pearson New Zealand
Ltd.)
Penguin Books (South Africa) (Pty.) Ltd., 24 Sturdee Avenue, Rosebank, Johannesburg 2196, South Africa

Penguin Books Ltd., Registered Offic

In [85]:
original_text=text.getvalue()
original_text



In [86]:
original_text = re.sub(r'\s+', ' ', original_text)
original_text



In [87]:
text = re.sub(r'[^\w\s]', '', original_text)
text



In [88]:
#loading the english language small model of spacy
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words
print(stopwords)

{'on', 'neither', 'otherwise', 'four', 'once', 'we', 'almost', 'n‘t', 'give', '‘s', '‘ve', 'am', 'serious', 'because', 'several', 'myself', 'around', 'next', 'however', 'nor', 'his', 'she', 'always', 'will', 'they', 'too', 'go', 'former', 'show', 'if', 'hundred', 'whereas', 'none', 'it', 'across', 'here', 'her', 'each', 'thereupon', 'say', 'sixty', 'per', 'well', 'part', 'along', 'him', 'might', 'mine', 'of', 'anyway', 'down', 'out', 'become', "'d", 'without', 'wherever', 'over', 'both', 'no', 'such', 'and', 'whom', 'within', 'anything', 'first', 'again', 'move', 'up', 'for', 'any', 'never', 'themselves', 'under', 'very', 'i', '’ll', 'another', 'anyone', 'between', 'has', 'side', '‘ll', 'only', 'us', 'a', 'them', 'becoming', 'two', 'would', 'herself', 'somehow', 'least', 'in', '’d', 'among', 'there', 'whence', 'indeed', "'ve", 'nothing', 'together', 'eight', 'three', 'towards', 'can', 'cannot', 'with', 'keep', 'else', 'often', 'also', 'became', 'fifty', 'this', 'using', 'or', 'made', '

In [89]:
def preprocess(text):
  formatted_text = text.lower()
  tokens = []
  for token in nltk.word_tokenize(formatted_text):
    tokens.append(token)
  #print(tokens)
  tokens = [word for word in tokens if word not in stopwords and word not in string.punctuation]
  formatted_text = ' '.join(element for element in tokens)

  return formatted_text

In [90]:
formatted_text = preprocess(text)
formatted_text



In [91]:
word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
word_frequency

FreqDist({'ebook': 189,
          'converter': 189,
          'demo': 189,
          'watermarks': 189,
          'prentice': 3,
          'hall': 3,
          'press': 5,
          'published': 1,
          'penguin': 15,
          'group': 14,
          'usa': 5,
          'inc': 5,
          '375': 2,
          'hudson': 2,
          'street': 7,
          'new': 85,
          'york': 8,
          '10014': 2,
          'canada': 6,
          '90': 2,
          'eglinton': 1,
          'avenue': 2,
          'east': 3,
          'suite': 1,
          '700': 1,
          'toronto': 1,
          'ontario': 1,
          'm4p': 1,
          '2y3': 1,
          'division': 4,
          'pearson': 3,
          'books': 19,
          'ltd': 7,
          '80': 2,
          'strand': 2,
          'london': 11,
          'wc2r': 2,
          '0rl': 2,
          'england': 7,
          'ireland': 6,
          '25': 8,
          'st': 8,
          'stephens': 1,
          'green': 2,
          '

In [92]:
highest_frequency = max(word_frequency.values())
highest_frequency

453

In [93]:
for word in word_frequency.keys():
  #print(word)
  word_frequency[word] = (word_frequency[word] / highest_frequency)
  word_frequency

In [94]:
sentence_list = nltk.sent_tokenize(original_text)
sentence_list

[' ******ebook converter DEMO Watermarks******* ******ebook converter DEMO Watermarks******* PRENTICE HALL PRESS Published by the Penguin Group Penguin Group (USA) Inc. 375 Hudson Street, New York, New York 10014, USA Penguin Group (Canada), 90 Eglinton Avenue East, Suite 700, Toronto, Ontario M4P 2Y3, Canada (a division of Pearson Penguin Canada Inc.) Penguin Books Ltd., 80 Strand, London WC2R 0RL, England Penguin Group Ireland, 25 St. Stephen’s Green, Dublin 2, Ireland (a division of Penguin Books Ltd.) Penguin Group (Australia), 250 Camberwell Road, Camberwell, Victoria 3124, Australia (a division of Pearson Australia Group Pty.',
 'Ltd.) Penguin Books India Pvt.',
 'Ltd., 11 Community Centre, Panchsheel Park, New Delhi—110 017, India Penguin Group (NZ), 67 Apollo Drive, Rosedale, North Shore 0632, New Zealand (a division of Pearson New Zealand Ltd.) Penguin Books (South Africa) (Pty.)',
 'Ltd., 24 Sturdee Avenue, Rosebank, Johannesburg 2196, South Africa Penguin Books Ltd., Registe

In [95]:
sentence_new=[]
for i in range(len(sentence_list)):
    if int(len(sentence_list[i]))<500:
        sentence_new.append(sentence_list[i])
sentence_new        

['Ltd.) Penguin Books India Pvt.',
 'Ltd., 11 Community Centre, Panchsheel Park, New Delhi—110 017, India Penguin Group (NZ), 67 Apollo Drive, Rosedale, North Shore 0632, New Zealand (a division of Pearson New Zealand Ltd.) Penguin Books (South Africa) (Pty.)',
 'Ltd., 24 Sturdee Avenue, Rosebank, Johannesburg 2196, South Africa Penguin Books Ltd., Registered Offices: 80 Strand, London WC2R 0RL, England While the author has made every effort to provide accurate telephone numbers and Internet addresses at the time of publication, neither the publisher nor the author assumes any responsibility for errors, or for changes that occur after publication.',
 'Further, the publisher does not have any control over and does not assume any responsibility for author or third-party websites or their content.',
 'Copyright © 2001 by Penguin Group (USA) Inc. Cover design by Ben Gibson All rights reserved.',
 'No part of this book may be reproduced, scanned, or distributed in any printed or electronic 

In [96]:
len(sentence_new)

4332

In [97]:
score_sentences = {}
for sentence in sentence_new:
  #print(sentence)
  for word in nltk.word_tokenize(sentence.lower()):
    #print(word)
    if sentence not in score_sentences.keys():
      score_sentences[sentence] = word_frequency[word]
    else:
      score_sentences[sentence] += word_frequency[word]
score_sentences

{'Ltd.) Penguin Books India Pvt.': 0.08609271523178808,
 'Ltd., 11 Community Centre, Panchsheel Park, New Delhi—110 017, India Penguin Group (NZ), 67 Apollo Drive, Rosedale, North Shore 0632, New Zealand (a division of Pearson New Zealand Ltd.) Penguin Books (South Africa) (Pty.)': 0.8388520971302431,
 'Ltd., 24 Sturdee Avenue, Rosebank, Johannesburg 2196, South Africa Penguin Books Ltd., Registered Offices: 80 Strand, London WC2R 0RL, England While the author has made every effort to provide accurate telephone numbers and Internet addresses at the time of publication, neither the publisher nor the author assumes any responsibility for errors, or for changes that occur after publication.': 0.5253863134657837,
 'Further, the publisher does not have any control over and does not assume any responsibility for author or third-party websites or their content.': 0.05518763796909492,
 'Copyright © 2001 by Penguin Group (USA) Inc. Cover design by Ben Gibson All rights reserved.': 0.10816777041

In [98]:
best_sentences = heapq.nlargest(int(len(sentence_new) * 0.2), score_sentences, key = score_sentences.get)
best_sentences

['******ebook converter DEMO Watermarks******* *The Power of Your Subconscious Mind by Joseph Murphy, revised and expanded by Ian McMahan.',
 'Reverse it immediately in your mind by affirming, “God is my instant and everlasting supply, and that bill is paid in Divine order.” If a negative thought comes to you 50 times in one hour, reverse it each time by thinking, “God is my instant supply, meeting that need right now.” After a while, the thought of financial lack will lose all momentum and yon will find your subconscious is being conditioned to wealth.',
 'The secret of health, wealth, and outstanding achievement is in developing what is called the “quiet mind.” By taking certain constructive words from the Bible, which represent ******ebook converter DEMO Watermarks******* the eternal truths of God and His Law, your mind becomes anchored on the Supreme Presence, which responds as you call upon it, and you experience the riches of the quiet mind.',
 'Once you gain the ability to tap y

In [99]:
len(best_sentences)

866

In [100]:
print('-----------------------------------------SUMMARY BASED ON IMPORTANCE OF SENTENCE-------------------------------------')
summary = ' '.join(best_sentences)
summary

-----------------------------------------SUMMARY BASED ON IMPORTANCE OF SENTENCE-------------------------------------


'******ebook converter DEMO Watermarks******* *The Power of Your Subconscious Mind by Joseph Murphy, revised and expanded by Ian McMahan. Reverse it immediately in your mind by affirming, “God is my instant and everlasting supply, and that bill is paid in Divine order.” If a negative thought comes to you 50 times in one hour, reverse it each time by thinking, “God is my instant supply, meeting that need right now.” After a while, the thought of financial lack will lose all momentum and yon will find your subconscious is being conditioned to wealth. The secret of health, wealth, and outstanding achievement is in developing what is called the “quiet mind.” By taking certain constructive words from the Bible, which represent ******ebook converter DEMO Watermarks******* the eternal truths of God and His Law, your mind becomes anchored on the Supreme Presence, which responds as you call upon it, and you experience the riches of the quiet mind. Once you gain the ability to tap your subconsci

In [101]:
#clean summary
# Define a function to clean the text
def clean(summary):
    # Removes all special characters and numericals leaving the alphabets
    text = re.sub('[^A-Za-z]+', ' ', summary) 
    return summary
print(summary)

******ebook converter DEMO Watermarks******* *The Power of Your Subconscious Mind by Joseph Murphy, revised and expanded by Ian McMahan. Reverse it immediately in your mind by affirming, “God is my instant and everlasting supply, and that bill is paid in Divine order.” If a negative thought comes to you 50 times in one hour, reverse it each time by thinking, “God is my instant supply, meeting that need right now.” After a while, the thought of financial lack will lose all momentum and yon will find your subconscious is being conditioned to wealth. The secret of health, wealth, and outstanding achievement is in developing what is called the “quiet mind.” By taking certain constructive words from the Bible, which represent ******ebook converter DEMO Watermarks******* the eternal truths of God and His Law, your mind becomes anchored on the Supreme Presence, which responds as you call upon it, and you experience the riches of the quiet mind. Once you gain the ability to tap your subconscio

In [102]:
# Remove punctuation
summary = re.sub(r'[^\w\s]', '', summary)
summary

'ebook converter DEMO Watermarks The Power of Your Subconscious Mind by Joseph Murphy revised and expanded by Ian McMahan Reverse it immediately in your mind by affirming God is my instant and everlasting supply and that bill is paid in Divine order If a negative thought comes to you 50 times in one hour reverse it each time by thinking God is my instant supply meeting that need right now After a while the thought of financial lack will lose all momentum and yon will find your subconscious is being conditioned to wealth The secret of health wealth and outstanding achievement is in developing what is called the quiet mind By taking certain constructive words from the Bible which represent ebook converter DEMO Watermarks the eternal truths of God and His Law your mind becomes anchored on the Supreme Presence which responds as you call upon it and you experience the riches of the quiet mind Once you gain the ability to tap your subconscious mind you will never want for any good thing all 

In [103]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(summary)
 
filtered_summary = [w for w in word_tokens if not w.lower() in stop_words]
filtered_summary = []
for w in word_tokens:
    if w not in stop_words:
        filtered_summary.append(w)
print(word_tokens)
print(filtered_summary)

['ebook', 'converter', 'DEMO', 'Watermarks', 'The', 'Power', 'of', 'Your', 'Subconscious', 'Mind', 'by', 'Joseph', 'Murphy', 'revised', 'and', 'expanded', 'by', 'Ian', 'McMahan', 'Reverse', 'it', 'immediately', 'in', 'your', 'mind', 'by', 'affirming', 'God', 'is', 'my', 'instant', 'and', 'everlasting', 'supply', 'and', 'that', 'bill', 'is', 'paid', 'in', 'Divine', 'order', 'If', 'a', 'negative', 'thought', 'comes', 'to', 'you', '50', 'times', 'in', 'one', 'hour', 'reverse', 'it', 'each', 'time', 'by', 'thinking', 'God', 'is', 'my', 'instant', 'supply', 'meeting', 'that', 'need', 'right', 'now', 'After', 'a', 'while', 'the', 'thought', 'of', 'financial', 'lack', 'will', 'lose', 'all', 'momentum', 'and', 'yon', 'will', 'find', 'your', 'subconscious', 'is', 'being', 'conditioned', 'to', 'wealth', 'The', 'secret', 'of', 'health', 'wealth', 'and', 'outstanding', 'achievement', 'is', 'in', 'developing', 'what', 'is', 'called', 'the', 'quiet', 'mind', 'By', 'taking', 'certain', 'constructive'

In [104]:
# Removing stopwards
my_stop_words=stopwords.words('english')

sw_list = ['\x92','rt','ye','yeah','haha','Yes','U0001F923','I','The']
my_stop_words.extend(sw_list)

no_stop_tokens=[word for word in filtered_summary if not word in my_stop_words]
print(no_stop_tokens)

['ebook', 'converter', 'DEMO', 'Watermarks', 'Power', 'Your', 'Subconscious', 'Mind', 'Joseph', 'Murphy', 'revised', 'expanded', 'Ian', 'McMahan', 'Reverse', 'immediately', 'mind', 'affirming', 'God', 'instant', 'everlasting', 'supply', 'bill', 'paid', 'Divine', 'order', 'If', 'negative', 'thought', 'comes', '50', 'times', 'one', 'hour', 'reverse', 'time', 'thinking', 'God', 'instant', 'supply', 'meeting', 'need', 'right', 'After', 'thought', 'financial', 'lack', 'lose', 'momentum', 'yon', 'find', 'subconscious', 'conditioned', 'wealth', 'secret', 'health', 'wealth', 'outstanding', 'achievement', 'developing', 'called', 'quiet', 'mind', 'By', 'taking', 'certain', 'constructive', 'words', 'Bible', 'represent', 'ebook', 'converter', 'DEMO', 'Watermarks', 'eternal', 'truths', 'God', 'His', 'Law', 'mind', 'becomes', 'anchored', 'Supreme', 'Presence', 'responds', 'call', 'upon', 'experience', 'riches', 'quiet', 'mind', 'Once', 'gain', 'ability', 'tap', 'subconscious', 'mind', 'never', 'want

In [105]:
lower_words=[Text.lower() for Text in no_stop_tokens]
print(lower_words)

['ebook', 'converter', 'demo', 'watermarks', 'power', 'your', 'subconscious', 'mind', 'joseph', 'murphy', 'revised', 'expanded', 'ian', 'mcmahan', 'reverse', 'immediately', 'mind', 'affirming', 'god', 'instant', 'everlasting', 'supply', 'bill', 'paid', 'divine', 'order', 'if', 'negative', 'thought', 'comes', '50', 'times', 'one', 'hour', 'reverse', 'time', 'thinking', 'god', 'instant', 'supply', 'meeting', 'need', 'right', 'after', 'thought', 'financial', 'lack', 'lose', 'momentum', 'yon', 'find', 'subconscious', 'conditioned', 'wealth', 'secret', 'health', 'wealth', 'outstanding', 'achievement', 'developing', 'called', 'quiet', 'mind', 'by', 'taking', 'certain', 'constructive', 'words', 'bible', 'represent', 'ebook', 'converter', 'demo', 'watermarks', 'eternal', 'truths', 'god', 'his', 'law', 'mind', 'becomes', 'anchored', 'supreme', 'presence', 'responds', 'call', 'upon', 'experience', 'riches', 'quiet', 'mind', 'once', 'gain', 'ability', 'tap', 'subconscious', 'mind', 'never', 'want

In [106]:
ps=PorterStemmer()
stemmed_tokens=[ps.stem(word) for word in lower_words]

print(stemmed_tokens)

['ebook', 'convert', 'demo', 'watermark', 'power', 'your', 'subconsci', 'mind', 'joseph', 'murphi', 'revis', 'expand', 'ian', 'mcmahan', 'revers', 'immedi', 'mind', 'affirm', 'god', 'instant', 'everlast', 'suppli', 'bill', 'paid', 'divin', 'order', 'if', 'neg', 'thought', 'come', '50', 'time', 'one', 'hour', 'revers', 'time', 'think', 'god', 'instant', 'suppli', 'meet', 'need', 'right', 'after', 'thought', 'financi', 'lack', 'lose', 'momentum', 'yon', 'find', 'subconsci', 'condit', 'wealth', 'secret', 'health', 'wealth', 'outstand', 'achiev', 'develop', 'call', 'quiet', 'mind', 'by', 'take', 'certain', 'construct', 'word', 'bibl', 'repres', 'ebook', 'convert', 'demo', 'watermark', 'etern', 'truth', 'god', 'hi', 'law', 'mind', 'becom', 'anchor', 'suprem', 'presenc', 'respond', 'call', 'upon', 'experi', 'rich', 'quiet', 'mind', 'onc', 'gain', 'abil', 'tap', 'subconsci', 'mind', 'never', 'want', 'good', 'thing', 'life', 'whether', 'health', 'peac', 'mind', 'true', 'express', 'companionshi

In [107]:
nlp=spacy.load('en_core_web_sm')
doc=nlp(' '.join(lower_words))
print(doc)

ebook converter demo watermarks power your subconscious mind joseph murphy revised expanded ian mcmahan reverse immediately mind affirming god instant everlasting supply bill paid divine order if negative thought comes 50 times one hour reverse time thinking god instant supply meeting need right after thought financial lack lose momentum yon find subconscious conditioned wealth secret health wealth outstanding achievement developing called quiet mind by taking certain constructive words bible represent ebook converter demo watermarks eternal truths god his law mind becomes anchored supreme presence responds call upon experience riches quiet mind once gain ability tap subconscious mind never want good thing life whether health peace mind true expression companionship lovely home money need want want see also inner peace ebook converter demo watermarks meditation riches 153154 skepticism mind powers 12 skin problems love healing 175176 negative thoughts 5255 59 sleeping inner peace 228 s

In [108]:
lemmas=[token.lemma_ for token in doc]
print(lemmas)

['ebook', 'converter', 'demo', 'watermark', 'power', '-PRON-', 'subconscious', 'mind', 'joseph', 'murphy', 'revise', 'expand', 'ian', 'mcmahan', 'reverse', 'immediately', 'mind', 'affirm', 'god', 'instant', 'everlasting', 'supply', 'bill', 'pay', 'divine', 'order', 'if', 'negative', 'thought', 'come', '50', 'time', 'one', 'hour', 'reverse', 'time', 'think', 'god', 'instant', 'supply', 'meeting', 'need', 'right', 'after', 'think', 'financial', 'lack', 'lose', 'momentum', 'yon', 'find', 'subconscious', 'condition', 'wealth', 'secret', 'health', 'wealth', 'outstanding', 'achievement', 'develop', 'call', 'quiet', 'mind', 'by', 'take', 'certain', 'constructive', 'word', 'bible', 'represent', 'ebook', 'converter', 'demo', 'watermark', 'eternal', 'truth', 'god', '-PRON-', 'law', 'mind', 'become', 'anchor', 'supreme', 'presence', 'respond', 'call', 'upon', 'experience', 'rich', 'quiet', 'mind', 'once', 'gain', 'ability', 'tap', 'subconscious', 'mind', 'never', 'want', 'good', 'thing', 'life', 

In [109]:
clean_sum=' '.join(lemmas)
clean_sum

'ebook converter demo watermark power -PRON- subconscious mind joseph murphy revise expand ian mcmahan reverse immediately mind affirm god instant everlasting supply bill pay divine order if negative thought come 50 time one hour reverse time think god instant supply meeting need right after think financial lack lose momentum yon find subconscious condition wealth secret health wealth outstanding achievement develop call quiet mind by take certain constructive word bible represent ebook converter demo watermark eternal truth god -PRON- law mind become anchor supreme presence respond call upon experience rich quiet mind once gain ability tap subconscious mind never want good thing life whether health peace mind true expression companionship lovely home money nee want want see also inner peace ebook converter demo watermark meditation rich 153154 skepticism mind power 12 skin problem love heal 175176 negative thought 5255 59 sleep inner peace 228 sobriety choose 136137 143 socrates intui

In [110]:
#Sentiment analysis using TextBlob


from textblob import TextBlob

# function to calculate subjectivity 
def getSubjectivity(clean_sum):
    return TextBlob(clean_sum).sentiment.subjectivity

# function to calculate polarity
def getPolarity(clean_sum):
    return TextBlob(clean_sum).sentiment.polarity

# function to analyze the reviews
def analysis(score):
    if score < 0:
        return  '"Negative"'
    elif score == 0:
        return '"Neutral"'
    else:
        return '"Positive"'

In [111]:
subjectivity=getSubjectivity(clean_sum)
polarity=getPolarity(clean_sum)
analysis=analysis(polarity)

print(subjectivity)
print(polarity)

0.5165738219221511
0.22203234126627439


In [112]:
print( "The sentimental analysis using textblob is : ", analysis)

The sentimental analysis using textblob is :  "Positive"


In [113]:
pip install vaderSentiment



In [116]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def sentiment_scores(clean_sum):
# Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
# polarity_scores method of SentimentIntensityAnalyzer
# oject gives a sentiment dictionary.
# which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(clean_sum)
    print("Overall sentiment dictionary is : ", sentiment_dict)
    print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
    print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
    print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
    print("Sentence Overall Rated As", end = " ")
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >= 0.05 :
        print("Positive")
    elif sentiment_dict['compound'] <= - 0.05 :
        print("Negative")
    else :
        print("Neutral")
        
sentiment_scores(clean_sum)

Overall sentiment dictionary is :  {'neg': 0.052, 'neu': 0.504, 'pos': 0.444, 'compound': 1.0}
sentence was rated as  5.2 % Negative
sentence was rated as  50.4 % Neutral
sentence was rated as  44.4 % Positive
Sentence Overall Rated As Positive


In [117]:
import pandas as pd

In [118]:
book = pd.read_csv("/content/Think-Yourself-Rich.csv")
book

Unnamed: 0,sentences
0,******ebook converter DEMO Watermarks*******
1,PRENTICE HALL PRESS
2,Published by the Penguin Group
3,Penguin Group (USA) Inc.
4,"375 Hudson Street, New York, New York 10014, USA"
...,...
6015,Visit Penguin.com for more about this author a...
6016,their books.
6017,Discover your next great read!
6018,******ebook converter DEMO Watermarks*******


In [119]:
book.isnull().sum()

sentences    0
dtype: int64

In [120]:
book = book.dropna()
book.head()

Unnamed: 0,sentences
0,******ebook converter DEMO Watermarks*******
1,PRENTICE HALL PRESS
2,Published by the Penguin Group
3,Penguin Group (USA) Inc.
4,"375 Hudson Street, New York, New York 10014, USA"


In [121]:
import re #regular expression
import string

def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r"[0-9]+"," ",text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('[....]', '', text)
    #text = re.sub('\n','',text)
    return text

clean = lambda x: clean_text(x)

In [122]:
book['cleaned_text'] = book.sentences.apply(clean)
book.head()

Unnamed: 0,sentences,cleaned_text
0,******ebook converter DEMO Watermarks*******,ebook converter demo watermarks
1,PRENTICE HALL PRESS,prentice hall press
2,Published by the Penguin Group,published by the penguin group
3,Penguin Group (USA) Inc.,penguin group usa inc
4,"375 Hudson Street, New York, New York 10014, USA",hudson street new york new york usa


In [123]:
afinn = pd.read_csv('/content/Afinn.csv.xls', sep=',', encoding='latin-1')
afinn.head()

Unnamed: 0,word,value
0,abandon,-2
1,abandoned,-2
2,abandons,-2
3,abducted,-2
4,abduction,-2


In [124]:
affinity_scores = afinn.set_index('word')['value'].to_dict()
affinity_scores

{'abandon': -2,
 'abandoned': -2,
 'abandons': -2,
 'abducted': -2,
 'abduction': -2,
 'abductions': -2,
 'abhor': -3,
 'abhorred': -3,
 'abhorrent': -3,
 'abhors': -3,
 'abilities': 2,
 'ability': 2,
 'aboard': 1,
 'absentee': -1,
 'absentees': -1,
 'absolve': 2,
 'absolved': 2,
 'absolves': 2,
 'absolving': 2,
 'absorbed': 1,
 'abuse': -3,
 'abused': -3,
 'abuses': -3,
 'abusive': -3,
 'accept': 1,
 'accepted': 1,
 'accepting': 1,
 'accepts': 1,
 'accident': -2,
 'accidental': -2,
 'accidentally': -2,
 'accidents': -2,
 'accomplish': 2,
 'accomplished': 2,
 'accomplishes': 2,
 'accusation': -2,
 'accusations': -2,
 'accuse': -2,
 'accused': -2,
 'accuses': -2,
 'accusing': -2,
 'ache': -2,
 'achievable': 1,
 'aching': -2,
 'acquit': 2,
 'acquits': 2,
 'acquitted': 2,
 'acquitting': 2,
 'acrimonious': -3,
 'active': 1,
 'adequate': 1,
 'admire': 3,
 'admired': 3,
 'admires': 3,
 'admiring': 3,
 'admit': -1,
 'admits': -1,
 'admitted': -1,
 'admonish': -2,
 'admonished': -2,
 'adopt': 

In [125]:
#Custom function :score each word in a sentence in lemmatised form, 
#but calculate the score for the whole original sentence.
import spacy
nlp = spacy.load("en_core_web_sm")
sentiment_lexicon = affinity_scores

def calculate_sentiment(text: str = None):
    sent_score = 0
    if text:
        sentence = nlp(text)
        for word in sentence:
            sent_score += sentiment_lexicon.get(word.lemma_, 0)
    return sent_score

In [126]:
book['Sentiment_value'] = book['cleaned_text'].apply(calculate_sentiment)
book

Unnamed: 0,sentences,cleaned_text,Sentiment_value
0,******ebook converter DEMO Watermarks*******,ebook converter demo watermarks,0
1,PRENTICE HALL PRESS,prentice hall press,0
2,Published by the Penguin Group,published by the penguin group,0
3,Penguin Group (USA) Inc.,penguin group usa inc,0
4,"375 Hudson Street, New York, New York 10014, USA",hudson street new york new york usa,0
...,...,...,...
6015,Visit Penguin.com for more about this author a...,visit penguincom for more about this author an...,0
6016,their books.,their books,0
6017,Discover your next great read!,discover your next great read,3
6018,******ebook converter DEMO Watermarks*******,ebook converter demo watermarks,0


In [129]:
# This function will return sentiment category based on values in sentiment_value column

def sent_sentiment_cat(row):
    
    '''This function returns sentiment category based on values in sentiment_value column'''
    
    if row['Sentiment_value'] < 0:
        val = 'Negative'
    elif row['Sentiment_value'] == 0: 
        val = 'Neutral'
    elif row['Sentiment_value'] > 0:
        val = 'Positive'
    else:
        val = -1
    return val

In [130]:
#Applying the function in our new column
book['Sentiment_value'] = book.apply(sent_sentiment_cat, axis=1)
book

Unnamed: 0,sentences,cleaned_text,Sentiment_value
0,******ebook converter DEMO Watermarks*******,ebook converter demo watermarks,Neutral
1,PRENTICE HALL PRESS,prentice hall press,Neutral
2,Published by the Penguin Group,published by the penguin group,Neutral
3,Penguin Group (USA) Inc.,penguin group usa inc,Neutral
4,"375 Hudson Street, New York, New York 10014, USA",hudson street new york new york usa,Neutral
...,...,...,...
6015,Visit Penguin.com for more about this author a...,visit penguincom for more about this author an...,Neutral
6016,their books.,their books,Neutral
6017,Discover your next great read!,discover your next great read,Positive
6018,******ebook converter DEMO Watermarks*******,ebook converter demo watermarks,Neutral


In [132]:
book['Sentiment_value'].value_counts()

Positive    2662
Neutral     2553
Negative     805
Name: Sentiment_value, dtype: int64

In [234]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [236]:
vectorizer = TfidfVectorizer()
array = vectorizer.fit_transform(book['cleaned_text'].values.astype('U')).toarray()

In [237]:
text_df = pd.DataFrame(array,columns= vectorizer.get_feature_names())
text_df

Unnamed: 0,abandon,abandoned,abate,abdominal,abhor,abide,abiding,abilities,ability,able,...,your,youre,yours,yourself,yourselves,youth,youve,yucatan,zealand,élan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.228201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [238]:
text_df['Sentiment_value'] =  book['Sentiment_value']
text_df

Unnamed: 0,abandon,abandoned,abate,abdominal,abhor,abide,abiding,abilities,ability,able,...,youre,yours,yourself,yourselves,youth,youve,yucatan,zealand,élan,Sentiment_value
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
6016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
6017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Positive
6018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral


In [None]:
text_df.shape

In [239]:
text_df=text_df.drop_duplicates()
text_df

Unnamed: 0,abandon,abandoned,abate,abdominal,abhor,abide,abiding,abilities,ability,able,...,youre,yours,yourself,yourselves,youth,youve,yucatan,zealand,élan,Sentiment_value
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Positive
6014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
6015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral
6016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neutral


In [241]:
text_df.shape

(5619, 5897)

In [243]:
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()
text_df['Sentiment_value'] = encode.fit_transform(text_df['Sentiment_value'])

In [244]:
text_df

Unnamed: 0,abandon,abandoned,abate,abdominal,abhor,abide,abiding,abilities,ability,able,...,youre,yours,yourself,yourselves,youth,youve,yucatan,zealand,élan,Sentiment_value
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
6014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
6015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
6016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [245]:
X = text_df.drop(['Sentiment_value'],axis = 1)
y = text_df['Sentiment_value']

In [None]:
from sklearn.model_selection import train_test_split

In [246]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3, random_state=42)

In [247]:
X_train

Unnamed: 0,abandon,abandoned,abate,abdominal,abhor,abide,abiding,abilities,ability,able,...,your,youre,yours,yourself,yourselves,youth,youve,yucatan,zealand,élan
301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.164283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.154581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [248]:
X_test

Unnamed: 0,abandon,abandoned,abate,abdominal,abhor,abide,abiding,abilities,ability,able,...,your,youre,yours,yourself,yourselves,youth,youve,yucatan,zealand,élan
4318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.277297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [249]:
y_train

301     2
3729    2
4805    2
5340    2
2562    0
       ..
3954    1
5442    1
5479    2
5692    1
912     2
Name: Sentiment_value, Length: 3933, dtype: int64

In [250]:
y_test

4318    2
3223    1
102     2
2060    2
5746    2
       ..
5772    1
2864    1
4649    2
752     2
2574    2
Name: Sentiment_value, Length: 1686, dtype: int64

In [258]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train,y_train)
predictions_SVM = SVM.predict(X_test)
print("SVM Accuracy Score = ",accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score =  77.28351126927639
