In [None]:
## for first script
!pip install pypdf2

## for second script
# installing the spanish model by spacy
# !python -m spacy download es_core_news_md
!spacy download es_core_news_md


In [None]:
# This script will extract the data from the pdf

import re
import PyPDF2

def extract_text(pdf_path="./data/main_data.pdf",
                 output_path="./files/transcript_clean.txt",
                 first_page=21, last_page=401):
    """Extract a page range of a PDF into a single whitespace-normalised text file.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the UTF-8 text file to (over)write.
        first_page: Zero-based index of the first page to extract.
        last_page: Zero-based index one past the last page to extract, so the
            defaults cover page indices 21 through 400.

    Returns:
        None. The cleaned text is written to ``output_path``.

    Page indices outside the document and a missing output directory are not
    handled here; the resulting errors propagate to the caller.
    """
    # PdfReader / .pages / .extract_text() are the current PyPDF2 names; the
    # legacy PdfFileReader / getPage / extractText were removed in PyPDF2 3.0.
    reader = PyPDF2.PdfReader(pdf_path)

    # Collect the per-page text and join once instead of growing a string
    # inside the loop. Line breaks are dropped without inserting a space.
    page_chunks = []
    for page_index in range(first_page, last_page):
        page_text = reader.pages[page_index].extract_text()
        page_chunks.append(page_text.replace("\n", ""))

    # Collapse every run of one or more spaces into a single space.
    text = re.sub(" +", " ", "".join(page_chunks))

    # Save the extracted text for the NLP step.
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(text)

# Entry point of the extraction step: run extract_text() with its defaults when
# this code is executed directly rather than imported.
if __name__ == "__main__" :
    extract_text()



In [None]:
import csv
import spacy

def loadModel():
  """Run the Spanish spaCy pipeline over the cleaned transcript and export the results.

  Reads ./files/transcript_clean.txt, processes it with the es_core_news_md
  model, then hands the resulting document to getTokens() and countPosNeg(),
  which write their CSV files. Returns None.
  """
  # Read the corpus inside a context manager so the file handle is closed
  # deterministically (the bare open(...).read() left it to the garbage collector).
  with open("./files/transcript_clean.txt", "r", encoding="utf-8") as corpus_file:
    text_corpus = corpus_file.read()

  # Load the Spanish model that was downloaded in the setup cell.
  nlp = spacy.load('es_core_news_md')

  # spaCy refuses texts longer than nlp.max_length (1,000,000 characters by
  # default). Raise the limit only when the corpus needs it; never lower it.
  nlp.max_length = max(nlp.max_length, len(text_corpus))

  # Pass the whole corpus through the NLP pipeline to obtain a processed document.
  doc = nlp(text_corpus)

  getTokens(doc)    # export one CSV row per token
  countPosNeg(doc)  # export one sentiment score per sentence

def getTokens(doc):
  """Write one CSV row per token of ``doc`` to ./files/tokens.csv.

  Each row holds the token's text, lowercased text, lemma, lowercased lemma,
  part-of-speech tag, and its is-alphabetic / is-stopword flags. The first row
  of the file is a header naming those columns. Returns None.
  """
  header = ["text", "text_lower", "lemma", "lemma_lower", "part_of_speech", "is_alphabet", "is_stopword"]

  # Build every row up front; the underscore-suffixed attributes are the
  # string forms of the corresponding token attributes.
  rows = [header]
  rows.extend(
    [tok.text, tok.lower_, tok.lemma_, tok.lemma_.lower(), tok.pos_, tok.is_alpha, tok.is_stop]
    for tok in doc
  )

  # newline="" lets the csv module control the line endings itself.
  with open("./files/tokens.csv", "w", encoding="utf-8", newline="") as out_file:
    writer = csv.writer(out_file)
    for row in rows:
      writer.writerow(row)

def countPosNeg(doc):
  """Score every sentence of ``doc`` by its positive and negative words.

  The word lists are read from ./data/positive_words_es.txt and
  ./data/negative_words_es.txt (one word per line). For each sentence whose
  text is longer than 10 characters, every token found in the positive list
  adds 1 and every token found in the negative list subtracts 1. The result is
  written to ./files/sentences.csv with the columns ``text`` and ``score``.

  Sentences of 10 characters or fewer are skipped entirely. Previously they
  were still appended, carrying the score of the preceding sentence, or raising
  UnboundLocalError when the very first sentence was short. Returns None.
  """
  # Load both lexicons as sets so each membership test is a constant-time lookup.
  with open("./data/positive_words_es.txt", "r", encoding="utf-8") as posWordsFile:
    positive_words = set(posWordsFile.read().splitlines())

  with open("./data/negative_words_es.txt", "r", encoding="utf-8") as negWordsFile:
    negative_words = set(negWordsFile.read().splitlines())

  # Header row first, then one [sentence text, score] row per scored sentence.
  scoreList = [["text", "score"]]

  for sentence in doc.sents:
    # Fragments of 10 characters or fewer are not treated as sentences.
    if len(sentence.text) <= 10:
      continue

    score = 0
    for word in sentence:
      lowered = word.lower_
      if lowered in positive_words:
        score += 1
      # Checked independently: a word present in both lists nets to zero.
      if lowered in negative_words:
        score -= 1

    scoreList.append([sentence.text, score])

  # Store all the scores in a CSV file.
  with open("./files/sentences.csv", "w", encoding="utf-8", newline="") as sentencesFile:
    csv.writer(sentencesFile).writerows(scoreList)

# Entry point of the NLP step: loadModel() reads the cleaned transcript, runs
# the spaCy pipeline and triggers both CSV exports.
if __name__ == "__main__" :
    loadModel()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def getWordsCount(tokensDF):
  """Print the number of alphabetic tokens and of distinct lemmas among them.

  Args:
    tokensDF: DataFrame with the columns ``text_lower``, ``lemma_lower`` and
      ``is_alphabet``.

  The lemma "programa" is counted as "programar" so that both spellings fall
  into one group. The merge is applied to a derived Series, so the caller's
  DataFrame is left untouched. Returns None.
  """
  # Normalised lemma column: same index as tokensDF, input frame not modified.
  lemma_col = tokensDF["lemma_lower"]
  lemmas = lemma_col.mask(lemma_col == "programa", "programar")

  # Only tokens flagged as alphabetic count as words.
  is_word = tokensDF["is_alphabet"] == True

  words = tokensDF.loc[is_word, "text_lower"].count()
  print("Words : ", words)

  unique_words = lemmas[is_word].nunique()
  print("Unique words : ", unique_words)

def plotWordCount(tokensDF):
  """Save a horizontal bar chart of the 20 most frequent lemmas.

  Args:
    tokensDF: DataFrame with the columns ``lemma_lower``, ``is_alphabet`` and
      ``is_stopword``.

  Only alphabetic, non-stopword lemmas longer than one character are counted,
  and "programa" is counted as "programar". The chart is written to
  ./images/words_count.png. Neither the caller's DataFrame nor matplotlib's
  global rcParams are modified. Returns None.
  """
  # Merge "programa" into "programar" on a derived Series (input left untouched).
  lemma_col = tokensDF["lemma_lower"]
  lemmas = lemma_col.mask(lemma_col == "programa", "programar")

  # Keep alphabetic, non-stopword lemmas of length > 1, then take the top 20.
  keep = (
    (tokensDF["is_alphabet"] == True)
    & (tokensDF["is_stopword"] == False)
    & (lemmas.str.len() > 1)
  )
  words = lemmas[keep].value_counts().head(20)

  # The x-axis spans 0-900 as before; if the most frequent word exceeds that,
  # extend the axis to the next multiple of 100 instead of clipping the bar.
  top_count = int(words.max()) if len(words) else 0
  x_max = max(900, -(-top_count // 100) * 100)

  background = "#fbddc8"
  text_colours = {
    "text.color": "black",
    "axes.labelcolor": "black",
    "xtick.color": "black",
    "ytick.color": "black",
  }

  # rc_context applies the colours to this figure only and restores the
  # previous rcParams on exit, so other plots are unaffected.
  with plt.rc_context(text_colours):
    fig, ax = plt.subplots(figsize=(11, 7))
    sns.barplot(x=words.values, y=words.index, ax=ax, palette="gist_heat_d", linewidth=0)

    ax.grid(False)                             # gridlines off
    ax.set_xlim(0, x_max)
    ax.set_xticks(range(0, x_max + 1, 100))    # one tick every 100 occurrences
    ax.set_facecolor(background)
    for side in ("bottom", "left", "right", "top"):
      ax.spines[side].set_color("black")       # border of the plot box

    ax.set_xlabel("Occurrences Count")
    ax.set_ylabel("Words")
    ax.set_title("Most Frequent Words")

    # Save and close this specific figure rather than "the current" one.
    fig.savefig("./images/words_count.png", facecolor=background)
    plt.close(fig)

def plotSentiment(sentencesDF):
  """Save a bar chart of the per-sentence sentiment scores.

  Args:
    sentencesDF: DataFrame with a numeric ``score`` column; its index is used
      as the sentence number on the x-axis.

  Only scores within [-10, 10] are plotted. Bars for scores below 0 are red and
  bars for scores of 0 or more are green. The chart is written to
  ./images/sentiment.png. matplotlib's global rcParams are left unchanged, so
  the white text used here does not leak into later figures. Returns None.
  """
  # Restrict the plot to scores between -10 and 10 (inclusive).
  in_range = sentencesDF[(sentencesDF["score"] >= -10) & (sentencesDF["score"] <= 10)]

  background = "#222831"
  text_colours = {
    "text.color": "white",
    "axes.labelcolor": "white",
    "xtick.color": "white",
    "ytick.color": "white",
  }

  # One hex colour string per bar: red for negative scores, green otherwise.
  colors = np.where(in_range["score"] >= 0, "#4ef037", "#fd0054")

  # rc_context scopes the white-on-dark colours to this figure and restores
  # the previous rcParams afterwards.
  with plt.rc_context(text_colours):
    fig, ax = plt.subplots(figsize=(11, 7))

    # y-axis ticks every 2 units from -12 up to 10.
    tick_values = np.arange(-12, 12, 2)
    ax.set_yticks(tick_values)
    ax.set_yticklabels([str(value) for value in tick_values])

    ax.bar(in_range.index, in_range["score"], color=colors, linewidth=0)

    ax.grid(False)                         # gridlines off
    ax.set_facecolor(background)
    for side in ("bottom", "left", "right", "top"):
      ax.spines[side].set_color("white")   # border of the plot box

    ax.set_xlabel("Sentence Number")
    ax.set_ylabel("Score")
    ax.set_title("Sentiment Analysis (Positive and Negative)")

    # Save and close this specific figure rather than "the current" one.
    fig.savefig("./images/sentiment.png", facecolor=background)
    plt.close(fig)

def overallSentiment(sentencesDF):
  """Print the summed score, the per-class sentence counts and an overall verdict.

  Only scores within [-10, 10] are considered. The verdict is "Negative" or
  "Positive" when that class has strictly more sentences than each of the
  other two classes; in every other case it is "Neutral". Returns None.
  """
  all_scores = sentencesDF["score"]
  scores = all_scores[all_scores.between(-10, 10)]

  print("Total (sum) score of the sentiment :", scores.sum(), end="\n\n")

  # Number of sentences per sentiment class.
  n_neutral = scores.eq(0).sum()
  n_positive = scores.gt(0).sum()
  n_negative = scores.lt(0).sum()
  print("Neutral count : {}\nPositive count : {}\nNegative count : {}\n".format(n_neutral, n_positive, n_negative))

  # A tie between positive and negative is reported as neutral.
  if n_negative == n_positive:
    print("Overall sentiment of PDF is : Neutral")
    return

  if n_negative > n_positive:
    # Negative wins only if it also outnumbers the neutral sentences.
    if n_negative <= n_neutral:
      print("Overall Sentiment of PDF is : Neutral")
    else:
      print("Overall Sentiment of PDF is : Negative")
    return

  # Remaining case: positive outnumbers negative.
  if n_positive <= n_neutral:
    print ("Overall sentiment of PDF is : Neutral")
  else:
    print("Overall sentiment of PDF is : Positive")

# Entry point of the reporting step: load the two CSV files produced by the NLP
# step, render both charts and print the overall sentiment summary.
if __name__ == "__main__" :
  # Load tokens.csv and sentences.csv into pandas DataFrames.
  # NOTE(review): read_csv uses pandas' default missing-value parsing, so a token
  # whose text is e.g. "null" or "nan" would load as missing - confirm this is acceptable.
  tokensDF = pd.read_csv("./files/tokens.csv")
  sentencesDF = pd.read_csv("./files/sentences.csv")

  ## getWordsCount(tokensDF) # Disabled: prints the total number of words and of unique words
  plotWordCount(tokensDF) # Bar chart of the most frequently used words
  plotSentiment(sentencesDF) # Bar chart of the per-sentence sentiment scores (positive minus negative word hits)
  overallSentiment(sentencesDF) # Prints the summed score, the class counts and the overall verdict

Total (sum) score of the sentiment : -3929

Neutral count : 1330
Positive count : 944
Negative count : 2604

Overall Sentiment of PDF is : Negative
