In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
import stanza

In [None]:
# Download the Stanza English ('en') resources, then three NLTK resources.
# NOTE(review): the visible cells only call Stanza; the NLTK downloads look
# unused here — confirm before removing.
stanza.download('en')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
# Sample paragraph that is parsed into `beatles_p1` and used by the
# single-paragraph examples in the following cells.
Beatles_Paragraph = "Sgt. Pepper’s applied the concept of the symphony to rock and roll, adding an incredible (and soon overused) dimension to rock and roll. Nothing could have been more ambitious than the current release: The Beatles is the history and synthesis of Western music. And that, of course is what rock and roll is, and that is what the Beatles are. Rock and roll, the first successful art form of the McLuhan age, is a series of increasing hybrids of musical styles, starting from its basic hybrid of country and western music and black American music (blues, if you will). That merger represents the distantly effected marriage of the music of England and Africa, a yin and yang that could be infinitely extended."

In [None]:
# Build the English Stanza pipeline before its first use; without this the
# name `nlp` is undefined on a fresh kernel and this cell raises NameError.
# No processor list is passed, so Stanza loads its default English
# processors. NOTE(review): the later cells need lemma/POS, NER and sentiment
# output — confirm the default package of the installed Stanza version
# provides all of them.
nlp = stanza.Pipeline('en')

# Parse the sample paragraph into a Stanza Document.
beatles_p1 = nlp(Beatles_Paragraph)

In [None]:
def print_doc_info(doc):
    """
    - Parameters: doc (an object exposing sentences, num_tokens, num_words
      and entities, e.g. a Stanza Document)
    - Prints: the number of sentences, tokens, words and entities, one
      tab-separated "label: value" line each.
    """
    summary = [
        f"Num sentences:\t{len(doc.sentences)}",
        f"Num tokens:\t{doc.num_tokens}",
        f"Num words:\t{doc.num_words}",
        f"Num entities:\t{len(doc.entities)}",
    ]
    print("\n".join(summary))
    
def print_sentence_info(sentence):
    """
    - Parameters: sentence (an object exposing text, tokens, words and
      entities, e.g. a Stanza Sentence)
    - Prints: the sentence text followed by its token, word and entity
      counts, one line each.
    """
    token_count = len(sentence.tokens)
    word_count = len(sentence.words)
    entity_count = len(sentence.entities)
    print(
        f"Text: {sentence.text}",
        f"Num tokens:\t{token_count}",
        f"Num words:\t{word_count}",
        f"Num entities:\t{entity_count}",
        sep="\n",
    )

def print_token_info(token):
    """
    - Parameters: token (an object exposing text, start_char and end_char,
      e.g. a Stanza Token)
    - Prints: the token text and its start/end character values, one
      tab-separated line each.
    """
    for line in (
        f"Text:\t{token.text}",
        f"Start:\t{token.start_char}",
        f"End:\t{token.end_char}",
    ):
        print(line)
    
def print_word_info(word):
    """
    - Parameters: word (an object exposing text, lemma, upos and xpos,
      e.g. a Stanza Word)
    - Prints: the word text, lemma, UPOS tag and XPOS tag, one line each.
    """
    details = (
        f"Text:\t{word.text}",
        f"Lemma: \t{word.lemma}",
        f"UPOS: \t{word.upos}",
        f"XPOS: \t{word.xpos}",
    )
    print(*details, sep="\n")

In [None]:
# Example calls for the print helpers; uncomment a line to try that helper.
# print_doc_info(beatles_p1)
# print_sentence_info(beatles_p1.sentences[0])
# print_token_info(beatles_p1.sentences[0].tokens[2])
# Active example: the word at index 1 of the sentence at index 1.
print_word_info(beatles_p1.sentences[1].words[1])

In [None]:
def word_info_df(doc):
    """
    - Parameters: doc (a Stanza Document object)
    - Returns: A Pandas DataFrame object with one row for each word in
      doc (the entries of sentence.words, sentence by sentence), and
      columns for text, lemma, upos, and xpos. The four columns are present
      even when doc contains no words.
    """
    columns = ["text", "lemma", "upos", "xpos"]
    rows = [
        {column: getattr(word, column) for column in columns}
        for sentence in doc.sentences
        for word in sentence.words
    ]
    # Passing columns explicitly keeps the schema stable for empty input;
    # pd.DataFrame([]) alone would return a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)

# Show the per-word table for the sample paragraph (cell output).
word_info_df(beatles_p1)

In [None]:
def print_entity_info(entity):
    """
    - Parameters: entity (an object exposing text, type, start_char and
      end_char, e.g. an entry of a Stanza Document's entities list)
    - Prints: the entity text, its type and its start/end character values,
      one tab-separated line each.
    """
    report = [
        f"Text:\t{entity.text}",
        f"Type:\t{entity.type}",
        f"Start:\t{entity.start_char}",
        f"End:\t{entity.end_char}",
    ]
    print("\n".join(report))

In [None]:
# Print the details of the first entity found in the sample paragraph.
print_entity_info(beatles_p1.entities[0])


In [None]:
def sentiment_descriptor(sentence):
    """
    - Parameters: sentence (an object exposing a sentiment attribute,
      e.g. a Stanza Sentence)
    - Returns: "negative" when sentence.sentiment equals 0, "neutral" when
      it equals 1, and "positive" for any other value.
    """
    score = sentence.sentiment
    for code, label in ((0, "negative"), (1, "neutral")):
        if score == code:
            return label
    # Every value other than 0 and 1 falls through to "positive".
    return "positive"

# Print the sentiment label of the first sentence of the sample paragraph.
print(sentiment_descriptor(beatles_p1.sentences[0]))

# neutral  (presumably the output observed on an earlier run — confirm on re-run)

def sentence_sentiment_df(doc):
    """
    - Parameters: doc (a Stanza Document object)
    - Returns: A Pandas DataFrame with one row for each sentence in doc,
      and columns "text" (the sentence text) and "sentiment" (the label
      returned by sentiment_descriptor). Both columns are present even
      when doc has no sentences.
    """
    rows = [
        {"text": sentence.text, "sentiment": sentiment_descriptor(sentence)}
        for sentence in doc.sentences
    ]
    # Explicit columns keep the schema stable for an empty document;
    # pd.DataFrame([]) alone would return a frame with no columns.
    return pd.DataFrame(rows, columns=["text", "sentiment"])



In [None]:
# Show the per-sentence sentiment table for the sample paragraph (cell output).
sentence_sentiment_df(beatles_p1)


In [None]:
def load_text_doc(file_path, encoding="utf-8"):
    """
    - Parameters: file_path (path of the text file to read);
      encoding (text encoding used to decode the file, default "utf-8")
    - Returns: The full contents of the file as a single string.
    """
    # An explicit encoding avoids depending on the platform default, which
    # can mis-decode non-ASCII characters such as curly quotes.
    with open(file_path, encoding=encoding) as f:
        return f.read()

# NOTE(review): absolute, user-specific path — this cell only runs on a
# machine where the file exists at exactly this location.
Beatles_PATH = "/Users/hamza.ahmed/Python Apps/ML Projects/beatles.txt"
# Read the whole file into one string.
Beatles_text = load_text_doc(Beatles_PATH)
# Parse the full text with the same `nlp` callable used for the sample paragraph.
Beatles = nlp(Beatles_text)

In [None]:
def select_person_entities(doc):
    """
    - Parameters: doc (an object exposing an entities iterable,
      e.g. a Stanza Document)
    - Returns: A list, in original order, of the entities whose type
      equals "PERSON".
    """
    people = []
    for entity in doc.entities:
        if entity.type == "PERSON":
            people.append(entity)
    return people

In [None]:
def person_df(doc):
    """
    - Parameters: doc (a Stanza Document object)
    - Returns: A Pandas DataFrame with one row for each entity in doc
      that has a "PERSON" type, and columns text, type, start_char,
      end_char, and sentence_sentiment (the sentiment descriptor of the
      sentence in which the entity appears). All five columns are present
      even when doc contains no "PERSON" entities.
    """
    columns = ["text", "type", "start_char", "end_char", "sentence_sentiment"]
    rows = [
        {
            "text": person.text,
            "type": person.type,
            "start_char": person.start_char,
            "end_char": person.end_char,
            # Use the public `sent` accessor of the entity span instead of
            # reaching into the private `_sent` attribute.
            "sentence_sentiment": sentiment_descriptor(person.sent),
        }
        for person in select_person_entities(doc)
    ]
    # Explicit columns keep lookups such as df["text"] working when no
    # person entity was found; pd.DataFrame([]) alone has no columns.
    return pd.DataFrame(rows, columns=columns)


In [None]:

# Build the table of PERSON entity mentions for the full text and preview
# its first rows.
characters = person_df(Beatles)
display(characters.head())

In [None]:
def num_unique_items(df, col):
    """
    - Parameters: df (a Pandas DataFrame); col (name of a column in df)
    - Returns: The number of distinct values in df[col], as reported by
      the length of df[col].unique().
    """
    distinct_values = df[col].unique()
    return len(distinct_values)

# Count the distinct person names in the mentions table (cell output).
num_unique_items(characters, "text")

In [None]:
def frequency_count(df, col, limit=10):
    """
    - Parameters: df (a Pandas DataFrame); col (name of a column in df);
      limit (how many leading rows of the counts to keep, default 10)
    - Returns: A Pandas Series of value counts for df[col], truncated to
      its first `limit` entries.
    """
    counts = df[col].value_counts()
    top_counts = counts.head(limit)
    return top_counts

# Show the most frequently mentioned person names (default limit of 10).
frequency_count(characters, "text")

In [None]:
def sentiment_descriptor_to_val(descriptor):
    """
    - Parameters: descriptor (expected to be "negative", "neutral", or
      "positive")
    - Returns: -1 for "negative", 0 for "neutral", and 1 for any other
      value (which includes "positive").
    """
    if descriptor == "negative":
        return -1
    # Anything that is neither "negative" nor "neutral" is scored as 1.
    return 0 if descriptor == "neutral" else 1

def character_sentiment(df):
    """
    - Parameters: df (Pandas DataFrame)
    - df must contain "text" and "sentence_sentiment" columns, the latter
      holding sentiment descriptors.
    - Returns: A new Pandas DataFrame with one row per distinct "text"
      value and a "sentence_sentiment" column holding the sum of the
      numeric sentiment values of its rows, sorted ascending by that sum.
      The input df is not modified.
    """
    # Convert each descriptor to its numeric value (-1, 0 or 1).
    scores = [
        sentiment_descriptor_to_val(label)
        for label in df["sentence_sentiment"]
    ]
    totals = (
        df[["text"]]
        .assign(sentence_sentiment=scores)
        .groupby("text")
        .sum()
        .reset_index()
    )
    return totals.sort_values("sentence_sentiment")

# Sum the per-mention sentiment scores for each person name.
sentiment_df = character_sentiment(characters)

# character_sentiment returns rows sorted ascending by summed score, so the
# head holds the lowest totals and the tail holds the highest.
print("Characters in the most negative settings.")
display(sentiment_df.head(5))

print("Characters in the most positive settings")
display(sentiment_df.tail(5))