# This notebook will generate the data we will use for the NLP models

* The following code will parse the text files into individual sentences, so that each row holds one sentence

* Additionally, a `polarity score` from `vader` and from `TextBlob` (via the `spacytextblob` extension for `spaCy`) will be assigned to each sentence

* The data will then be ready for **labelling**: I will manually assign each sentence a score from 1 to 3 (negative, neutral, positive)

In [1]:
import spacy

# Not referenced by name below; presumably imported so that the 'spacytextblob'
# pipeline component is registered before nlp.add_pipe() is called -- confirm
# against the spacytextblob documentation before removing.
from spacytextblob.spacytextblob import SpacyTextBlob
import pandas as pd
import numpy as np
# Load the "en_core_web_sm" spaCy pipeline and append the TextBlob component,
# which supplies the `sent._.blob.polarity` value read in create_text_dataframe.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('spacytextblob')
from pathlib import Path

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single shared VADER analyzer; its 'compound' score is read once per sentence.
sia = SentimentIntensityAnalyzer()

In [2]:
# Directory holding the raw MPR text files, resolved against the current working directory.
file_path = Path.cwd() / "mpr_text"

In [3]:
def read_process_text(file_name, encoding=None):
    """Read one MPR text file and run its contents through the spaCy pipeline.

    Parameters
    ----------
    file_name : str or os.PathLike
        File name, resolved relative to the module-level ``file_path`` directory.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        platform default, i.e. the behaviour of the one-argument call.

    Returns
    -------
    object
        Whatever ``nlp(...)`` returns for the file's text with every newline
        character removed.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # The context manager closes the handle even when read() fails, and releases
    # it *before* the (slow) pipeline call; previously an exception raised by
    # read() or nlp() skipped the explicit close() and leaked the handle.
    with open(file_path.joinpath(file_name), "r", encoding=encoding) as file:
        raw_text = file.read()

    # NOTE(review): newlines are deleted rather than replaced by a space, so two
    # words separated only by a line break are fused together. Kept unchanged to
    # preserve the existing output -- confirm this is intended.
    clean_text = raw_text.replace('\n', '')

    # Run the full spaCy pipeline (sentence splitting, TextBlob extension, ...).
    return nlp(clean_text)

def create_text_dataframe(doc):
    """Build a one-row-per-sentence DataFrame from a processed document.

    Parameters
    ----------
    doc : object
        Processed document exposing ``.sents``; each sentence must provide
        ``.text`` and ``._.blob.polarity``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``text`` (sentence string), ``class`` (all ``None``,
        left empty for a label to be filled in later), ``textblob_polarity``
        (``sent._.blob.polarity``) and ``vader_polarity`` (the ``'compound'``
        entry of ``sia.polarity_scores`` for the sentence text).
    """
    # Materialise the sentences once so both columns are built from the same spans.
    spans = list(doc.sents)
    texts = [span.text for span in spans]
    blob_scores = [span._.blob.polarity for span in spans]

    frame = pd.DataFrame(
        {
            'text': texts,
            'class': [None] * len(texts),
            'textblob_polarity': blob_scores,
        }
    )

    # VADER is scored on the sentence strings stored in the frame.
    frame['vader_polarity'] = [
        sia.polarity_scores(text)['compound'] for text in frame['text']
    ]
    return frame

In [4]:
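#### Create the text data files ####
# The loop below is commented out; uncomment it to (re)generate one unlabeled
# Excel file per MPR text file. It tries every calendar day in the range, and
# any date whose processing raises (e.g. no "mpr-YYYY-MM-DD.txt" file) is skipped.

# for date in pd.date_range(start="1995-05-15", end="2023-04-12"):
#     try:
#         doc = read_process_text(f"mpr-{date.strftime('%Y-%m-%d')}.txt")
#         df = create_text_dataframe(doc)
#         df.to_excel(Path.cwd().joinpath("Data", "mpr_data", f"MPR-{date.strftime('%Y-%m-%d')}-unlabeled.xlsx"), index=False)
#         print("Success:", date.strftime('%Y-%m-%d'))
#     except:
#         # NOTE: a bare `except` also hides genuine failures (e.g. a failed Excel
#         # write); consider narrowing it to `except FileNotFoundError:`.
#         pass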

## Testing 

**This section is leftover scratch work and can be ignored.**

In [5]:
# # Read in MPR text file
# f = open(file_path.joinpath("mpr-2023-04-12.txt"), "r")
# # Remove newline spacing
# clean_txt = f.read().replace('\n', '')
# doc = nlp(clean_txt) # send it to spaCy to process

# f.close() # close MPR text file

# sentence_list = []

# for sent in doc.sents:
#     sentence_list.append(sent.text)
    
# pd.Series(sentence_list).to_excel("test.xlsx", sheet_name="sheet1")