In [4]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the NLTK data packages this notebook relies on: the VADER lexicon
# (for SentimentIntensityAnalyzer) and the Punkt models (for sent_tokenize).
# These calls run on every execution of the cell; they are not commented out.
# NLTK itself must already be installed for the import below to succeed.
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')
from nltk import sent_tokenize

# Function to extract sentences and sentiments from an HTML file
def extract_sentences_and_sentiments(file_path):
    """Tokenize an HTML file's text into sentences and score each sentence.

    Args:
        file_path: Path of the HTML file to read (opened as UTF-8 text).

    Returns:
        A ``(sentences, sentiments)`` tuple of two equally long lists: the
        sentences produced by ``sent_tokenize`` on the text extracted by
        BeautifulSoup, and the ``'compound'`` polarity score of each sentence
        in the same order.

    Note:
        Uses the module-level ``sid`` analyzer, which must be defined before
        this function is called.
    """
    with open(file_path, 'r', encoding='utf-8') as html_file:
        # BeautifulSoup consumes the handle here, so the remaining work can
        # happen after the file has been closed.
        page_text = BeautifulSoup(html_file, 'html.parser').get_text()

    sentences = sent_tokenize(page_text)
    sentiments = []
    for sentence in sentences:
        scores = sid.polarity_scores(sentence)
        sentiments.append(scores['compound'])
    return sentences, sentiments

# Function to compare sentences and group them based on sentiment
def group_sentences(sentences_list, threshold=0.1):
    """Assign consecutive group IDs to a sequence of sentiment scores.

    Despite its name, ``sentences_list`` holds numeric scores (the caller
    passes the per-sentence compound sentiments), not sentence strings.
    Walking the scores in order, a new group is opened for the first score
    and whenever a score differs from the one immediately before it by more
    than ``threshold``; otherwise the score joins the current group.

    Args:
        sentences_list: Indexable sequence of numeric scores.
        threshold: Largest absolute difference between neighbouring scores
            that still keeps them in the same group. Defaults to 0.1.

    Returns:
        Dict mapping each position in ``sentences_list`` to its group ID.
        IDs start at 1 and increase by one per new group; an empty input
        yields an empty dict.
    """
    groups = {}
    # Start at 0 so the increment for the first element yields group 1
    # (starting at 1 made the first group come out as 2).
    group_id = 0

    for idx, score in enumerate(sentences_list):
        if idx == 0 or abs(sentences_list[idx - 1] - score) > threshold:
            group_id += 1
        groups[idx] = group_id

    return groups

# Path to the directory containing HTML files
directory_path = './'

# Initialize Sentiment Intensity Analyzer (read as a global by
# extract_sentences_and_sentiments)
sid = SentimentIntensityAnalyzer()

# Column layout of the result table; also used for the empty fallback below
result_columns = ['File_Name', 'Sentences', 'Group_ID']

# Build one frame per HTML file and concatenate once at the end instead of
# re-copying the growing DataFrame on every iteration.
file_frames = []

# Sorted so the row order does not depend on the arbitrary order of os.listdir
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith(".html"):
        continue
    file_path = os.path.join(directory_path, filename)
    sentences, sentiments = extract_sentences_and_sentiments(file_path)
    groups = group_sentences(sentiments)
    file_frames.append(
        pd.DataFrame({'File_Name': [filename] * len(sentences),
                      'Sentences': sentences,
                      'Group_ID': [groups[idx] for idx in range(len(sentences))]})
    )

# pd.concat raises on an empty list, so keep the empty-with-schema frame
# when no HTML files were found.
if file_frames:
    df = pd.concat(file_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=result_columns)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
# Write the combined per-sentence table to results.csv in the working directory.
# NOTE(review): `index` is left at its default, so the row index is presumably
# written as an unnamed first column - confirm whether index=False is wanted.
df.to_csv("./results.csv")