In [None]:
import pandas as pd
from transformers import pipeline
from nltk.tokenize import sent_tokenize
import nltk

# Download the Punkt sentence-tokenizer data used by sent_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch both to keep sent_tokenize working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Authenticate and create the PyDrive2 client
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

# Sign the current Colab user in before any Drive access is attempted.
auth.authenticate_user()
gauth = GoogleAuth()
# Attach the application-default credentials to the PyDrive2 auth object.
gauth.credentials = GoogleCredentials.get_application_default()
# Module-level Drive client; load_excel_from_drive() and save_to_drive() read this global.
drive = GoogleDrive(gauth)

# Function to load an Excel file from Google Drive
def load_excel_from_drive(file_id, file_name):
    """Download a Google Drive file to disk and read it as a DataFrame.

    Args:
        file_id: Drive ID of the file to fetch.
        file_name: Local path the content is written to and then read from.

    Returns:
        The DataFrame produced by ``pd.read_excel(file_name)``.
    """
    drive_file = drive.CreateFile({'id': file_id})
    # Writes the remote content to the local path given by file_name.
    drive_file.GetContentFile(file_name)
    return pd.read_excel(file_name)

# Initialize the language model pipeline
def initialize_classifier(model_name="facebook/bart-large-mnli"):
    """Create a zero-shot-classification pipeline for the given model.

    Args:
        model_name: Model identifier handed to ``transformers.pipeline``.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    zero_shot = pipeline(task="zero-shot-classification", model=model_name)
    return zero_shot

# Function to classify sentences
def classify_sentence(sentence, classifier, labels, hypothesis_template, threshold=0.5):
    """Return True when the classifier scores any candidate label above ``threshold``.

    Args:
        sentence: Text to classify. Blank or whitespace-only text is never
            sent to the classifier and yields False.
        classifier: Callable invoked as
            ``classifier(sentence, labels, hypothesis_template=...)`` that
            returns a mapping with parallel ``'labels'`` and ``'scores'``
            sequences.
        labels: Candidate labels passed to the classifier.
        hypothesis_template: Template string passed through to the classifier.
        threshold: Score a label must strictly exceed to count as a match.
            Defaults to 0.5, the value that used to be hard-coded.

    Returns:
        bool: True if at least one returned label is in ``labels`` and its
        score is greater than ``threshold``; otherwise False.
    """
    if not sentence.strip():
        return False
    result = classifier(sentence, labels, hypothesis_template=hypothesis_template)
    # Walk the label/score pairs the classifier actually returned instead of
    # indexing by len(labels), so a length mismatch cannot raise IndexError.
    return any(
        label in labels and score > threshold
        for label, score in zip(result['labels'], result['scores'])
    )

# Function to extract sentences related to specific labels
def extract_reservation_sentences(text, classifier, labels, hypothesis_template):
    """Collect the sentences of ``text`` that ``classify_sentence`` accepts.

    Args:
        text: Text to split into sentences with ``sent_tokenize``.
        classifier: Classifier forwarded to ``classify_sentence``.
        labels: Candidate labels forwarded to ``classify_sentence``.
        hypothesis_template: Template forwarded to ``classify_sentence``.

    Returns:
        dict mapping ``"sentence <n>"`` to the sentence text, where ``n`` is
        the sentence's 1-based position in the tokenized text. Rejected
        sentences leave gaps in the numbering.
    """
    matches = {}
    for position, candidate in enumerate(sent_tokenize(text), start=1):
        if classify_sentence(candidate, classifier, labels, hypothesis_template):
            matches[f"sentence {position}"] = candidate
    return matches

# Function to save the filtered DataFrame to Google Drive
def save_to_drive(df, file_name):
    """Write ``df`` to a local Excel file and upload that file to Google Drive.

    Args:
        df: DataFrame to export (written without its index).
        file_name: Local path for the Excel file; also used as the Drive title.
    """
    df.to_excel(file_name, index=False)

    upload = drive.CreateFile({'title': file_name})
    upload.SetContentFile(file_name)
    upload.Upload()

    print("Filtered data with sentences saved to Google Drive as:", upload['title'])

# Main execution script
def main():
    """Tag reservation-related sentences in each review and upload the matching rows.

    Steps:
      1. Download the source workbook from Google Drive and load it.
      2. Build the zero-shot classifier.
      3. For every row, store in ``reservation_sentences`` a dict of the
         sentences from ``content`` that match the reservation label.
      4. Keep only rows with at least one matching sentence and upload them
         to Drive as a new workbook.
    """
    file_id = '1J-dfVyQ1jvuM-NMugoBPkWTiuZdSJL7D'
    file_name = 'National_Parks_with_topics_sentiment_test.xlsx'

    # Load the Excel file from Google Drive
    df = load_excel_from_drive(file_id, file_name)

    # Initialize the language model pipeline
    classifier = initialize_classifier()

    # Define labels and hypothesis template
    labels = ["park entry reservation"]
    hypothesis_template = "This text is about {}."

    def sentences_for(content):
        # A missing cell is NaN; str(NaN) would be the literal text "nan" and
        # would be tokenized and sent to the model, so skip it instead.
        if pd.isnull(content):
            return {}
        return extract_reservation_sentences(str(content), classifier, labels, hypothesis_template)

    # Create a new column with sentences related to ticket or entrance reservation
    df['reservation_sentences'] = df['content'].apply(sentences_for)

    # Filter the DataFrame to keep only rows with relevant sentences
    df_filtered = df[df['reservation_sentences'].apply(len) > 0]

    # Save the filtered DataFrame to a new Excel file and upload it to Google Drive
    output_path = 'filtered_reviews_reservation_with_sentences.xlsx'
    save_to_drive(df_filtered, output_path)

# Execute the main function
# The guard keeps main() from running if this code is imported rather than executed directly.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Filtered data with sentences saved to Google Drive as: v5_filtered_reviews_reservation_with_sentences.xlsx


In [1]:
import pandas as pd
from transformers import pipeline
from nltk.tokenize import sent_tokenize
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

# Authenticate and create the PyDrive2 client
# Sign the current Colab user in before any Drive access is attempted.
auth.authenticate_user()
gauth = GoogleAuth()
# Attach the application-default credentials to the PyDrive2 auth object.
gauth.credentials = GoogleCredentials.get_application_default()
# Module-level Drive client; load_excel_from_drive() and save_to_drive() read this global.
drive = GoogleDrive(gauth)

# Function to load an Excel file from Google Drive
def load_excel_from_drive(file_id, file_name):
    """Download a Google Drive file to disk and read it as a DataFrame.

    Args:
        file_id: Drive ID of the file to fetch.
        file_name: Local path the content is written to and then read from.

    Returns:
        The DataFrame produced by ``pd.read_excel(file_name)``.
    """
    remote = drive.CreateFile({'id': file_id})
    # Writes the remote content to the local path given by file_name.
    remote.GetContentFile(file_name)
    return pd.read_excel(file_name)

# Function to save the processed DataFrame to Google Drive
def save_to_drive(df, file_name):
    """Write ``df`` to a local Excel file and upload that file to Google Drive.

    Args:
        df: DataFrame to export (written without its index).
        file_name: Local path for the Excel file; also used as the Drive title.
    """
    df.to_excel(file_name, index=False)

    upload = drive.CreateFile({'title': file_name})
    upload.SetContentFile(file_name)
    upload.Upload()

    print("Processed data saved to Google Drive as:", upload['title'])

# Initialize the sentiment analysis pipeline
# Created once at module level; calculate_sentiment() reads this global on every call.
# calculate_sentiment() parses the first whitespace-separated token of each
# predicted label as an integer score, so the model's labels must start with one.
pipe = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

# Function to calculate sentiment score for a sentence
def calculate_sentiment(sentence):
    """Score one sentence with the module-level sentiment pipeline ``pipe``.

    Args:
        sentence: Text passed to ``pipe``.

    Returns:
        int: The first whitespace-separated token of the top prediction's
        label, converted to an integer.

    Raises:
        ValueError: If that leading token is not an integer.
    """
    # NOTE(review): no truncation is requested here; very long inputs may
    # exceed the model's input limit - confirm against real data.
    top_prediction = pipe(sentence)[0]
    leading_token = top_prediction['label'].split()[0]
    return int(leading_token)

# Score every sentence in one row's sentence dict (main() applies this to the "reservation_sentences" column)
def process_overcrowding_sentences(overcrowding_sentences):
    """Attach a sentiment score to every sentence in a sentence dict.

    Args:
        overcrowding_sentences: Mapping of sentence key to sentence text.

    Returns:
        dict with the same keys, each mapped to
        ``{'sentence': <text>, 'sentiment': <score>}`` where the score comes
        from ``calculate_sentiment``.
    """
    return {
        key: {'sentence': text, 'sentiment': calculate_sentiment(text)}
        for key, text in overcrowding_sentences.items()
    }

# Main execution script
def main():
    """Score the sentiment of every extracted reservation sentence and upload the result.

    Loads the workbook of reservation sentences from Google Drive, adds a
    ``sentiment_sentences`` column holding, per row, a dict of
    ``{'sentence': ..., 'sentiment': ...}`` entries (or None for empty
    cells), and uploads the result to Drive as a new workbook.
    """
    # Local import: only this function needs it to parse stored dict literals.
    import ast

    # NOTE: file_id is blank here; set it to the Drive ID of the workbook before running.
    file_id = ''
    file_name = 'filtered_reviews_reservation_with_sentences.xlsx'

    # Load the Excel file from Google Drive
    df = load_excel_from_drive(file_id, file_name)

    # Apply sentiment analysis to the "reservation_sentences" column.
    # Cells are expected to hold the string form of a dict. ast.literal_eval
    # rebuilds it but, unlike eval(), only accepts Python literals, so text
    # in the spreadsheet cannot execute arbitrary code.
    df['sentiment_sentences'] = df['reservation_sentences'].apply(
        lambda x: process_overcrowding_sentences(ast.literal_eval(x)) if pd.notnull(x) else None
    )

    # Save the processed DataFrame to a new Excel file and upload it to Google Drive
    output_path = 'reservation_sentiment_sentences.xlsx'
    save_to_drive(df, output_path)

# Execute the main function
# The guard keeps main() from running if this code is imported rather than executed directly.
if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Processed data saved to Google Drive as: reservation_sentiment_sentences.xlsx


In [None]:
import pandas as pd
import numpy as np

import ast

# Load the existing Excel file with sentiment data
file_name = 'overcrowding_sentiment_sentences.xlsx'
df = pd.read_excel(file_name)

# Calculate the overall mean sentiment score using the 'overall_sentiment' column
if 'overall_sentiment' in df.columns:
    overall_mean_sentiment = df['overall_sentiment'].mean()
    print(f"\nOverall Mean Sentiment Score for Entire Content: {overall_mean_sentiment}")
else:
    print("\n'overall_sentiment' column not found in the data.")

# Calculate the overall mean sentiment score for topic-specific sentences using the 'sentiment_sentences' data
if 'sentiment_sentences' in df.columns:
    topic_sentiment_scores = []

    for sentiment_data in df['sentiment_sentences'].dropna():
        # Each cell holds the string form of a dict. ast.literal_eval rebuilds
        # it but, unlike eval(), only accepts Python literals, so spreadsheet
        # text cannot execute arbitrary code.
        sentiment_dict = ast.literal_eval(sentiment_data)
        # Pool every sentence-level score across all rows.
        topic_sentiment_scores.extend(
            sentence_data['sentiment'] for sentence_data in sentiment_dict.values()
        )

    if topic_sentiment_scores:
        topic_mean_sentiment = np.mean(topic_sentiment_scores)
        print(f"Overall Mean Sentiment Score for Topic-Specific Sentences: {topic_mean_sentiment}")
    else:
        print("No sentiment scores found in 'sentiment_sentences' column.")
else:
    print("\n'sentiment_sentences' column not found in the data.")


Overall Mean Sentiment Score for Entire Content: 3.95
Overall Mean Sentiment Score for Topic-Specific Sentences: 3.032608695652174


In [2]:
import pandas as pd
import numpy as np

import ast

# Load the existing Excel file with sentiment data
file_name = 'reservation_sentiment_sentences.xlsx'
df = pd.read_excel(file_name)

# Calculate the overall mean sentiment score using the 'overall_sentiment' column
if 'overall_sentiment' in df.columns:
    overall_mean_sentiment = df['overall_sentiment'].mean()
    print(f"\nOverall Mean Sentiment Score for Entire Content: {overall_mean_sentiment}")
else:
    print("\n'overall_sentiment' column not found in the data.")

# Calculate the overall mean sentiment score for topic-specific sentences using the 'sentiment_sentences' data
if 'sentiment_sentences' in df.columns:
    topic_sentiment_scores = []

    for sentiment_data in df['sentiment_sentences'].dropna():
        # Each cell holds the string form of a dict. ast.literal_eval rebuilds
        # it but, unlike eval(), only accepts Python literals, so spreadsheet
        # text cannot execute arbitrary code.
        sentiment_dict = ast.literal_eval(sentiment_data)
        # Pool every sentence-level score across all rows.
        topic_sentiment_scores.extend(
            sentence_data['sentiment'] for sentence_data in sentiment_dict.values()
        )

    if topic_sentiment_scores:
        topic_mean_sentiment = np.mean(topic_sentiment_scores)
        print(f"Overall Mean Sentiment Score for Topic-Specific Sentences: {topic_mean_sentiment}")
    else:
        print("No sentiment scores found in 'sentiment_sentences' column.")
else:
    print("\n'sentiment_sentences' column not found in the data.")


Overall Mean Sentiment Score for Entire Content: 4.190476190476191
Overall Mean Sentiment Score for Topic-Specific Sentences: 3.617021276595745
