In [5]:
import boto3
import logging
import pandas as pd
from botocore.exceptions import ClientError
from io import StringIO
import matplotlib.pyplot as plt
import json

# Configure the root logger to emit INFO and above, then keep a module-level
# handle on that same root logger for the classes below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

class S3Handler:
    """Read and write CSV objects in a single S3 bucket."""

    def __init__(self, bucket_name):
        """Create an S3 client and remember the bucket every call targets."""
        self.s3 = boto3.client('s3')
        self.bucket_name = bucket_name

    def load_csv_from_s3(self, key):
        """Fetch the object stored under ``key`` and parse its body as CSV.

        Returns the parsed ``pandas.DataFrame``.
        """
        response = self.s3.get_object(Key=key, Bucket=self.bucket_name)
        return pd.read_csv(response['Body'])

    def save_csv_to_s3(self, data, key):
        """Serialize ``data`` as CSV (without the index) and store it under ``key``."""
        # With no target buffer, to_csv hands back the CSV text directly.
        csv_text = data.to_csv(index=False)
        self.s3.put_object(Body=csv_text, Key=key, Bucket=self.bucket_name)

class SentimentAnalyzer:
    """Handles sentiment analysis using Amazon Comprehend."""

    def __init__(self, region_name='us-west-2', language_code='es'):
        """Create the Comprehend client.

        Args:
            region_name: AWS region the Comprehend client is created in.
            language_code: Language code sent with every sentiment request.
                Defaults to ``'es'``, the value that used to be hard-coded.
        """
        self.comprehend = boto3.client('comprehend', region_name=region_name)
        self.language_code = language_code

    def analyze_sentiment(self, text):
        """Return the Comprehend sentiment label for ``text``.

        Args:
            text: The text to classify. Non-string values are converted
                with ``str()`` before being sent.

        Returns:
            The value of the ``'Sentiment'`` field of the Comprehend
            response, or ``None`` when ``text`` is missing (None/NaN) or an
            empty string. In that case no request is made.
        """
        # An empty CSV cell is loaded by pandas as NaN; sending that (or an
        # empty string) to the service would abort the whole run, so such
        # rows are left unlabelled instead.
        if pd.isna(text):
            return None
        text = str(text)
        if not text:
            return None
        response = self.comprehend.detect_sentiment(
            Text=text, LanguageCode=self.language_code
        )
        return response['Sentiment']

    def add_sentiment_to_data(self, data):
        """Add a ``'Sentiment'`` column computed from the ``'Tweet Text'`` column.

        The column is added to ``data`` in place, and the same frame is
        returned for convenience. Rows whose text is missing or empty get a
        missing sentiment.
        """
        data['Sentiment'] = [self.analyze_sentiment(text) for text in data['Tweet Text']]
        return data

    def get_most_common_sentiment(self, data):
        """Return the most frequent value in ``data['Sentiment']``.

        Missing values are ignored. When several values tie, the first one
        returned by ``Series.mode()`` is used. Returns ``None`` when the
        column holds no non-missing values, instead of raising.
        """
        modes = data['Sentiment'].mode()
        if modes.empty:
            return None
        return modes.iloc[0]

    def plot_sentiment_distribution(self, data, output_path):
        """Save a bar chart of the sentiment counts to ``output_path``.

        Args:
            data: Frame with a ``'Sentiment'`` column.
            output_path: Destination passed to ``Figure.savefig``.
        """
        sentiment_counts = data['Sentiment'].value_counts()
        # Work on an explicit figure/axes pair rather than pyplot's implicit
        # "current figure", and always close it so a failure while plotting
        # or saving does not leave the figure open.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            sentiment_counts.plot(kind='bar', ax=ax)
            ax.set_title('Sentiment Distribution')
            ax.set_xlabel('Sentiment')
            ax.set_ylabel('Count')
            fig.savefig(output_path)
        finally:
            plt.close(fig)

class BedrockAnalyzer:
    """Asks an Amazon Bedrock model for an overall impression of tweet text."""

    def __init__(self, model_id):
        """Create the bedrock-runtime client and store the model to call."""
        self.brt = boto3.client("bedrock-runtime")
        self.model_id = model_id

    def analyze_tweets(self, combined_text):
        """Send ``combined_text`` to the model in a single user turn.

        Returns the ``text`` of the first content item in the model's reply.
        """
        prompt = f"Provide a general impression of what people are saying in these tweets: '{combined_text}'"
        user_turn = {"role": "user", "content": [{"text": prompt}]}
        inference_settings = {"maxTokens": 512, "temperature": 0.5, "topP": 0.9}

        reply = self.brt.converse(
            modelId=self.model_id,
            messages=[user_turn],
            inferenceConfig=inference_settings,
        )

        first_part = reply["output"]["message"]["content"][0]
        return first_part["text"]

class SnsWrapper:
    """Small wrapper around an Amazon SNS resource for publishing messages."""

    def __init__(self, sns_resource):
        """Keep the SNS resource object supplied by the caller."""
        self.sns_resource = sns_resource

    def publish_message(self, topic, subject, message):
        """Publish ``message`` with ``subject`` to ``topic``.

        Returns the ``MessageId`` from the publish response. A ``ClientError``
        is logged with its traceback and then re-raised to the caller.
        """
        try:
            result = topic.publish(Subject=subject, Message=message)
        except ClientError:
            logger.exception("Couldn't publish message to topic %s.", topic.arn)
            raise
        else:
            published_id = result['MessageId']
            logger.info("Published message with ID: %s", published_id)
            return published_id

def main():
    """Run the tweet sentiment pipeline end to end.

    For each input CSV in the bucket: load it, label every tweet with a
    sentiment, and write the labelled copy back to S3. Then, over all files
    combined: print the most common sentiment, upload a bar chart of the
    sentiment distribution, ask Bedrock for a general impression of the
    tweets, and publish both results to the SNS topic.
    """
    # Function-scope imports keep this block self-contained.
    import tempfile
    from pathlib import Path, PurePosixPath

    # Initialize services and clients
    s3_bucket = 'un-datathon-2024-sisifos'
    sns_topic_arn = "arn:aws:sns:us-west-2:080532742200:TweetSentimentAlert"
    bedrock_model_id = "anthropic.claude-3-5-sonnet-20241022-v2:0"

    # Where results are written inside the bucket.
    csv_output_prefix = 'Sentiment_Analysis/output/'
    plot_key = 'output/sentiment_distribution.png'

    s3_handler = S3Handler(s3_bucket)
    sentiment_analyzer = SentimentAnalyzer()
    bedrock_analyzer = BedrockAnalyzer(bedrock_model_id)
    sns_wrapper = SnsWrapper(boto3.resource("sns"))

    # List of CSV files to analyze
    csv_files = [
        'Sentiment_Analysis/tweets_turism1.csv',
        'Sentiment_Analysis/tweets_turism2.csv',
        'Sentiment_Analysis/tweets_turism3.csv',
    ]

    # Collect the per-file frames and concatenate once after the loop.
    frames = []

    for csv_file in csv_files:
        # Step 1: Load data from each CSV file in S3
        data = s3_handler.load_csv_from_s3(csv_file)

        # Step 2: Perform sentiment analysis on each tweet and add it to the data
        data = sentiment_analyzer.add_sentiment_to_data(data)
        frames.append(data)

        # Save updated data back to S3 for each file. Only the file stem is
        # reused: the input key already contains its directory, and
        # formatting the whole key into the prefix produced
        # 'Sentiment_Analysis/output/Sentiment_Analysis/..._with_sentiment.csv'.
        stem = PurePosixPath(csv_file).stem
        output_key = f'{csv_output_prefix}{stem}_with_sentiment.csv'
        s3_handler.save_csv_to_s3(data, output_key)

    all_data = pd.concat(frames, ignore_index=True)
    # Missing cells are skipped and remaining values coerced to str so the
    # join cannot fail on a non-string entry.
    all_tweets_text = " ".join(all_data["Tweet Text"].dropna().astype(str))

    # Step 3: Determine the most common sentiment across all files
    most_common_sentiment = sentiment_analyzer.get_most_common_sentiment(all_data)
    print("Most Common Sentiment:", most_common_sentiment)

    # Step 4: Generate and save overall sentiment distribution plot. The image
    # is rendered into a temporary directory that is removed after the upload,
    # rather than a fixed '/tmp' path that is left behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        sentiment_distribution_path = str(Path(tmp_dir) / 'sentiment_distribution.png')
        sentiment_analyzer.plot_sentiment_distribution(all_data, sentiment_distribution_path)
        s3_handler.s3.upload_file(sentiment_distribution_path, s3_bucket, plot_key)

    # Step 5: Use Bedrock for a general sentiment impression based on all tweets
    general_impression = bedrock_analyzer.analyze_tweets(all_tweets_text)
    print("General Impression:", general_impression)

    # Step 6: Send SNS alert with the most common sentiment and general impression.
    # The message names the actual plot key and CSV prefix, which differ.
    topic = sns_wrapper.sns_resource.Topic(sns_topic_arn)
    subject = "Tweet Sentiment Alert"
    message = (
        f"Most Common Sentiment: {most_common_sentiment}\n\n"
        f"General Impression: {general_impression}\n\n"
        f"See the sentiment distribution plot ('{plot_key}') and the CSV files with detailed analysis "
        f"(under '{csv_output_prefix}') in the S3 bucket '{s3_bucket}'."
    )
    message_id = sns_wrapper.publish_message(topic, subject, message)
    print(f"Message sent with ID: {message_id}")

if __name__ == "__main__":
    main()


Most Common Sentiment: NEGATIVE


INFO:root:Published message with ID: 4b86fd70-f703-5b5e-9639-a2dce2b3856c


General Impression: These tweets largely discuss concerns about sex tourism and exploitation in Medellín, Colombia. The main themes include:

1. Widespread criticism of sex tourism and child exploitation in Medellín, with many expressing concern about the city becoming known internationally for this problem

2. Discussion about how certain music and culture (particularly reggaeton) may normalize or promote problematic tourism and exploitation of minors

3. Calls for authorities to take stronger action against sex tourism and child exploitation

4. Some more positive discussions about legitimate tourism initiatives, including:
- Sports tourism
- Medical tourism
- Digital nomad opportunities 
- Sustainable tourism development

5. Debate about the city's image and identity, with some wanting to move away from associations with drugs and sex tourism toward more positive forms of tourism

The overall tone is one of concern about exploitative tourism practices while also recognizing the impo