## Libraries Setup

In [6]:
!pip install -r requirements.txt -q

  pid, fd = os.forkpty()


In [7]:
# Datahandling
import requests
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import numpy as np
import pandas as pd
import seaborn as sns

# Tweet Themes
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
import matplotlib.pyplot as plt
import plotly.express as px

# Network analysis
from typing import List, Optional
import json
import matplotlib.patches as mpatches
import networkx as nx
from community import community_louvain
import plotly.graph_objects as go
import random
from collections import defaultdict
import nbformat

# Model prediction
from setfit import SetFitModel, SetFitTrainer
from sklearn.metrics import classification_report
import tiktoken

# Gradio deployment
import gradio as gr


## Data import

In [10]:
# data sampling? 20000
data = pd.read_csv('predictions_output.csv')
# data = data.sample(n=2000, random_state=42)

In [12]:
# Split the data based on the `label` column
data_label_0 = data[data['Label'] == 0]  # Subset where label=0
data_label_1 = data[data['Label'] == 1]  # Subset where label=1

# Optionally, check the sizes of the splits
print(f"Data with label=0: {len(data_label_0)} rows")
print(f"Data with label=1: {len(data_label_1)} rows")

Data with label=0: 140 rows
Data with label=1: 98 rows


In [13]:
# LLM Libs & Setup
from openai import OpenAI
import json
from pydantic import BaseModel, Field
from typing import List, Optional
import textwrap

## LLM Setup

In [14]:
# Set your Together API Key directly
TOGETHER_API_KEY = ""

# Check if the API key is set
if not TOGETHER_API_KEY:
    raise ValueError("The Together API Key must be provided.")

# Initialize the Together client
client = OpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=TOGETHER_API_KEY
)

# Define the model
model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"

## System_Prompt + JSON_schema Setup

In [16]:
SYSTEM_PROMPT = """
System Prompt:

You are an advanced NLP assistant tasked with analyzing tweets about a company. Each tweet is labeled as being from either a bot (0) or a human (1). Your goals are to identify themes, assess sentiment, extract keywords, and assign significance scores. Follow these steps:

Theme Identification:

- Group tweets into 5-15 overall themes or mentions about the company. Themes represent key topics, issues, or narratives surrounding the company.
- For each theme, provide subthemes or keywords that explain why the theme was chosen. These keywords should summarize the driving factors behind the theme.

Sentiment Analysis:

- Perform sentiment analysis for each tweet and aggregate these sentiments to determine the overall sentiment for each theme (positive, neutral, or negative).
- Provide an average sentiment score (-1 to +1 scale) for each theme.

Theme Significance:

- Assign a significance score (1-100) for each theme based on how many tweets contribute to it, and other relevant metrics (e.g., impact of subthemes, relevance to overall narrative).

Output Structure:

For each theme, provide:
- Theme title.
- Sentiment summary (positive/negative/neutral, average sentiment score).
- Subthemes/keywords explaining the causation of the theme.
- Significance score.


Consideration:

- Focus on human-labeled tweets (1) for primary analysis but note any notable patterns from bot-labeled tweets (0).
- Highlight anomalies or unexpected findings, if any.
"""


In [17]:
json_schema = {
    "type": "object",
    "properties": {
        "themes": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "theme": {
                        "type": "string",
                        "description": "Title of the overall theme"
                    },
                    "sentiment": {
                        "type": "object",
                        "properties": {
                            "overall": {
                                "type": "string",
                                "enum": ["positive", "neutral", "negative"],
                                "description": "Overall sentiment of the theme"
                            },
                            "average_score": {
                                "type": "number",
                                "minimum": -1,
                                "maximum": 1,
                                "description": "Average sentiment score of the theme"
                            }
                        },
                        "required": ["overall", "average_score"]
                    },
                    "subthemes": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        },
                        "description": "List of keywords or subthemes driving the main theme"
                    },
                    "significance": {
                        "type": "integer",
                        "minimum": 1,
                        "maximum": 100,
                        "description": "Significance score of the theme"
                    }
                },
                "required": ["theme", "sentiment", "subthemes", "significance"]
            }
        }
    },
    "required": ["themes"]
}


In [18]:
from pydantic import BaseModel, Field
from typing import List, Optional

class Sentiment(BaseModel):
    overall: str = Field(
        description="Overall sentiment of the theme",
        enum=["positive", "neutral", "negative"]
    )
    average_score: float = Field(
        description="Average sentiment score of the theme",
        ge=-1,
        le=1
    )

class Theme(BaseModel):
    theme: str = Field(description="Title of the overall theme")
    sentiment: Sentiment = Field(description="Sentiment of the theme")
    subthemes: List[str] = Field(
        description="List of keywords or subthemes driving the main theme"
    )
    significance: int = Field(
        description="Significance score of the theme",
        ge=1,
        le=100
    )

class TweetAnalysis(BaseModel):
    themes: List[Theme] = Field(
        description="List of identified overall themes",
        min_items=1
    )

# Dynamically generate the JSON schema
json_schema = TweetAnalysis.model_json_schema()


/Users/markushenriksson/Desktop/Data/Python/DDBMS-Submission/.conda/lib/python3.12/site-packages/pydantic/fields.py:1032: PydanticDeprecatedSince20: Using extra keyword arguments on `Field` is deprecated and will be removed. Use `json_schema_extra` instead. (Extra keys: 'enum'). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  warn(
/Users/markushenriksson/Desktop/Data/Python/DDBMS-Submission/.conda/lib/python3.12/site-packages/pydantic/fields.py:1001: PydanticDeprecatedSince20: `min_items` is deprecated and will be removed, use `min_length` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/


## LLM-runtime

In [19]:
import tiktoken
import pandas as pd
import json
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
import threading




def rate_limited_executor(max_calls_per_minute=100):
    """Rate-limited executor for API calls."""
    call_queue = Queue()




    def worker():
        while True:
            func, args, kwargs, result_queue = call_queue.get()
            try:
                result_queue.put(func(*args, **kwargs))
            except Exception as e:
                result_queue.put(e)
            call_queue.task_done()
            time.sleep(60 / max_calls_per_minute)




    for _ in range(max_calls_per_minute):
        threading.Thread(target=worker, daemon=True).start()




    def submit(func, *args, **kwargs):
        result_queue = Queue()
        call_queue.put((func, args, kwargs, result_queue))
        result = result_queue.get()
        if isinstance(result, Exception):
            raise result
        return result




    return submit




def process_batch(tweets, executor, model, schema, batch_size):
    """Process one batch of tweets."""
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': json.dumps(tweets)}
    ]
    try:
        response = executor(
            client.chat.completions.create,
            model=model,
            messages=messages,
            temperature=0.7,
            response_format={"type": "json_object", "schema": schema},
            max_tokens=8192
        )
        return {
            'batch_size': len(tweets),
            'response': response.choices[0].message.content
        }
    except Exception as e:
        return {
            'batch_size': len(tweets),
            'error': str(e)
        }




def parallel_process_tweets(data, batch_size, max_workers, model, schema):
    """Parallel processing of tweets with rate-limiting."""
    executor = rate_limited_executor(max_calls_per_minute=60)
    results = []




    def process_and_collect(batch):
        tweets = batch['Tweet_text'].tolist()
        return process_batch(tweets, executor, model, schema, batch_size)




    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        batches = [data.iloc[i:i + batch_size] for i in range(0, len(data), batch_size)]
        futures = [pool.submit(process_and_collect, batch) for batch in batches]
        for future in futures:
            results.append(future.result())




    return pd.DataFrame(results)




def save_results(data, filename):
    """Save data to JSON file."""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Results saved in {filename}")
    except Exception as e:
        print(f"Error during saving: {e}")




# Assuming 'data_label_0' is your DataFrame filtered for label 0
processed_tweets = parallel_process_tweets(
    data=data_label_0,  # Use the data_label_0 DataFrame
    batch_size=10,      # Batch size
    max_workers=6,     # Number of parallel threads
    model=model,        # Your model variable
    schema=json_schema  # Your JSON schema variable
)




# Save the results to label_0_1.json
save_results(processed_tweets.to_dict(orient="records"), "novo_0.json")

Results saved in novo_0.json


In [20]:
import tiktoken
import pandas as pd
import json
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
import threading

def rate_limited_executor(max_calls_per_minute=100):
    """Rate-limited executor for API calls."""
    call_queue = Queue()

    def worker():
        while True:
            func, args, kwargs, result_queue = call_queue.get()
            try:
                result_queue.put(func(*args, **kwargs))
            except Exception as e:
                result_queue.put(e)
            call_queue.task_done()
            time.sleep(60 / max_calls_per_minute)

    for _ in range(max_calls_per_minute):
        threading.Thread(target=worker, daemon=True).start()

    def submit(func, *args, **kwargs):
        result_queue = Queue()
        call_queue.put((func, args, kwargs, result_queue))
        result = result_queue.get()
        if isinstance(result, Exception):
            raise result
        return result

    return submit

def process_batch(tweets, executor, model, schema, batch_size):
    """Process one batch of tweets."""
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': json.dumps(tweets)}
    ]
    try:
        response = executor(
            client.chat.completions.create,
            model=model,
            messages=messages,
            temperature=0.7,
            response_format={"type": "json_object", "schema": schema},
            max_tokens=8192
        )
        return {
            'batch_size': len(tweets),
            'response': response.choices[0].message.content
        }
    except Exception as e:
        return {
            'batch_size': len(tweets),
            'error': str(e)
        }

def parallel_process_tweets(data, batch_size, max_workers, model, schema):
    """Parallel processing of tweets with rate-limiting."""
    executor = rate_limited_executor(max_calls_per_minute=60)
    results = []

    def process_and_collect(batch):
        tweets = batch['Tweet_text'].tolist()
        return process_batch(tweets, executor, model, schema, batch_size)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        batches = [data.iloc[i:i + batch_size] for i in range(0, len(data), batch_size)]
        futures = [pool.submit(process_and_collect, batch) for batch in batches]
        for future in futures:
            results.append(future.result())

    return pd.DataFrame(results)

def save_results(data, filename):
    """Save data to JSON file."""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Results saved in {filename}")
    except Exception as e:
        print(f"Error during saving: {e}")

# Assuming 'data_label_1' is your DataFrame filtered for label 1
processed_tweets = parallel_process_tweets(
    data=data_label_1,  # Use the data_label_1 DataFrame
    batch_size=10,      # Batch size
    max_workers=6,      # Number of parallel threads
    model=model,        # Your model variable
    schema=json_schema  # Your JSON schema variable
)

# Save the results to label_1.json
save_results(processed_tweets.to_dict(orient="records"), "novo_1.json")


Results saved in novo_1.json


In [22]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np

def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []

def create_theme_network(data_bot, data_nonbot, min_significance=70):
    """Create a network graph from themes with a minimum significance level."""
    G = nx.Graph()

    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    response_data = json.loads(response)
                    themes = response_data.get('themes', [])

                    for theme in themes:
                        significance = theme.get('significance', 0)
                        if significance >= min_significance:
                            theme_name = theme['theme']
                            sentiment = theme.get('sentiment', {})
                            average_score = sentiment.get('average_score', 0)

                            # Classify sentiment based on average score
                            sentiment_class = "positive" if average_score > 0 else "negative"
                            
                            subthemes = theme.get('subthemes', [])

                            # Add or update theme node
                            if not G.has_node(theme_name):
                                G.add_node(theme_name,
                                           type='theme',
                                           is_bot=is_bot,
                                           significance=significance,
                                           sentiment_class=sentiment_class,
                                           average_score=average_score,
                                           subthemes=subthemes)
                            else:
                                # If node exists and bot status is different, mark as mixed
                                current_bot_status = G.nodes[theme_name].get('is_bot')
                                if current_bot_status != is_bot:
                                    G.nodes[theme_name]['is_bot'] = 'mixed'

                except json.JSONDecodeError:
                    print(f"Invalid JSON in response: {response}")

    # Process bot and non-bot data
    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)

    return G

def visualize_network(G):
    """Visualize bot, non-bot, and mixed themes in clustered regions."""
    # Separate nodes by bot status
    bot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == True]
    nonbot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == False]
    mixed_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == 'mixed']

    # Create traces for each category
    traces = []

    # Function to determine node size based on significance
    def get_node_size(significance):
        return 10 + (significance / 5)  # Adjust the scaling factor as needed

    # Color mapping for sentiment and bot type (More distinct colors)
    def get_node_color(is_bot, sentiment_class):
        if is_bot and sentiment_class == 'positive':
            return 'rgb(34,139,34)'  # Positive Bot - Forest Green
        elif is_bot and sentiment_class == 'negative':
            return 'rgb(255,69,0)'  # Negative Bot - Red Orange
        elif not is_bot and sentiment_class == 'positive':
            return 'rgb(30,144,255)'  # Positive Non-Bot - Dodger Blue
        elif not is_bot and sentiment_class == 'negative':
            return 'rgb(255,99,71)'  # Negative Non-Bot - Tomato

    # Bot nodes - clustered in bottom left
    if bot_nodes:
        bot_x = np.random.normal(loc=-10, scale=3, size=len(bot_nodes))  # Increase scale for more spread
        bot_y = np.random.normal(loc=-10, scale=3, size=len(bot_nodes))  # Increase scale for more spread
        bot_sizes = [get_node_size(G.nodes[node]['significance']) for node in bot_nodes]
        bot_colors = [get_node_color(G.nodes[node]['is_bot'], G.nodes[node]['sentiment_class']) for node in bot_nodes]
        bot_trace = go.Scatter(
            x=bot_x,
            y=bot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Bot<br>"
                f"Significance: {G.nodes[node]['significance']}%<br>"
                f"Subthemes: {', '.join(G.nodes[node].get('subthemes', []))}<br>"
                f"Sentiment: {G.nodes[node]['sentiment_class']}<br>"
                f"Average Score: {G.nodes[node]['average_score']}"
                for node in bot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=bot_sizes,
                color=bot_colors,  # Color based on sentiment
                line=dict(width=1, color='rgb(139,0,0)')  # Dark red
            ),
            name='Bot Themes'
        )
        traces.append(bot_trace)

    # Non-bot nodes - clustered in top right
    if nonbot_nodes:
        nonbot_x = np.random.normal(loc=10, scale=3, size=len(nonbot_nodes))  # Increase scale for more spread
        nonbot_y = np.random.normal(loc=10, scale=3, size=len(nonbot_nodes))  # Increase scale for more spread
        nonbot_sizes = [get_node_size(G.nodes[node]['significance']) for node in nonbot_nodes]
        nonbot_colors = [get_node_color(G.nodes[node]['is_bot'], G.nodes[node]['sentiment_class']) for node in nonbot_nodes]
        nonbot_trace = go.Scatter(
            x=nonbot_x,
            y=nonbot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Non-Bot<br>"
                f"Significance: {G.nodes[node]['significance']}%<br>"
                f"Subthemes: {', '.join(G.nodes[node].get('subthemes', []))}<br>"
                f"Sentiment: {G.nodes[node]['sentiment_class']}<br>"
                f"Average Score: {G.nodes[node]['average_score']}"
                for node in nonbot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=nonbot_sizes,
                color=nonbot_colors,  # Color based on sentiment
                line=dict(width=1, color='rgb(0,100,0)')  # Dark green
            ),
            name='Non-Bot Themes'
        )
        traces.append(nonbot_trace)

    # Mixed nodes - clustered in center
    if mixed_nodes:
        mixed_x = np.random.normal(loc=0, scale=3, size=len(mixed_nodes))  # Increase scale for more spread
        mixed_y = np.random.normal(loc=0, scale=3, size=len(mixed_nodes))  # Increase scale for more spread
        mixed_sizes = [get_node_size(G.nodes[node]['significance']) for node in mixed_nodes]
        mixed_colors = [get_node_color(G.nodes[node]['is_bot'], G.nodes[node]['sentiment_class']) for node in mixed_nodes]
        mixed_trace = go.Scatter(
            x=mixed_x,
            y=mixed_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Mixed<br>"
                f"Significance: {G.nodes[node]['significance']}%<br>"
                f"Subthemes: {', '.join(G.nodes[node].get('subthemes', []))}<br>"
                f"Sentiment: {G.nodes[node]['sentiment_class']}<br>"
                f"Average Score: {G.nodes[node]['average_score']}"
                for node in mixed_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=mixed_sizes,
                color=mixed_colors,  # Color based on sentiment
                line=dict(width=1, color='rgb(75,0,130)')  # Indigo
            ),
            name='Mixed Themes'
        )
        traces.append(mixed_trace)

    # Create figure
    fig = go.Figure(data=traces)

    # Update layout for better visualization
    fig.update_layout(
        title="Theme Network Visualization",
        showlegend=True,
        hovermode='closest',
        width=1000,
        height=800,
        xaxis=dict(
            title="X-axis",
            showgrid=False,
            zeroline=False,
            showticklabels=False
        ),
        yaxis=dict(
            title="Y-axis",
            showgrid=False,
            zeroline=False,
            showticklabels=False
        ),
        plot_bgcolor='rgb(240,240,240)',  # Light grey background
        legend=dict(
            x=0.85,
            y=1,
            bgcolor='rgba(255,255,255,0.5)',
            bordercolor='black',
            borderwidth=1
        )
    )

    fig.show()


if __name__ == "__main__":
    # Load bot and non-bot data
    bot_data = load_json_file('novo_0.json')  # Bot data
    nonbot_data = load_json_file('novo_1.json')  # Non-bot data

    # Create and visualize the graph
    graph = create_theme_network(bot_data, nonbot_data, min_significance=70)
    visualize_network(graph)
