In [1]:
!pip install -r requirements.txt -q

In [2]:
# Datahandling
import requests
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import numpy as np
import pandas as pd
import seaborn as sns

# Tweet Themes
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
import matplotlib.pyplot as plt
from bertopic import BERTopic
import plotly.express as px

# Network analysis
from typing import List, Optional
import json
import matplotlib.patches as mpatches
import networkx as nx
from community import community_louvain
import plotly.graph_objects as go
import random
from collections import defaultdict
import nbformat

# Model prediction
from setfit import SetFitModel, SetFitTrainer
from sklearn.metrics import classification_report
import tiktoken

# Gradio deployment
import gradio as gr


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# data sampling? 20000
data = pd.read_csv('TwitterData_Joined.csv')
data = data.sample(n=2000, random_state=42)

In [4]:
# Split the data based on the `label` column
data_label_0 = data[data['Label'] == 0]  # Subset where label=0
data_label_1 = data[data['Label'] == 1]  # Subset where label=1

# Optionally, check the sizes of the splits
print(f"Data with label=0: {len(data_label_0)} rows")
print(f"Data with label=1: {len(data_label_1)} rows")

Data with label=0: 950 rows
Data with label=1: 1050 rows


In [5]:
# LLM Libs & Setup
from openai import OpenAI
import json
from pydantic import BaseModel, Field
from typing import List, Optional
import textwrap

In [6]:
# Set your Together API Key directly
TOGETHER_API_KEY = ""

# Check if the API key is set
if not TOGETHER_API_KEY:
    raise ValueError("The Together API Key must be provided.")

# Initialize the Together client
client = OpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=TOGETHER_API_KEY
)

# Define the model
model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"

In [7]:
SYSTEM_PROMPT = f"""
You are an advanced theme extraction and analysis assistant specializing in comprehensive thematic breakdown of a collection of tweets.


Extraction Objectives:
1. Identify and Extract Overall Themes
2. Determine Theme Centrality and Significance
3. Uncover Sub-Themes and Causal Relationships
4. Provide Nuanced Thematic Insights


Output Structure:
{{
    "overall_themes": [
        {{
            "name": "Primary Theme Name",
            "confidence": "Confidence level 0-100",
            "centrality": {{
                "weight": "Theme heaviness/importance (0-100)",
                "calculation_factors": [
                    "Tweet volume",
                    "Sentiment intensity",
                    "Keyword repetition",
                    "Cross-tweet relevance"
                ]
            }},
            "sub_themes": [
                {{
                    "name": "Specific Sub-Theme",
                    "causations": [
                        "Underlying factors driving this sub-theme",
                        "Contextual triggers"
                    ],
                    "keywords": ["Supporting keywords"],
                    "sentiment": "positive/negative/neutral",
                    "confidence": "Sub-theme confidence level 0-100"
                }}
            ],
            "keywords": ["Primary theme defining keywords"],
            "sentiment_distribution": {{
                "positive_percentage": 0,
                "negative_percentage": 0,
                "neutral_percentage": 0
            }}
        }}
    ],
    "metadata": {{
        "total_tweets_analyzed": 0,
        "unique_themes_count": 0,
        "dominant_sentiment": "",
        "language_distribution": {{}}
    }}
}}


Analytical Guidelines:


1. Theme Identification:
- Limit overall themes to a maximum of 10
- Ensure themes are distinct and non-overlapping
- Capture the essence of the tweet collection


2. Centrality and Significance:
- Calculate theme weight using:
  * Number of tweets referencing the theme
  * Sentiment intensity
  * Keyword frequency
  * Cross-tweet relevance
  * Emotional depth


3. Sub-Theme and Causation Analysis:
- Identify specific sub-themes within each overall theme
- Uncover potential causative factors
- Provide context for theme emergence


4. Sentiment and Nuance:
- Analyze emotional tone at theme and sub-theme levels
- Provide percentage-based sentiment distribution
- Highlight emotional complexity


5. Metadata Insights:
- Track total tweets analyzed
- Count unique themes
- Capture language diversity
- Identify dominant sentiment


Principles:
- Objectivity: Base analysis solely on tweet content
- Precision: Provide granular, specific insights
- Consistency: Apply uniform analytical framework
- Depth: Go beyond surface-level theme extraction


Handling Complexity:
- For ambiguous themes, use lower confidence scores
- Highlight keywords that support theme identification
- Provide nuanced explanations for theme relationships


Ethical Considerations:
- Maintain data integrity
- Avoid external assumptions
- Respect contextual limitations of the dataset


Output Requirements:
- Strictly adhere to the specified JSON schema
- Ensure clear, structured, and insightful presentation
"""

In [8]:
json_schema = f"""{{
    "type": "object",
    "properties": {{
        "overall_themes": {{
            "type": "array",
            "maxItems": 10,
            "items": {{
                "type": "object",
                "properties": {{
                    "name": {{
                        "type": "string",
                        "description": "Primary theme name"
                    }},
                    "confidence": {{
                        "type": "number",
                        "minimum": 0,
                        "maximum": 100,
                        "description": "Confidence level of theme identification"
                    }},
                    "centrality": {{
                        "type": "object",
                        "properties": {{
                            "weight": {{
                                "type": "number",
                                "minimum": 0,
                                "maximum": 100,
                                "description": "Theme heaviness/importance"
                            }},
                            "calculation_factors": {{
                                "type": "array",
                                "items": {{
                                    "type": "string"
                                }},
                                "description": "Factors used to calculate theme centrality"
                            }}
                        }},
                        "required": ["weight"]
                    }},
                    "sub_themes": {{
                        "type": "array",
                        "items": {{
                            "type": "object",
                            "properties": {{
                                "name": {{
                                    "type": "string",
                                    "description": "Specific sub-theme name"
                                }},
                                "causations": {{
                                    "type": "array",
                                    "items": {{
                                        "type": "string"
                                    }},
                                    "description": "Underlying factors driving the sub-theme"
                                }},
                                "keywords": {{
                                    "type": "array",
                                    "items": {{
                                        "type": "string"
                                    }},
                                    "description": "Supporting keywords for the sub-theme"
                                }},
                                "sentiment": {{
                                    "type": "string",
                                    "enum": ["positive", "negative", "neutral"],
                                    "description": "Emotional tone of the sub-theme"
                                }},
                                "confidence": {{
                                    "type": "number",
                                    "minimum": 0,
                                    "maximum": 100,
                                    "description": "Confidence level of sub-theme identification"
                                }}
                            }},
                            "required": ["name"]
                        }},
                        "description": "Sub-themes within the primary theme"
                    }},
                    "keywords": {{
                        "type": "array",
                        "items": {{
                            "type": "string"
                        }},
                        "description": "Key terms supporting the primary theme"
                    }},
                    "sentiment_distribution": {{
                        "type": "object",
                        "properties": {{
                            "positive_percentage": {{
                                "type": "number",
                                "minimum": 0,
                                "maximum": 100
                            }},
                            "negative_percentage": {{
                                "type": "number",
                                "minimum": 0,
                                "maximum": 100
                            }},
                            "neutral_percentage": {{
                                "type": "number",
                                "minimum": 0,
                                "maximum": 100
                            }}
                        }}
                    }}
                }},
                "required": ["name", "confidence"]
            }}
        }},
        "metadata": {{
            "type": "object",
            "properties": {{
                "total_tweets_analyzed": {{
                    "type": "integer",
                    "minimum": 0
                }},
                "unique_themes_count": {{
                    "type": "integer",
                    "minimum": 0
                }},
                "dominant_sentiment": {{
                    "type": "string",
                    "enum": ["positive", "negative", "neutral"]
                }},
                "language_distribution": {{
                    "type": "object",
                    "additionalProperties": {{
                        "type": "integer"
                    }}
                }}
            }}
        }}
    }},
    "required": ["overall_themes"]
}}"""

In [9]:
from pydantic import BaseModel, Field
from typing import List, Optional, Dict


class Centrality(BaseModel):
    weight: float = Field(
        description="Theme heaviness/importance",
        ge=0,
        le=100
    )
    calculation_factors: Optional[List[str]] = Field(
        default=None,
        description="Factors used to calculate theme centrality"
    )


class SubTheme(BaseModel):
    name: str = Field(description="Specific sub-theme name")
    causations: Optional[List[str]] = Field(
        default=None,
        description="Underlying factors driving the sub-theme"
    )
    keywords: Optional[List[str]] = Field(
        default=None,
        description="Supporting keywords for the sub-theme"
    )
    sentiment: Optional[str] = Field(
        default=None,
        description="Emotional tone of the sub-theme",
        pattern="^(positive|negative|neutral)$"
    )
    confidence: Optional[float] = Field(
        default=None,
        description="Confidence level of sub-theme identification",
        ge=0,
        le=100
    )


class SentimentDistribution(BaseModel):
    positive_percentage: float = Field(ge=0, le=100)
    negative_percentage: float = Field(ge=0, le=100)
    neutral_percentage: float = Field(ge=0, le=100)


class OverallTheme(BaseModel):
    name: str = Field(description="Primary theme name")
    confidence: float = Field(
        description="Confidence level of theme identification",
        ge=0,
        le=100
    )
    centrality: Optional[Centrality] = Field(
        default=None,
        description="Theme centrality and importance"
    )
    sub_themes: Optional[List[SubTheme]] = Field(
        default=None,
        description="Sub-themes within the primary theme"
    )
    keywords: Optional[List[str]] = Field(
        default=None,
        description="Key terms supporting the primary theme"
    )
    sentiment_distribution: Optional[SentimentDistribution] = Field(
        default=None,
        description="Sentiment distribution for the theme"
    )


class Metadata(BaseModel):
    total_tweets_analyzed: Optional[int] = Field(
        default=None,
        description="Total number of tweets analyzed",
        ge=0
    )
    unique_themes_count: Optional[int] = Field(
        default=None,
        description="Number of unique themes identified",
        ge=0
    )
    dominant_sentiment: Optional[str] = Field(
        default=None,
        description="Overall dominant sentiment",
        pattern="^(positive|negative|neutral)$"
    )
    language_distribution: Optional[Dict[str, int]] = Field(
        default=None,
        description="Distribution of languages in the tweets"
    )


class TweetAnalysis(BaseModel):
    overall_themes: List[OverallTheme] = Field(
        description="List of overall themes",
        max_items=10
    )
    metadata: Optional[Metadata] = Field(
        default=None,
        description="Metadata about the tweet collection"
    )


# Dynamically generate the schema
json_schema = TweetAnalysis.model_json_schema()

/Users/markushenriksson/Desktop/Data/Python/DDBMS-Submission/.conda/lib/python3.11/site-packages/pydantic/fields.py:1007: PydanticDeprecatedSince20: `max_items` is deprecated and will be removed, use `max_length` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/


In [10]:
import tiktoken
import pandas as pd
import json
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
import threading




def rate_limited_executor(max_calls_per_minute=100):
    """Rate-limited executor for API calls."""
    call_queue = Queue()




    def worker():
        while True:
            func, args, kwargs, result_queue = call_queue.get()
            try:
                result_queue.put(func(*args, **kwargs))
            except Exception as e:
                result_queue.put(e)
            call_queue.task_done()
            time.sleep(60 / max_calls_per_minute)




    for _ in range(max_calls_per_minute):
        threading.Thread(target=worker, daemon=True).start()




    def submit(func, *args, **kwargs):
        result_queue = Queue()
        call_queue.put((func, args, kwargs, result_queue))
        result = result_queue.get()
        if isinstance(result, Exception):
            raise result
        return result




    return submit




def process_batch(tweets, executor, model, schema, batch_size):
    """Process one batch of tweets."""
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': json.dumps(tweets)}
    ]
    try:
        response = executor(
            client.chat.completions.create,
            model=model,
            messages=messages,
            temperature=0.7,
            response_format={"type": "json_object", "schema": schema},
            max_tokens=8192
        )
        return {
            'batch_size': len(tweets),
            'response': response.choices[0].message.content
        }
    except Exception as e:
        return {
            'batch_size': len(tweets),
            'error': str(e)
        }




def parallel_process_tweets(data, batch_size, max_workers, model, schema):
    """Parallel processing of tweets with rate-limiting."""
    executor = rate_limited_executor(max_calls_per_minute=60)
    results = []




    def process_and_collect(batch):
        tweets = batch['Tweet_text'].tolist()
        return process_batch(tweets, executor, model, schema, batch_size)




    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        batches = [data.iloc[i:i + batch_size] for i in range(0, len(data), batch_size)]
        futures = [pool.submit(process_and_collect, batch) for batch in batches]
        for future in futures:
            results.append(future.result())




    return pd.DataFrame(results)




def save_results(data, filename):
    """Save data to JSON file."""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Results saved in {filename}")
    except Exception as e:
        print(f"Error during saving: {e}")




# Assuming 'data_label_0' is your DataFrame filtered for label 0
processed_tweets = parallel_process_tweets(
    data=data_label_0,  # Use the data_label_0 DataFrame
    batch_size=10,      # Batch size
    max_workers=64,     # Number of parallel threads
    model=model,        # Your model variable
    schema=json_schema  # Your JSON schema variable
)




# Save the results to label_0_1.json
save_results(processed_tweets.to_dict(orient="records"), "label_0_1.json")

Results saved in label_0_1.json


In [11]:
import tiktoken
import pandas as pd
import json
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
import threading
import openai  # Make sure to import openai




def rate_limited_executor(max_calls_per_minute=100):
    """Rate-limited executor for API calls."""
    call_queue = Queue()


    def worker():
        while True:
            func, args, kwargs, result_queue = call_queue.get()
            try:
                result_queue.put(func(*args, **kwargs))
            except Exception as e:
                result_queue.put(e)
            call_queue.task_done()
            time.sleep(60 / max_calls_per_minute)


    for _ in range(max_calls_per_minute):
        threading.Thread(target=worker, daemon=True).start()


    def submit(func, *args, **kwargs):
        result_queue = Queue()
        call_queue.put((func, args, kwargs, result_queue))
        result = result_queue.get()
        if isinstance(result, Exception):
            raise result
        return result


    return submit




def process_batch(tweets, executor, model, schema, batch_size):
    """Process one batch of tweets."""
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': json.dumps(tweets)}
    ]
    try:
        response = executor(
            client.chat.completions.create,
            model=model,
            messages=messages,
            temperature=0.7,
            response_format={"type": "json_object", "schema": schema},
            max_tokens=8192
        )
        return {'batch_size': len(tweets), 'response': response.choices[0].message.content}
    except Exception as e:
        return {'batch_size': len(tweets), 'error': str(e)}




def parallel_process_tweets(data, batch_size, max_workers, model, schema):
    """Parallel processing of tweets with rate-limiting."""
    executor = rate_limited_executor(max_calls_per_minute=60)
    results = []


    def process_and_collect(batch):
        tweets = batch['Tweet_text'].tolist()
        return process_batch(tweets, executor, model, schema, batch_size)


    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        batches = [data.iloc[i:i + batch_size] for i in range(0, len(data), batch_size)]
        futures = [pool.submit(process_and_collect, batch) for batch in batches]
        for future in futures:
            results.append(future.result())


    return pd.DataFrame(results)




def save_results(data, filename):
    """Save data to JSON file."""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"Results saved in {filename}")
    except Exception as e:
        print(f"Error during saving: {e}")




# Assuming these are already defined in your previous code
# SYSTEM_PROMPT
# client (OpenAI client)
# model
# json_schema
# data_label_1 (DataFrame filtered for label 1)


# Process data_label_1 and save results
processed_tweets_label_1 = parallel_process_tweets(
    data=data_label_1,      # Use the data_label_1 DataFrame
    batch_size=10,          # Batch size
    max_workers=64,         # Number of parallel threads
    model=model,            # Your model variable
    schema=json_schema      # Your JSON schema variable
)


# Save the results to label_1_1.json
save_results(processed_tweets_label_1.to_dict(orient="records"), "label_1_1.json")

Results saved in label_1_1.json


In [12]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np


def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []


def create_theme_network(data_bot, data_nonbot, min_confidence=90):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()


    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    response_data = json.loads(response)
                    themes = response_data.get('themes', [])


                    for theme in themes:
                        confidence = theme.get('confidence', 0)
                        if confidence >= min_confidence:
                            theme_name = theme['name']
                            differences = theme.get('differences', [])
                            heaviness = theme.get('heaviness', 0)


                            # Add or update theme node
                            if not G.has_node(theme_name):
                                G.add_node(theme_name,
                                           type='theme',
                                           is_bot=is_bot,
                                           confidence=confidence,
                                           differences=differences,
                                           heaviness=heaviness)
                            else:
                                # If node exists and bot status is different, mark as mixed
                                current_bot_status = G.nodes[theme_name].get('is_bot')
                                if current_bot_status != is_bot:
                                    G.nodes[theme_name]['is_bot'] = 'mixed'


                except json.JSONDecodeError:
                    print(f"Invalid JSON in response: {response}")


    # Process bot and non-bot data
    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)


    return G


def visualize_network(G):
    """Visualize bot, non-bot, and mixed themes in clustered regions."""
    # Separate nodes by bot status
    bot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == True]
    nonbot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == False]
    mixed_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == 'mixed']


    # Create traces for each category
    traces = []


    # Bot nodes - clustered in bottom left
    if bot_nodes:
        bot_x = np.random.normal(loc=-5, scale=2, size=len(bot_nodes))
        bot_y = np.random.normal(loc=-5, scale=2, size=len(bot_nodes))
        bot_trace = go.Scatter(
            x=bot_x,
            y=bot_y,
            mode='markers+text',
            text=bot_nodes,
            textposition="top center",
            marker=dict(
                size=10,
                color='rgb(255,0,0)',  # Red
                line=dict(width=2, color='rgb(139,0,0)')  # Dark red
            ),
            name='Bot Themes'
        )
        traces.append(bot_trace)


    # Non-bot nodes - clustered in top right
    if nonbot_nodes:
        nonbot_x = np.random.normal(loc=5, scale=2, size=len(nonbot_nodes))
        nonbot_y = np.random.normal(loc=5, scale=2, size=len(nonbot_nodes))
        nonbot_trace = go.Scatter(
            x=nonbot_x,
            y=nonbot_y,
            mode='markers+text',
            text=nonbot_nodes,
            textposition="top center",
            marker=dict(
                size=10,
                color='rgb(0,255,0)',  # Green
                line=dict(width=2, color='rgb(0,100,0)')  # Dark green
            ),
            name='Non-Bot Themes'
        )
        traces.append(nonbot_trace)


    # Mixed nodes - clustered in center
    if mixed_nodes:
        mixed_x = np.random.normal(loc=0, scale=2, size=len(mixed_nodes))
        mixed_y = np.random.normal(loc=0, scale=2, size=len(mixed_nodes))
        mixed_trace = go.Scatter(
            x=mixed_x,
            y=mixed_y,
            mode='markers+text',
            text=mixed_nodes,
            textposition="top center",
            marker=dict(
                size=10,
                color='rgb(128,0,128)',  # Purple
                line=dict(width=2, color='rgb(75,0,130)')  # Indigo
            ),
            name='Mixed Themes'
        )
        traces.append(mixed_trace)


    # Create figure
    fig = go.Figure(data=traces)


    # Update layout for better visualization
    fig.update_layout(
        title="Theme Network Visualization",
        showlegend=True,
        hovermode='closest',
        width=1000,
        height=800,
        margin=dict(b=20, l=20, r=20, t=60),
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )


    # Print statistics
    print(f"Total Themes: {len(G.nodes())}")
    print(f"Bot Themes: {len(bot_nodes)}")
    print(f"Non-Bot Themes: {len(nonbot_nodes)}")
    print(f"Mixed Themes: {len(mixed_nodes)}")


    # Show the figure
    fig.show()


# Load the JSON files
data_label_0 = load_json_file('label_0_1.json')  # Bot data
data_label_1 = load_json_file('label_1_1.json')  # Non-bot data


# Create the network
high_confidence_network = create_theme_network(data_label_0, data_label_1, min_confidence=90)


# Visualize the network
visualize_network(high_confidence_network)

Total Themes: 0
Bot Themes: 0
Non-Bot Themes: 0
Mixed Themes: 0


In [14]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np




def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []




def create_theme_network(data, min_confidence=50):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()


    for item in data:
        response = item.get('response')
        if isinstance(response, str):
            try:
                # Parse the response string as a dictionary
                response_data = json.loads(response)
                themes = response_data.get('overall_themes', [])


                for theme in themes:
                    confidence = theme.get('confidence', 0)
                    if confidence >= min_confidence:
                        theme_name = theme['name']
                        centrality = theme.get('centrality', {})
                        keywords = theme.get('keywords', [])
                        sentiment = theme.get('sentiment_distribution', {})


                        # Add node with detailed attributes
                        G.add_node(theme_name, 
                                   confidence=confidence,
                                   centrality_weight=centrality.get('weight', 0),
                                   keywords=keywords,
                                   positive_sentiment=sentiment.get('positive_percentage', 0),
                                   negative_sentiment=sentiment.get('negative_percentage', 0),
                                   neutral_sentiment=sentiment.get('neutral_percentage', 0))


            except (json.JSONDecodeError, TypeError) as e:
                print(f"Error processing response: {e}")


    return G




def safe_color_scale(sentiment):
    """
    Generate a safe color based on sentiment percentage
    Ensures values are between 0 and 255
    """
    # Normalize sentiment to a safe color range
    r = min(max(int(100 + (sentiment * 1.55)), 0), 255)
    g = min(max(int(100 - abs(50 - sentiment)), 0), 255)
    b = min(max(int(100 - (sentiment * 1.55)), 0), 255)
    
    return f'rgb({r},{g},{b})'




def visualize_network(G):
    """Visualize themes with node sizes and colors representing different attributes."""
    # Prepare node attributes
    node_names = list(G.nodes())
    
    if not node_names:
        print("No themes to visualize.")
        return


    node_confidences = [G.nodes[node]['confidence'] for node in node_names]
    node_centralities = [G.nodes[node]['centrality_weight'] for node in node_names]
    
    # Normalize node sizes based on confidence and centrality
    max_confidence = max(node_confidences)
    max_centrality = max(node_centralities)
    
    node_sizes = [
        10 + ((G.nodes[node]['confidence'] / max_confidence) * 30) + 
        ((G.nodes[node]['centrality_weight'] / max_centrality) * 20) 
        for node in node_names
    ]
    
    # Create color gradient based on sentiment
    node_colors = [
        safe_color_scale(G.nodes[node]["positive_sentiment"] / 100) 
        for node in node_names
    ]


    # Create traces
    trace = go.Scatter(
        x=np.random.rand(len(node_names)) * 10,  # Random x positions
        y=np.random.rand(len(node_names)) * 10,  # Random y positions
        mode='markers+text',
        text=node_names,
        textposition="top center",
        marker=dict(
            size=node_sizes,
            color=node_colors,
            line=dict(width=1, color='DarkSlateGrey')
        ),
        hovertext=[
            f"Theme: {node}<br>"
            f"Confidence: {G.nodes[node]['confidence']}%<br>"
            f"Centrality: {G.nodes[node]['centrality_weight']}<br>"
            f"Keywords: {', '.join(G.nodes[node]['keywords'])}<br>"
            f"Sentiment: +{G.nodes[node]['positive_sentiment']}% "
            f"0{G.nodes[node]['neutral_sentiment']}% "
            f"-{G.nodes[node]['negative_sentiment']}%"
            for node in node_names
        ],
        hoverinfo='text'
    )


    # Create figure
    fig = go.Figure(data=[trace])


    # Update layout
    fig.update_layout(
        title="Theme Network Visualization",
        showlegend=False,
        hovermode='closest',
        width=1000,
        height=800,
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )


    # Print theme statistics
    print(f"Total Themes: {len(node_names)}")
    for node in node_names:
        print(f"Theme: {node}")
        print(f"  Confidence: {G.nodes[node]['confidence']}%")
        print(f"  Keywords: {', '.join(G.nodes[node]['keywords'])}")
        print(f"  Sentiment: +{G.nodes[node]['positive_sentiment']}% 0{G.nodes[node]['neutral_sentiment']}% -{ G.nodes[node]['negative_sentiment']}%\n")


    # Show the figure
    fig.show()




# Load the JSON files
data_label_0 = load_json_file('label_0_1.json')  # Bot data
data_label_1 = load_json_file('label_1_1.json')  # Non-bot data


# Combine and process the data
combined_data = data_label_0 + data_label_1


# Create the network
theme_network = create_theme_network(combined_data, min_confidence=50)


# Visualize the network
visualize_network(theme_network)

Total Themes: 301
Theme: Technology and Coding
  Confidence: 85%
  Keywords: VHDL, Verilog, programming, computer science
  Sentiment: +0% 0100% -0%

Theme: Business and Finance
  Confidence: 80%
  Keywords: business, finance, entrepreneurship, venture
  Sentiment: +55% 045% -0%

Theme: Culture and Entertainment
  Confidence: 70%
  Keywords: art, haiku, Funko, Netflix, movie, God, amazing, experiences
  Sentiment: +60% 040% -0%

Theme: Sports and Fitness
  Confidence: 50%
  Keywords: running, 13.1 miles, nychalf, marathon, @eddieizzard
  Sentiment: +100% 00% -0%

Theme: Science and Exploration
  Confidence: 55%
  Keywords: planet, bushes, ringed fields, gopher, fin, science
  Sentiment: +0% 0100% -0%

Theme: Social Interactions
  Confidence: 50%
  Keywords: HeyYou, BoughtIt, ICYMI, RT
  Sentiment: +40% 060% -0%

Theme: Technology and Computing
  Confidence: 70%
  Keywords: Atari, Syzygy, gaming, software
  Sentiment: +20% 060% -20%

Theme: Emotional Expression and Art
  Confidence: 60%

In [15]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np








def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []








def create_theme_network(data_bot, data_nonbot, min_confidence=50):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()




    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    # Parse the response string as a dictionary
                    response_data = json.loads(response)
                    themes = response_data.get('overall_themes', [])




                    for theme in themes:
                        confidence = theme.get('confidence', 0)
                        if confidence >= min_confidence:
                            theme_name = theme['name']
                            centrality = theme.get('centrality', {})
                            keywords = theme.get('keywords', [])
                            sentiment = theme.get('sentiment_distribution', {})




                            # Check if theme already exists
                            if theme_name in G:
                                # If theme exists, mark as mixed
                                G.nodes[theme_name]['type'] = 'mixed'
                            else:
                                # Add node with detailed attributes
                                G.add_node(theme_name, 
                                           confidence=confidence,
                                           centrality_weight=centrality.get('weight', 0),
                                           keywords=keywords,
                                           positive_sentiment=sentiment.get('positive_percentage', 0),
                                           negative_sentiment=sentiment.get('negative_percentage', 0),
                                           neutral_sentiment=sentiment.get('neutral_percentage', 0),
                                           type='bot' if is_bot else 'nonbot')




                except (json.JSONDecodeError, TypeError) as e:
                    print(f"Error processing response: {e}")




    # Process both bot and non-bot data
    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)




    return G








def visualize_network(G):
    """Visualize themes with node sizes and colors representing different attributes."""
    # Prepare node attributes
    node_names = list(G.nodes())
    
    if not node_names:
        print("No themes to visualize.")
        return




    # Prepare node attributes
    node_confidences = [G.nodes[node]['confidence'] for node in node_names]
    node_centralities = [G.nodes[node]['centrality_weight'] for node in node_names]
    
    # Normalize node sizes based on confidence and centrality
    max_confidence = max(node_confidences)
    max_centrality = max(node_centralities)
    
    node_sizes = [
        10 + ((G.nodes[node]['confidence'] / max_confidence) * 30) + 
        ((G.nodes[node]['centrality_weight'] / max_centrality) * 20) 
        for node in node_names
    ]
    
    # Color coding for different theme types
    def get_node_color(node):
        theme_type = G.nodes[node].get('type', 'unknown')
        sentiment = G.nodes[node]["positive_sentiment"] / 100
        
        if theme_type == 'bot':
            # Blue shades for bot themes
            r = min(max(int(50 + (sentiment * 155)), 0), 255)
            return f'rgb({r},100,255)'
        elif theme_type == 'nonbot':
            # Red shades for non-bot themes
            g = min(max(int(50 + (sentiment * 155)), 0), 255)
            return f'rgb(255,{g},100)'
        else:
            # Mixed themes in purple
            return 'rgb(128,0,128)'
    
    node_colors = [get_node_color(node) for node in node_names]




    # Create traces
    trace = go.Scatter(
        x=np.random.rand(len(node_names)) * 10,  # Random x positions
        y=np.random.rand(len(node_names)) * 10,  # Random y positions
        mode='markers+text',
        text=node_names,
        textposition="top center",
        marker=dict(
            size=node_sizes,
            color=node_colors,
            line=dict(width=1, color='DarkSlateGrey')
        ),
        hovertext=[
            f"Theme: {node}<br>"
            f"Type: {G.nodes[node].get('type', 'unknown')}<br>"
            f"Confidence: {G.nodes[node]['confidence']}%<br>"
            f"Centrality: {G.nodes[node]['centrality_weight']}<br>"
            f"Keywords: {', '.join(G.nodes[node]['keywords'])}<br>"
            f"Sentiment: +{G.nodes[node]['positive_sentiment']}% "
            f"0{G.nodes[node]['neutral_sentiment']}% "
            f"-{G.nodes[node]['negative_sentiment']}%"
            for node in node_names
        ],
        hoverinfo='text'
    )




    # Create figure
    fig = go.Figure(data=[trace])




    # Update layout
    fig.update_layout(
        title="Theme Network Visualization (Blue: Bot, Red: Non-Bot, Purple: Mixed)",
        showlegend=False,
        hovermode='closest',
        width=1000,
        height=800,
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )




    # Print theme statistics
    print("Theme Statistics:")
    bot_themes = [node for node in node_names if G.nodes[node].get('type') == 'bot']
    nonbot_themes = [node for node in node_names if G.nodes[node].get('type') == 'nonbot']
    mixed_themes = [node for node in node_names if G.nodes[node].get('type') == 'mixed']
    
    print(f"Total Themes: {len(node_names)}")
    print(f"Bot Themes: {len(bot_themes)}")
    print(f"Non-Bot Themes: {len(nonbot_themes)}")
    print(f"Mixed Themes: {len(mixed_themes)}")
    
    print("\nBot Themes:")
    for node in bot_themes:
        print(f"  {node}")
    
    print("\nNon-Bot Themes:")
    for node in nonbot_themes:
        print(f"  {node}")
    
    print("\nMixed Themes:")
    for node in mixed_themes:
        print(f"  {node}")




    # Show the figure
    fig.show()








# Load the JSON files
data_label_0 = load_json_file('label_0_1.json')  # Bot data
data_label_1 = load_json_file('label_1_1.json')  # Non-bot data




# Create the network
theme_network = create_theme_network(data_label_0, data_label_1, min_confidence=50)




# Visualize the network
visualize_network(theme_network)

Theme Statistics:
Total Themes: 301
Bot Themes: 80
Non-Bot Themes: 176
Mixed Themes: 45

Bot Themes:
  Technology and Coding
  Culture and Entertainment
  Science and Exploration
  Emotional Expression and Art
  Philosophy and Culture
  Philosophical Musings and Wordplay
  Health and Fitness
  Emotions and Mental State
  Weather and Environment
  Social Interactions and Relationships
  Tech and Software
  Security and Vulnerability
  Cryptocurrency
  Love and Relationships
  Inspiration and Creativity
  Realism and Optimism
  Social Interaction
  Emotional Support and Concern
  Life and Relationships
  Celebrations and Hobbies
  Gaming
  Politics
  Science Fiction
  Humor
  Bitcoin Miners
  Space Exploration and Discovery
  Pop Culture and Entertainment
  Language and Sarcasm
  Criticisms and Negativity
  Lightheartedness and Humor
  Politics and Justice
  Lifestyle and Interests
  Uncategorized
  Social Media and Communication
  Inspirational/Motivational
  Politics and World Events
 

In [18]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np


def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []


def create_theme_network(data_bot, data_nonbot, min_confidence=90):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()


    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    response_data = json.loads(response)
                    themes = response_data.get('themes', [])


                    for theme in themes:
                        confidence = theme.get('confidence', 0)
                        if confidence >= min_confidence:
                            theme_name = theme['name']


                            # Add or update theme node
                            if not G.has_node(theme_name):
                                G.add_node(theme_name,
                                           type='theme',
                                           is_bot=is_bot,
                                           confidence=confidence)
                            else:
                                # If node exists and bot status is different, mark as mixed
                                current_bot_status = G.nodes[theme_name].get('is_bot')
                                if current_bot_status != is_bot:
                                    G.nodes[theme_name]['is_bot'] = 'mixed'


                except json.JSONDecodeError:
                    print(f"Invalid JSON in response: {response}")


    # Process bot and non-bot data
    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)


    return G


def visualize_network(G):
    """Visualize bot, non-bot, and mixed themes in clustered regions."""
    # Separate nodes by bot status
    bot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == True]
    nonbot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == False]
    mixed_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == 'mixed']


    # Create traces for each category
    traces = []


    # Bot nodes - clustered in bottom left
    if bot_nodes:
        bot_x = np.random.normal(loc=-5, scale=1, size=len(bot_nodes))
        bot_y = np.random.normal(loc=-5, scale=1, size=len(bot_nodes))
        bot_trace = go.Scatter(
            x=bot_x,
            y=bot_y,
            mode='markers+text',
            text=bot_nodes,
            textposition="top center",
            marker=dict(
                size=8,  # Smaller size
                color='rgb(255,0,0)',  # Red
                line=dict(width=1, color='rgb(139,0,0)')  # Dark red
            ),
            name='Bot Themes'
        )
        traces.append(bot_trace)


    # Non-bot nodes - clustered in top right
    if nonbot_nodes:
        nonbot_x = np.random.normal(loc=5, scale=1, size=len(nonbot_nodes))
        nonbot_y = np.random.normal(loc=5, scale=1, size=len(nonbot_nodes))
        nonbot_trace = go.Scatter(
            x=nonbot_x,
            y=nonbot_y,
            mode='markers+text',
            text=nonbot_nodes,
            textposition="top center",
            marker=dict(
                size=8,  # Smaller size
                color='rgb(0,255,0)',  # Green
                line=dict(width=1, color='rgb(0,100,0)')  # Dark green
            ),
            name='Non-Bot Themes'
        )
        traces.append(nonbot_trace)


    # Mixed nodes - clustered in center
    if mixed_nodes:
        mixed_x = np.random.normal(loc=0, scale=1, size=len(mixed_nodes))
        mixed_y = np.random.normal(loc=0, scale=1, size=len(mixed_nodes))
        mixed_trace = go.Scatter(
            x=mixed_x,
            y=mixed_y,
            mode='markers+text',
            text=mixed_nodes,
            textposition="top center",
            marker=dict(
                size=8,  # Smaller size
                color='rgb(128,0,128)',  # Purple
                line=dict(width=1, color='rgb(75,0,130)')  # Indigo
            ),
            name='Mixed Themes'
        )
        traces.append(mixed_trace)


    # Create figure
    fig = go.Figure(data=traces)


    # Update layout for better visualization
    fig.update_layout(
        title="Theme Network Visualization",
        showlegend=True,
        hovermode='closest',
        width=1000,
        height=800,
        margin=dict(b=20, l=20, r=20, t=60),
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )


    # Print statistics
    print(f"Total Themes: {len(G.nodes())}")
    print(f"Bot Themes: {len(bot_nodes)}")
    print(f"Non-Bot Themes: {len(nonbot_nodes)}")
    print(f"Mixed Themes: {len(mixed_nodes)}")


    # Show the figure
    fig.show()


# Load the JSON files
data_label_0 = load_json_file('label_0.json')  # Bot data
data_label_1 = load_json_file('label_1.json')  # Non-bot data


# Create the network
high_confidence_network = create_theme_network(data_label_0, data_label_1, min_confidence=90)


# Visualize the network
visualize_network(high_confidence_network)

Total Themes: 214
Bot Themes: 112
Non-Bot Themes: 90
Mixed Themes: 12


In [21]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np








def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []








def create_theme_network(data_bot, data_nonbot, min_confidence=50):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()




    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    # Parse the response string as a dictionary
                    response_data = json.loads(response)
                    themes = response_data.get('overall_themes', [])




                    for theme in themes:
                        confidence = theme.get('confidence', 0)
                        if confidence >= min_confidence:
                            theme_name = theme['name']
                            centrality = theme.get('centrality', {})
                            keywords = theme.get('keywords', [])
                            sentiment = theme.get('sentiment_distribution', {})




                            # Check if theme already exists
                            if theme_name in G:
                                # If theme exists, mark as mixed
                                G.nodes[theme_name]['type'] = 'mixed'
                            else:
                                # Add node with detailed attributes
                                G.add_node(theme_name, 
                                           confidence=confidence,
                                           centrality_weight=centrality.get('weight', 0),
                                           keywords=keywords,
                                           positive_sentiment=sentiment.get('positive_percentage', 0),
                                           negative_sentiment=sentiment.get('negative_percentage', 0),
                                           neutral_sentiment=sentiment.get('neutral_percentage', 0),
                                           type='bot' if is_bot else 'nonbot')




                except (json.JSONDecodeError, TypeError) as e:
                    print(f"Error processing response: {e}")




    # Process both bot and non-bot data
    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)




    return G








def visualize_network(G):
    """Visualize themes with node sizes and colors representing different attributes."""
    # Prepare node attributes
    node_names = list(G.nodes())
    
    if not node_names:
        print("No themes to visualize.")
        return




    # Prepare node attributes
    node_confidences = [G.nodes[node]['confidence'] for node in node_names]
    node_centralities = [G.nodes[node]['centrality_weight'] for node in node_names]
    
    # Normalize node sizes based on confidence and centrality
    max_confidence = max(node_confidences)
    max_centrality = max(node_centralities)
    
    node_sizes = [
        10 + ((G.nodes[node]['confidence'] / max_confidence) * 30) + 
        ((G.nodes[node]['centrality_weight'] / max_centrality) * 20) 
        for node in node_names
    ]
    
    # Color coding for different theme types
    def get_node_color(node):
        theme_type = G.nodes[node].get('type', 'unknown')
        sentiment = G.nodes[node]["positive_sentiment"] / 100
        
        if theme_type == 'bot':
            # Blue shades for bot themes
            r = min(max(int(50 + (sentiment * 155)), 0), 255)
            return f'rgb({r},100,255)'
        elif theme_type == 'nonbot':
            # Red shades for non-bot themes
            g = min(max(int(50 + (sentiment * 155)), 0), 255)
            return f'rgb(255,{g},100)'
        else:
            # Mixed themes in purple
            return 'rgb(128,0,128)'
    
    node_colors = [get_node_color(node) for node in node_names]




    # Create traces
    trace = go.Scatter(
        x=np.random.rand(len(node_names)) * 10,  # Random x positions
        y=np.random.rand(len(node_names)) * 10,  # Random y positions
        mode='markers+text',
        text=node_names,
        textposition="top center",
        marker=dict(
            size=node_sizes,
            color=node_colors,
            line=dict(width=1, color='DarkSlateGrey')
        ),
        hovertext=[
            f"Theme: {node}<br>"
            f"Type: {G.nodes[node].get('type', 'unknown')}<br>"
            f"Confidence: {G.nodes[node]['confidence']}%<br>"
            f"Centrality: {G.nodes[node]['centrality_weight']}<br>"
            f"Keywords: {', '.join(G.nodes[node]['keywords'])}<br>"
            f"Sentiment: +{G.nodes[node]['positive_sentiment']}% "
            f"0{G.nodes[node]['neutral_sentiment']}% "
            f"-{G.nodes[node]['negative_sentiment']}%"
            for node in node_names
        ],
        hoverinfo='text'
    )




    # Create figure
    fig = go.Figure(data=[trace])




    # Update layout
    fig.update_layout(
        title="Theme Network Visualization (Blue: Bot, Red: Non-Bot, Purple: Mixed)",
        showlegend=False,
        hovermode='closest',
        width=1000,
        height=800,
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )




    # Print theme statistics
    print("Theme Statistics:")
    bot_themes = [node for node in node_names if G.nodes[node].get('type') == 'bot']
    nonbot_themes = [node for node in node_names if G.nodes[node].get('type') == 'nonbot']
    mixed_themes = [node for node in node_names if G.nodes[node].get('type') == 'mixed']
    
    print(f"Total Themes: {len(node_names)}")
    print(f"Bot Themes: {len(bot_themes)}")
    print(f"Non-Bot Themes: {len(nonbot_themes)}")
    print(f"Mixed Themes: {len(mixed_themes)}")
    
    print("\nBot Themes:")
    for node in bot_themes:
        print(f"  {node}")
    
    print("\nNon-Bot Themes:")
    for node in nonbot_themes:
        print(f"  {node}")
    
    print("\nMixed Themes:")
    for node in mixed_themes:
        print(f"  {node}")




    # Show the figure
    fig.show()








# Load the JSON files
data_label_0 = load_json_file('label_0_1.json')  # Bot data
data_label_1 = load_json_file('label_1_1.json')  # Non-bot data




# Create the network
theme_network = create_theme_network(data_label_0, data_label_1, min_confidence=50)




# Visualize the network
visualize_network(theme_network)

Theme Statistics:
Total Themes: 301
Bot Themes: 80
Non-Bot Themes: 176
Mixed Themes: 45

Bot Themes:
  Technology and Coding
  Culture and Entertainment
  Science and Exploration
  Emotional Expression and Art
  Philosophy and Culture
  Philosophical Musings and Wordplay
  Health and Fitness
  Emotions and Mental State
  Weather and Environment
  Social Interactions and Relationships
  Tech and Software
  Security and Vulnerability
  Cryptocurrency
  Love and Relationships
  Inspiration and Creativity
  Realism and Optimism
  Social Interaction
  Emotional Support and Concern
  Life and Relationships
  Celebrations and Hobbies
  Gaming
  Politics
  Science Fiction
  Humor
  Bitcoin Miners
  Space Exploration and Discovery
  Pop Culture and Entertainment
  Language and Sarcasm
  Criticisms and Negativity
  Lightheartedness and Humor
  Politics and Justice
  Lifestyle and Interests
  Uncategorized
  Social Media and Communication
  Inspirational/Motivational
  Politics and World Events
 

In [22]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np




def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []




def create_theme_network(data_bot, data_nonbot, min_confidence=90):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()




    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    response_data = json.loads(response)
                    themes = response_data.get('themes', [])




                    for theme in themes:
                        confidence = theme.get('confidence', 0)
                        if confidence >= min_confidence:
                            theme_name = theme['name']
                            keywords = theme.get('keywords', [])
                            sentiment = theme.get('sentiment', {})




                            # Add or update theme node
                            if not G.has_node(theme_name):
                                G.add_node(theme_name,
                                           type='theme',
                                           is_bot=is_bot,
                                           confidence=confidence,
                                           keywords=keywords,
                                           sentiment=sentiment)
                            else:
                                # If node exists and bot status is different, mark as mixed
                                current_bot_status = G.nodes[theme_name].get('is_bot')
                                if current_bot_status != is_bot:
                                    G.nodes[theme_name]['is_bot'] = 'mixed'




                except json.JSONDecodeError:
                    print(f"Invalid JSON in response: {response}")




    # Process bot and non-bot data
    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)




    return G




def visualize_network(G):
    """Visualize bot, non-bot, and mixed themes in clustered regions."""
    # Separate nodes by bot status
    bot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == True]
    nonbot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == False]
    mixed_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == 'mixed']




    # Create traces for each category
    traces = []




    # Bot nodes - clustered in bottom left
    if bot_nodes:
        bot_x = np.random.normal(loc=-5, scale=1, size=len(bot_nodes))
        bot_y = np.random.normal(loc=-5, scale=1, size=len(bot_nodes))
        bot_trace = go.Scatter(
            x=bot_x,
            y=bot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Bot<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in bot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=8,  # Smaller size
                color='rgb(255,0,0)',  # Red
                line=dict(width=1, color='rgb(139,0,0)')  # Dark red
            ),
            name='Bot Themes'
        )
        traces.append(bot_trace)




    # Non-bot nodes - clustered in top right
    if nonbot_nodes:
        nonbot_x = np.random.normal(loc=5, scale=1, size=len(nonbot_nodes))
        nonbot_y = np.random.normal(loc=5, scale=1, size=len(nonbot_nodes))
        nonbot_trace = go.Scatter(
            x=nonbot_x,
            y=nonbot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Non-Bot<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in nonbot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=8,  # Smaller size
                color='rgb(0,255,0)',  # Green
                line=dict(width=1, color='rgb(0,100,0)')  # Dark green
            ),
            name='Non-Bot Themes'
        )
        traces.append(nonbot_trace)




    # Mixed nodes - clustered in center
    if mixed_nodes:
        mixed_x = np.random.normal(loc=0, scale=1, size=len(mixed_nodes))
        mixed_y = np.random.normal(loc=0, scale=1, size=len(mixed_nodes))
        mixed_trace = go.Scatter(
            x=mixed_x,
            y=mixed_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Mixed<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in mixed_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=8,  # Smaller size
                color='rgb(128,0,128)',  # Purple
                line=dict(width=1, color='rgb(75,0,130)')  # Indigo
            ),
            name='Mixed Themes'
        )
        traces.append(mixed_trace)




    # Create figure
    fig = go.Figure(data=traces)




    # Update layout for better visualization
    fig.update_layout(
        title="Theme Network Visualization",
        showlegend=True,
        hovermode='closest',
        width=1000,
        height=800,
        margin=dict(b=20, l=20, r=20, t=60),
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )




    # Print statistics
    print(f"Total Themes: {len(G.nodes())}")
    print(f"Bot Themes: {len(bot_nodes)}")
    print(f"Non-Bot Themes: {len(nonbot_nodes)}")
    print(f"Mixed Themes: {len(mixed_nodes)}")




    # Show the figure
    fig.show()




# Load the JSON files
data_label_0 = load_json_file('label_0.json')  # Bot data
data_label_1 = load_json_file('label_1.json')  # Non-bot data




# Create the network
high_confidence_network = create_theme_network(data_label_0, data_label_1, min_confidence=90)




# Visualize the network
visualize_network(high_confidence_network)

Total Themes: 214
Bot Themes: 112
Non-Bot Themes: 90
Mixed Themes: 12


In [23]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np




def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []




def create_theme_network(data_bot, data_nonbot, min_confidence=90):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()


    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    response_data = json.loads(response)
                    themes = response_data.get('themes', [])


                    for theme in themes:
                        confidence = theme.get('confidence', 0)
                        if confidence >= min_confidence:
                            theme_name = theme['name']
                            keywords = theme.get('keywords', [])
                            sentiment = theme.get('sentiment', {})


                            # Add or update theme node
                            if not G.has_node(theme_name):
                                G.add_node(theme_name,
                                           type='theme',
                                           is_bot=is_bot,
                                           confidence=confidence,
                                           keywords=keywords,
                                           sentiment=sentiment)
                            else:
                                # If node exists and bot status is different, mark as mixed
                                current_bot_status = G.nodes[theme_name].get('is_bot')
                                if current_bot_status != is_bot:
                                    G.nodes[theme_name]['is_bot'] = 'mixed'


                except json.JSONDecodeError:
                    print(f"Invalid JSON in response: {response}")


    # Process bot and non-bot data
    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)


    return G




def visualize_network(G):
    """Visualize bot, non-bot, and mixed themes in clustered regions."""
    # Separate nodes by bot status
    bot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == True]
    nonbot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == False]
    mixed_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == 'mixed']


    # Create traces for each category
    traces = []


    # Function to determine node size based on confidence
    def get_node_size(confidence):
        return 5 + confidence / 10  # Adjust the scaling factor as needed


    # Bot nodes - clustered in bottom left
    if bot_nodes:
        bot_x = np.random.normal(loc=-5, scale=1, size=len(bot_nodes))
        bot_y = np.random.normal(loc=-5, scale=1, size=len(bot_nodes))
        bot_sizes = [get_node_size(G.nodes[node]['confidence']) for node in bot_nodes]
        bot_trace = go.Scatter(
            x=bot_x,
            y=bot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Bot<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in bot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=bot_sizes,
                color='rgb(255,0,0)',  # Red
                line=dict(width=1, color='rgb(139,0,0)')  # Dark red
            ),
            name='Bot Themes'
        )
        traces.append(bot_trace)


    # Non-bot nodes - clustered in top right
    if nonbot_nodes:
        nonbot_x = np.random.normal(loc=5, scale=1, size=len(nonbot_nodes))
        nonbot_y = np.random.normal(loc=5, scale=1, size=len(nonbot_nodes))
        nonbot_sizes = [get_node_size(G.nodes[node]['confidence']) for node in nonbot_nodes]
        nonbot_trace = go.Scatter(
            x=nonbot_x,
            y=nonbot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Non-Bot<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in nonbot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=nonbot_sizes,
                color='rgb(0,255,0)',  # Green
                line=dict(width=1, color='rgb(0,100,0)')  # Dark green
            ),
            name='Non-Bot Themes'
        )
        traces.append(nonbot_trace)


    # Mixed nodes - clustered in center
    if mixed_nodes:
        mixed_x = np.random.normal(loc=0, scale=1, size=len(mixed_nodes))
        mixed_y = np.random.normal(loc=0, scale=1, size=len(mixed_nodes))
        mixed_sizes = [get_node_size(G.nodes[node]['confidence']) for node in mixed_nodes]
        mixed_trace = go.Scatter(
            x=mixed_x,
            y=mixed_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Mixed<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in mixed_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=mixed_sizes,
                color='rgb(128,0,128)',  # Purple
                line=dict(width=1, color='rgb(75,0,130)')  # Indigo
            ),
            name='Mixed Themes'
        )
        traces.append(mixed_trace)


    # Create figure
    fig = go.Figure(data=traces)


    # Update layout for better visualization
    fig.update_layout(
        title="Theme Network Visualization",
        showlegend=True,
        hovermode='closest',
        width=1000,
        height=800,
        margin=dict(b=20, l=20, r=20, t=60),
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )


    # Print statistics
    print(f"Total Themes: {len(G.nodes())}")
    print(f"Bot Themes: {len(bot_nodes)}")
    print(f"Non-Bot Themes: {len(nonbot_nodes)}")
    print(f"Mixed Themes: {len(mixed_nodes)}")


    # Show the figure
    fig.show()




# Load the JSON files
data_label_0 = load_json_file('label_0.json')  # Bot data
data_label_1 = load_json_file('label_1.json')  # Non-bot data


# Create the network
high_confidence_network = create_theme_network(data_label_0, data_label_1, min_confidence=90)


# Visualize the network
visualize_network(high_confidence_network)

Total Themes: 214
Bot Themes: 112
Non-Bot Themes: 90
Mixed Themes: 12


In [24]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np


def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []


def create_theme_network(data_bot, data_nonbot, min_confidence=90):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()


    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    response_data = json.loads(response)
                    themes = response_data.get('themes', [])
                    
                    for theme in themes:
                        confidence = theme.get('confidence', 0)
                        if confidence >= min_confidence:
                            theme_name = theme['name']
                            keywords = theme.get('keywords', [])
                            sentiment = theme.get('sentiment', {})


                            # Add or update theme node
                            if not G.has_node(theme_name):
                                G.add_node(theme_name,
                                           type='theme',
                                           is_bot=is_bot,
                                           confidence=confidence,
                                           keywords=keywords,
                                           sentiment=sentiment)
                            else:
                                # If node exists and bot status is different, mark as mixed
                                current_bot_status = G.nodes[theme_name].get('is_bot')
                                if current_bot_status != is_bot:
                                    G.nodes[theme_name]['is_bot'] = 'mixed'


                except json.JSONDecodeError:
                    print(f"Invalid JSON in response: {response}")


    # Process bot and non-bot data
    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)


    return G


def visualize_network(G):
    """Visualize bot, non-bot, and mixed themes in clustered regions."""
    # Calculate centrality
    try:
        centrality = nx.eigenvector_centrality(G)
    except nx.PowerIterationFailedConvergence:
        # Fallback to degree centrality if eigenvector centrality fails
        centrality = nx.degree_centrality(G)


    # Normalize centrality for sizing
    max_centrality = max(centrality.values()) if centrality else 1
    normalized_centrality = {node: (cent / max_centrality * 30) for node, cent in centrality.items()}


    # Separate nodes by bot status
    bot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == True]
    nonbot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == False]
    mixed_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == 'mixed']


    # Create traces for each category
    traces = []


    # Bot nodes - clustered in bottom left
    if bot_nodes:
        bot_x = np.random.normal(loc=-5, scale=1, size=len(bot_nodes))
        bot_y = np.random.normal(loc=-5, scale=1, size=len(bot_nodes))
        bot_trace = go.Scatter(
            x=bot_x,
            y=bot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Bot<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Centrality: {centrality.get(node, 0):.4f}<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in bot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=[normalized_centrality.get(node, 5) for node in bot_nodes],
                color='rgb(255,0,0)',  # Red
                line=dict(width=1, color='rgb(139,0,0)')  # Dark red
            ),
            name='Bot Themes'
        )
        traces.append(bot_trace)


    # Non-bot nodes - clustered in top right
    if nonbot_nodes:
        nonbot_x = np.random.normal(loc=5, scale=1, size=len(nonbot_nodes))
        nonbot_y = np.random.normal(loc=5, scale=1, size=len(nonbot_nodes))
        nonbot_trace = go.Scatter(
            x=nonbot_x,
            y=nonbot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Non-Bot<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Centrality: {centrality.get(node, 0):.4f}<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in nonbot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=[normalized_centrality.get(node, 5) for node in nonbot_nodes],
                color='rgb(0,255,0)',  # Green
                line=dict(width=1, color='rgb(0,100,0)')  # Dark green
            ),
            name='Non-Bot Themes'
        )
        traces.append(nonbot_trace)


    # Mixed nodes - clustered in center
    if mixed_nodes:
        mixed_x = np.random.normal(loc=0, scale=1, size=len(mixed_nodes))
        mixed_y = np.random.normal(loc=0, scale=1, size=len(mixed_nodes))
        mixed_trace = go.Scatter(
            x=mixed_x,
            y=mixed_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Mixed<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Centrality: {centrality.get(node, 0):.4f}<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in mixed_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=[normalized_centrality.get(node, 5) for node in mixed_nodes],
                color='rgb(128,0,128)',  # Purple
                line=dict(width=1, color='rgb(75,0,130)')  # Indigo
            ),
            name='Mixed Themes'
        )
        traces.append(mixed_trace)


    # Create figure
    fig = go.Figure(data=traces)


    # Update layout for better visualization
    fig.update_layout(
        title="Theme Network Visualization (90%+ Confidence)",
        showlegend=True,
        hovermode='closest',
        width=1000,
        height=800,
        margin=dict(b=20, l=20, r=20, t=60),
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-10, 10]
        ),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )


    # Print statistics
    print(f"Total Themes: {len(G.nodes())}")
    print(f"Bot Themes: {len(bot_nodes)}")
    print(f"Non-Bot Themes: {len(nonbot_nodes)}")
    print(f"Mixed Themes: {len(mixed_nodes)}")


    # Show the figure
    fig.show()


# Load the JSON files
data_label_0 = load_json_file('label_0.json')  # Bot data
data_label_1 = load_json_file('label_1.json')  # Non-bot data


# Create the network
high_confidence_network = create_theme_network(data_label_0, data_label_1, min_confidence=90)


# Visualize the network
visualize_network(high_confidence_network)

Total Themes: 214
Bot Themes: 112
Non-Bot Themes: 90
Mixed Themes: 12


In [28]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np


def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []


def create_theme_network(data_bot, data_nonbot, min_confidence=90):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()


    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    response_data = json.loads(response)
                    themes = response_data.get('themes', [])
                    for theme in themes:
                        confidence = theme.get('confidence', 0)
                        if confidence >= min_confidence:
                            theme_name = theme['name']
                            keywords = theme.get('keywords', [])
                            sentiment = theme.get('sentiment', {})
                            # Add or update theme node
                            if not G.has_node(theme_name):
                                G.add_node(theme_name,
                                           type='theme',
                                           is_bot=is_bot,
                                           confidence=confidence,
                                           keywords=keywords,
                                           sentiment=sentiment)
                            else:
                                # If node exists and bot status is different, mark as mixed
                                current_bot_status = G.nodes[theme_name].get('is_bot')
                                if current_bot_status != is_bot:
                                    G.nodes[theme_name]['is_bot'] = 'mixed'
                except json.JSONDecodeError:
                    print(f"Invalid JSON in response: {response}")


    # Process bot and non-bot data
    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)


    return G


def visualize_network(G):
    """Visualize bot, non-bot, and mixed themes in spread out regions."""
    # Check if the graph is empty
    if len(G.nodes) == 0:
        print("No nodes to visualize.")
        return


    # Calculate centrality
    try:
        centrality = nx.eigenvector_centrality(G)
    except nx.PowerIterationFailedConvergence:
        # Fallback to degree centrality if eigenvector centrality fails
        centrality = nx.degree_centrality(G)


    # Normalize centrality for sizing
    max_centrality = max(centrality.values()) if centrality else 1
    normalized_centrality = {node: (cent / max_centrality * 30) for node, cent in centrality.items()}


    # Separate nodes by bot status
    bot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == True]
    nonbot_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == False]
    mixed_nodes = [node for node, data in G.nodes(data=True) if data.get('is_bot') == 'mixed']


    # Create traces for each category
    traces = []


    # Bot nodes - clustered in bottom left with increased spacing
    if bot_nodes:
        bot_x = np.random.normal(loc=-10, scale=2, size=len(bot_nodes))
        bot_y = np.random.normal(loc=-10, scale=2, size=len(bot_nodes))
        bot_trace = go.Scatter(
            x=bot_x,
            y=bot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Bot<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Centrality: {centrality.get(node, 0):.4f}<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in bot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=[normalized_centrality.get(node, 5) for node in bot_nodes],
                color='rgb(255,0,0)',  # Red
                line=dict(width=1, color='rgb(139,0,0)')  # Dark red
            ),
            name='Bot Themes'
        )
        traces.append(bot_trace)


    # Non-bot nodes - clustered in top right with increased spacing
    if nonbot_nodes:
        nonbot_x = np.random.normal(loc=10, scale=2, size=len(nonbot_nodes))
        nonbot_y = np.random.normal(loc=10, scale=2, size=len(nonbot_nodes))
        nonbot_trace = go.Scatter(
            x=nonbot_x,
            y=nonbot_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Non-Bot<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Centrality: {centrality.get(node, 0):.4f}<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in nonbot_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=[normalized_centrality.get(node, 5) for node in nonbot_nodes],
                color='rgb(0,255,0)',  # Green
                line=dict(width=1, color='rgb(0,100,0)')  # Dark green
            ),
            name='Non-Bot Themes'
        )
        traces.append(nonbot_trace)


    # Mixed nodes - clustered in center with increased spacing
    if mixed_nodes:
        mixed_x = np.random.normal(loc=0, scale=2, size=len(mixed_nodes))
        mixed_y = np.random.normal(loc=0, scale=2, size=len(mixed_nodes))
        mixed_trace = go.Scatter(
            x=mixed_x,
            y=mixed_y,
            mode='markers',
            hovertext=[
                f"Theme: {node}<br>"
                f"Type: Mixed<br>"
                f"Confidence: {G.nodes[node]['confidence']}%<br>"
                f"Centrality: {centrality.get(node, 0):.4f}<br>"
                f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
                f"Sentiment: {G.nodes[node].get('sentiment', 'N/A')}"
                for node in mixed_nodes
            ],
            hoverinfo='text',
            marker=dict(
                size=[normalized_centrality.get(node, 5) for node in mixed_nodes],
                color='rgb(128,0,128)',  # Purple
                line=dict(width=1, color='rgb(75,0,130)')  # Indigo
            ),
            name='Mixed Themes'
        )
        traces.append(mixed_trace)


    # Create figure
    fig = go.Figure(data=traces)


    # Update layout for better visualization
    fig.update_layout(
        title="Theme Network Visualization (90%+ Confidence)",
        showlegend=True,
        hovermode='closest',
        width=1200,
        height=1000,
        margin=dict(b=20, l=20, r=20, t=60),
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-15, 15]  # Expanded range
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-15, 15]  # Expanded range
        ),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )


    # Print statistics
    print(f"Total Themes: {len(G.nodes())}")
    print(f"Bot Themes: {len(bot_nodes)}")
    print(f"Non-Bot Themes: {len(nonbot_nodes)}")
    print(f"Mixed Themes: {len(mixed_nodes)}")


    # Show the figure
    fig.show()


# Load the JSON files
data_label_0 = load_json_file('label_0.json')  # Bot data
data_label_1 = load_json_file('label_1.json')  # Non-bot data


# Create the network
high_confidence_network = create_theme_network(data_label_0, data_label_1, min_confidence=90)


# Visualize the network
visualize_network(high_confidence_network)

Total Themes: 214
Bot Themes: 112
Non-Bot Themes: 90
Mixed Themes: 12


In [31]:
import json
import networkx as nx
import plotly.graph_objects as go
import numpy as np


def load_json_file(file_path):
    """Load JSON file safely."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file {file_path}: {e}")
        return []


def create_theme_network(data_bot, data_nonbot, min_confidence=90):
    """Create a network graph from themes with a minimum confidence level."""
    G = nx.Graph()

    def process_data(data, is_bot=True):
        for item in data:
            response = item.get('response')
            if isinstance(response, str):
                try:
                    response_data = json.loads(response)
                    themes = response_data.get('overall_themes', [])
                    for theme in themes:
                        confidence = theme.get('confidence', 0)
                        if confidence >= min_confidence:
                            theme_name = theme['name']
                            keywords = theme.get('keywords', [])
                            sentiment = theme.get('sentiment', 'neutral')
                            if not G.has_node(theme_name):
                                G.add_node(theme_name,
                                           type='theme',
                                           is_bot=is_bot,
                                           confidence=confidence,
                                           keywords=keywords,
                                           sentiment=sentiment)
                            else:
                                current_bot_status = G.nodes[theme_name].get('is_bot')
                                if current_bot_status != is_bot:
                                    G.nodes[theme_name]['is_bot'] = 'mixed'
                except json.JSONDecodeError:
                    print(f"Invalid JSON in response: {response}")

    process_data(data_bot, is_bot=True)
    process_data(data_nonbot, is_bot=False)
    return G


def visualize_network(G):
    """Visualize network clustered by bot and nonbot themes with zoom functionality."""
    if len(G.nodes) == 0:
        print("No nodes to visualize.")
        return

    # Assign positions to cluster bot, nonbot, and mixed themes
    bot_positions = np.random.uniform(-10, -5, (len([n for n, d in G.nodes(data=True) if d['is_bot'] == True]), 2))
    nonbot_positions = np.random.uniform(5, 10, (len([n for n, d in G.nodes(data=True) if d['is_bot'] == False]), 2))
    mixed_positions = np.random.uniform(-2, 2, (len([n for n, d in G.nodes(data=True) if d['is_bot'] == 'mixed']), 2))

    # Map positions back to nodes
    node_positions = {}
    for i, node in enumerate([n for n, d in G.nodes(data=True) if d['is_bot'] == True]):
        node_positions[node] = bot_positions[i]
    for i, node in enumerate([n for n, d in G.nodes(data=True) if d['is_bot'] == False]):
        node_positions[node] = nonbot_positions[i]
    for i, node in enumerate([n for n, d in G.nodes(data=True) if d['is_bot'] == 'mixed']):
        node_positions[node] = mixed_positions[i]

    # Define categories and colors
    categories = {
        'bot': {'nodes': [], 'color': 'rgb(255,0,0)', 'positions': []},
        'nonbot': {'nodes': [], 'color': 'rgb(0,0,255)', 'positions': []},
        'mixed': {'nodes': [], 'color': 'rgb(0,128,0)', 'positions': []},
    }

    for node, data in G.nodes(data=True):
        category = 'mixed' if data['is_bot'] == 'mixed' else 'bot' if data['is_bot'] else 'nonbot'
        categories[category]['nodes'].append(node)
        categories[category]['positions'].append(node_positions[node])

    # Create traces for each category
    traces = []
    for category, details in categories.items():
        x, y = zip(*details['positions']) if details['positions'] else ([], [])
        hover_text = [
            f"Theme: {node}<br>"
            f"Type: {category.capitalize()}<br>"
            f"Confidence: {G.nodes[node]['confidence']}%<br>"
            f"Keywords: {', '.join(G.nodes[node].get('keywords', []))}<br>"
            f"Sentiment: {G.nodes[node].get('sentiment', 'neutral')}"
            for node in details['nodes']
        ]
        traces.append(go.Scatter(
            x=x,
            y=y,
            mode='markers',
            marker=dict(
                size=10,
                color=details['color'],
                line=dict(width=1, color='black')
            ),
            hovertext=hover_text,
            hoverinfo='text',
            name=f"{category.capitalize()} Themes"
        ))

    # Create the figure
    fig = go.Figure(data=traces)
    fig.update_layout(
        title="Theme Network Visualization (Clustered by Bot Status)",
        showlegend=True,
        hovermode='closest',
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        width=1200,
        height=1000,
        margin=dict(l=20, r=20, t=50, b=20),
        plot_bgcolor='white',
        paper_bgcolor='white',
        dragmode='pan'
    )

    print(f"Total Nodes: {len(G.nodes())}")
    print(f"Bot Themes: {len(categories['bot']['nodes'])}")
    print(f"Nonbot Themes: {len(categories['nonbot']['nodes'])}")
    print(f"Mixed Themes: {len(categories['mixed']['nodes'])}")

    fig.show()


# Load JSON files
data_label_0 = load_json_file('label_0_1.json')  # Bot data
data_label_1 = load_json_file('label_1_1.json')  # Non-bot data

# Create network
network = create_theme_network(data_label_0, data_label_1, min_confidence=90)

# Visualize network
visualize_network(network)


Total Nodes: 43
Bot Themes: 7
Nonbot Themes: 34
Mixed Themes: 2
