In [None]:
# Importing necessary libraries for data analysis
import os

import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns

In [None]:
# Global plot styling: seaborn's white-grid look plus a wide default canvas.
# The theme call comes first and the figure size is set afterwards, in the
# same order as before.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [None]:
# Read the cleaned dataset from its local CSV export into `df`.
# (The database is queried separately, further down the notebook.)
cleaned_data_path = '../data/cleaned/cleaned_data.csv'
df = pd.read_csv(filepath_or_buffer=cleaned_data_path)

In [None]:
# Structural overview (column dtypes and non-null counts) followed by a
# preview of the first five rows; the trailing expression is what the cell
# renders as its output.
df.info()
df.head(n=5)

In [None]:
# Count the null entries in every column of the cleaned data and report them.
missing_values = df.isna().sum(axis=0)
print("Missing Values per Column:\n", missing_values)

In [None]:
# Visualizing the distribution of messages over time.
# Parse the timestamps and bucket every message into its calendar month.
df['date'] = pd.to_datetime(df['date'])  # safe to re-run: already-parsed dates stay as they are
df['year_month'] = df['date'].dt.to_period('M')

# countplot draws non-numeric categories in the order they first appear in the
# data, so an unsorted CSV would produce a jumbled time axis. Passing an
# explicit, sorted list of months keeps the bars chronological.
month_order = sorted(df['year_month'].dropna().unique())

plt.figure(figsize=(10,6))
sns.countplot(data=df, x='year_month', order=month_order, palette='Blues_d')
plt.xticks(rotation=45)
plt.xlabel('Month')
plt.ylabel('Number of messages')
plt.title('Monthly Message Count from Telegram Channels')
plt.show()

In [None]:
# Bar chart of the ten most frequently occurring business names.
# The whole cell is skipped when the dataset has no 'business_name' column.
if 'business_name' in df:
    top_businesses = df['business_name'].value_counts().iloc[:10]

    plt.figure(figsize=(10,6))
    sns.barplot(
        x=top_businesses.index,
        y=top_businesses.to_numpy(),
        palette='Blues_d',
    )
    plt.xticks(rotation=45)
    plt.title('Top 10 Most Mentioned Businesses in Telegram Channels')
    plt.show()

In [None]:
# Connect to PostgreSQL database for more advanced analysis
def connect_db(dbname=None, user=None, password=None, host=None):
    """Open a psycopg2 connection to the PostgreSQL database.

    Each setting is resolved in this order: the explicit argument, then the
    matching environment variable (PGDATABASE, PGUSER, PGPASSWORD, PGHOST),
    then the notebook's original local-development value. Calling
    ``connect_db()`` with no arguments and none of those variables set
    therefore behaves exactly as before.

    Args:
        dbname: Database name.
        user: Database role to connect as.
        password: Password for ``user``. Prefer supplying it through the
            PGPASSWORD environment variable rather than in the notebook.
        host: Database server host.

    Returns:
        The open connection, or ``None`` when the connection attempt fails
        (the error is printed rather than raised, so callers must check).
    """
    settings = {
        "dbname": dbname or os.environ.get("PGDATABASE", "medical_data"),
        "user": user or os.environ.get("PGUSER", "postgres"),
        # NOTE(review): the literal fallback is kept only so existing local
        # setups keep working; set PGPASSWORD for anything non-local.
        "password": password or os.environ.get("PGPASSWORD", "password"),
        "host": host or os.environ.get("PGHOST", "localhost"),
    }

    try:
        conn = psycopg2.connect(**settings)
    except psycopg2.Error as e:
        # Only database-level failures are reported here; programming errors
        # (e.g. a bad argument type) still surface as normal exceptions.
        print(f"Failed to connect to database: {e}")
        return None

    print("Database connection successful")
    return conn

In [None]:
# Querying the PostgreSQL database to retrieve business data for analysis.
# The query returns the ten (name, location) groups with the most rows in
# medical_businesses, largest first.
query = """
        SELECT name, location, COUNT(*) as message_count
        FROM medical_businesses
        GROUP BY name, location
        ORDER BY message_count DESC
        LIMIT 10;
    """

# Defined up front so later cells can test `business_data is None` instead of
# hitting a NameError when the database is unreachable.
business_data = None

conn = connect_db()
if conn:
    try:
        business_data = pd.read_sql(query, conn)
    finally:
        # Release the connection even when the query raises. Closing a
        # psycopg2 connection that is already closed is harmless.
        conn.close()

In [None]:
  # Plotting top 10 businesses by message count
    plt.figure(figsize=(10,6))
    sns.barplot(data=business_data, x='name', y='message_count', palette='Blues_d')
    plt.title('Top 10 Businesses by Number of Mentions in Telegram Channels')
    plt.xticks(rotation=45)
    plt.show()

    conn.close()

In [None]:
# Load the per-detection summary CSV written by the YOLO object-detection
# step into `image_df`.
image_data_path = '../data/images/results/summary.csv'
image_df = pd.read_csv(filepath_or_buffer=image_data_path)

In [None]:
# Bar chart of the ten object labels that occur most often in the detection
# summary.
top_objects = image_df['object_name'].value_counts().iloc[:10]

plt.figure(figsize=(10,6))
sns.barplot(
    x=top_objects.index,
    y=top_objects.to_numpy(),
    palette='Blues_d',
)
plt.xticks(rotation=45)
plt.title('Top 10 Objects Detected in Scraped Images')
plt.show()

In [None]:
# Histogram (30 bins, with a KDE curve overlaid) of the 'confidence' column
# from the YOLO detection summary.
plt.figure(figsize=(10,6))
sns.histplot(
    data=image_df['confidence'],
    bins=30,
    kde=True,
    color='blue',
)
plt.title('Distribution of YOLO Object Detection Confidence Levels')
plt.show()