In [1]:
import pandas as pd
import sys
import os

# Add the scripts directory to the Python path
sys.path.append(os.path.abspath('../scripts'))

#  Data scraping, collection pipeline, Data Cleaning and Transformation

In [2]:
# Import required libraries
import nest_asyncio
import asyncio
from telegram_scraper import scrape_multiple_channels, clean_all_scraped_data

# Apply nest_asyncio to handle nested loops in Jupyter notebooks
nest_asyncio.apply()

# List of Telegram channels to scrape
channels = [
    'https://t.me/DoctorsET',
    'https://t.me/lobelia4cosmetics',
    'https://t.me/CheMed123',
    'https://t.me/yetenaweg',
    'https://t.me/EAHCI'
]

# Asynchronous function to scrape multiple channels and display results
async def run_scraper():
    # Run the asynchronous scrape function and print the results
    scraped_data = await scrape_multiple_channels(channels)
    print("Scraped data:")
    print(scraped_data.head(20))
    return scraped_data

# Run the scraper and clean the data within the notebook
scraped_data = asyncio.run(run_scraper())

# Clean all scraped data and save the results in 'cleaned_data' folder
clean_all_scraped_data()


2024-10-16 10:20:36,713 - INFO - Successfully scraped 100 messages from https://t.me/mychannel
2024-10-16 10:20:36,714 - INFO - Connecting to 149.154.167.92:443/TcpFull...
2024-10-16 10:20:36,945 - INFO - Connection to 149.154.167.92:443/TcpFull complete!
2024-10-16 10:20:38,849 - INFO - Disconnecting from 149.154.167.92:443/TcpFull...
2024-10-16 10:20:38,851 - INFO - Disconnection from 149.154.167.92:443/TcpFull complete!
2024-10-16 10:20:38,874 - INFO - Connecting to 149.154.167.92:443/TcpFull...
2024-10-16 10:20:39,120 - INFO - Connection to 149.154.167.92:443/TcpFull complete!
2024-10-16 10:20:40,383 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2024-10-16 10:20:48,259 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2024-10-16 10:20:48,881 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2024-10-16 10:21:24,979 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072


Scraped data:
              channel_name  message_id                      date  \
0   https://t.me/DoctorsET         864 2023-12-18 17:04:02+00:00   
1   https://t.me/DoctorsET         863 2023-11-03 16:14:39+00:00   
2   https://t.me/DoctorsET         862 2023-10-02 16:37:39+00:00   
3   https://t.me/DoctorsET         861 2023-09-16 07:54:32+00:00   
4   https://t.me/DoctorsET         860 2023-09-01 16:16:15+00:00   
5   https://t.me/DoctorsET         859 2023-08-29 17:20:05+00:00   
6   https://t.me/DoctorsET         848 2022-08-02 17:42:08+00:00   
7   https://t.me/DoctorsET         847 2022-06-12 17:15:47+00:00   
8   https://t.me/DoctorsET         846 2022-05-31 17:51:13+00:00   
9   https://t.me/DoctorsET         845 2022-05-20 18:04:53+00:00   
10  https://t.me/DoctorsET         844 2022-05-15 15:59:10+00:00   
11  https://t.me/DoctorsET         843 2022-05-07 18:22:14+00:00   
12  https://t.me/DoctorsET         842 2022-05-06 17:51:05+00:00   
13  https://t.me/DoctorsET        

Unnamed: 0,channel_name,message_id,date,message,image_path
0,https://t.me/CheMed123,97,2023-02-10,"⚠️Notice!\nDear esteemed customers,\nDue to fo...",images\CheMed123\CheMed123_0.jpg
1,https://t.me/CheMed123,96,2023-02-02,Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...,images\CheMed123\CheMed123_1.jpg
2,https://t.me/CheMed123,95,2023-02-01,አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...,images\CheMed123\CheMed123_2.jpg
3,https://t.me/CheMed123,94,2023-01-31,Che-Med Trivia #3\n\nምግብና መጠጦች አንዳንድ መድሃኒቶች በደ...,images\CheMed123\CheMed123_3.jpg
4,https://t.me/CheMed123,93,2023-01-30,"Che-Med Trivia #2\n\nእንደ Ciprofloxacin, Doxycy...",images\CheMed123\CheMed123_4.jpg
...,...,...,...,...,...
471,https://t.me/yetenaweg,1080,2024-06-09,Unknown,Unknown
472,https://t.me/yetenaweg,1079,2024-06-06,Unknown,Unknown
473,https://t.me/yetenaweg,1078,2024-06-06,📣በዚህ እሁድ በቴሌግራም ቀጥታ ስርጭት የኩላሊት በሽታ ክትትል እና የዳያ...,Unknown
474,https://t.me/yetenaweg,1077,2024-06-05,https://www.clubhouse.com/room/xlOVk34E?utm_me...,Unknown


In [3]:
print(scraped_data.tail(10000))

               channel_name  message_id                      date  \
0    https://t.me/DoctorsET         864 2023-12-18 17:04:02+00:00   
1    https://t.me/DoctorsET         863 2023-11-03 16:14:39+00:00   
2    https://t.me/DoctorsET         862 2023-10-02 16:37:39+00:00   
3    https://t.me/DoctorsET         861 2023-09-16 07:54:32+00:00   
4    https://t.me/DoctorsET         860 2023-09-01 16:16:15+00:00   
..                      ...         ...                       ...   
471      https://t.me/EAHCI        2309 2024-08-11 17:58:29+00:00   
472      https://t.me/EAHCI        2308 2024-08-11 17:58:29+00:00   
473      https://t.me/EAHCI        2307 2024-08-11 15:45:44+00:00   
474      https://t.me/EAHCI        2306 2024-08-11 15:43:59+00:00   
475      https://t.me/EAHCI        2305 2024-08-10 17:09:43+00:00   

                                               message image_path  \
0    https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...       None   
1    ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌ

In [4]:
# from telegram_scraper import store_combined_data_in_db,log_scraping_activity
# db_url = 'postgresql://postgres:admin@localhost:5432/datawarehouse'
# store_combined_data_in_db(scraped_data, 'telegram_data', db_url)

# # Step 4: Run DBT for Data Transformation
# import subprocess
# subprocess.run(['dbt', 'run'], check=True)
# log_scraping_activity('All Channels', len(scraped_data))


# Object Detection Using YOLO


In [5]:
import torch
import cv2
import pandas as pd
import logging
import os
from sqlalchemy import create_engine
from matplotlib import pyplot as plt

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load the YOLOv5 model
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Load the DataFrame containing image paths (adjust the path accordingly)
df = pd.read_csv('combined_cleaned_data.csv')

# Prepare a list to store detection results
detection_results = []

# Directory to save images with detections
output_dir = "detected_images"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Process each image path
for index, row in df.iterrows():
    image_path = row['image_path']
    channel_name = row['channel_name']
    
    # Skip if image path is invalid
    if image_path == 'Unknown' or pd.isnull(image_path) or not os.path.exists(image_path):
        logger.warning(f"Image not found or marked as 'Unknown': {image_path}. Skipping.")
        continue
    
    try:
        # Load the image
        img = cv2.imread(image_path)
        if img is None:
            logger.warning(f"Failed to load image: {image_path}. Skipping.")
            continue
        
        # Run object detection using YOLO
        results = model(img)
        
        # Parse and save the detection results
        for *box, conf, cls in results.xyxy[0]:
            class_name = model.names[int(cls.item())]  # Get the class name using the YOLO model's class names
            detection_data = {
                'channel_name': channel_name,
                'image': str(image_path),
                'x_min': box[0].item(),
                'y_min': box[1].item(),
                'x_max': box[2].item(),
                'y_max': box[3].item(),
                'confidence': conf.item(),
                'class': int(cls.item()),
                'class_name': class_name
            }
            detection_results.append(detection_data)

            # Draw bounding box on the image
            cv2.rectangle(img, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), (255, 0, 0), 2)
            cv2.putText(img, f'{class_name} {conf.item():.2f}', (int(box[0]), int(box[1]) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
        
        # Save the image with detections
        output_image_path = os.path.join(output_dir, f"detected_{os.path.basename(image_path)}")
        cv2.imwrite(output_image_path, img)
        
        # Display the image with detections
        plt.figure(figsize=(8, 6))
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.title(f"Detected Objects in {os.path.basename(image_path)}")
        plt.axis('off')
        plt.show()
        
    except Exception as e:
        logger.error(f"Error processing image {image_path}: {e}")

# Save detection results to CSV
detection_df = pd.DataFrame(detection_results)
detection_df.to_csv('detection_results.csv', index=False)

print("Object detection completed and results saved to 'detection_results.csv'")

Using cache found in C:\Users\Naim/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-10-16 Python-3.11.9 torch-2.4.1+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  plt.figure(figsize=(8, 6))
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.au

Object detection completed and results saved to 'detection_results.csv'


In [6]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv('.env')
db_url = os.getenv('DB_URL')
from telegram_scraper import store_detection_results_in_db
# Function to store detection results in a database
# Store the detection results in the database
store_detection_results_in_db(detection_df, 'detection_results', db_url)

Detection results successfully stored in table 'detection_results'


In [7]:
detection_df.head()

Unnamed: 0,channel_name,image,x_min,y_min,x_max,y_max,confidence,class,class_name
0,https://t.me/CheMed123,images\CheMed123\CheMed123_1.jpg,493.153656,653.479858,594.194031,754.130615,0.324055,32,sports ball
1,https://t.me/CheMed123,images\CheMed123\CheMed123_2.jpg,11.339487,241.373627,1017.9776,1072.088135,0.34767,0,person
2,https://t.me/CheMed123,images\CheMed123\CheMed123_2.jpg,11.129631,258.760651,476.896118,1056.915771,0.259493,0,person
3,https://t.me/CheMed123,images\CheMed123\CheMed123_3.jpg,3.337908,626.411011,389.522675,854.017029,0.663924,0,person
4,https://t.me/CheMed123,images\CheMed123\CheMed123_3.jpg,705.543274,549.123047,855.402222,831.217773,0.630349,75,vase
