#### Creating a data warehouse for Ethiopian medical business data scraped from Telegram channels

In [1]:
import os
import warnings

import pandas as pd

# Move the working directory up one level exactly once per kernel session.
# Presumably this makes the project-local packages imported later and the
# relative data paths resolve from the project root -- TODO confirm the layout.
# The guard keeps the cell idempotent: without it, re-running the cell would
# climb one more directory each time and break every later relative path.
if 'PROJECT_ROOT' not in globals():
    os.chdir('..')
    PROJECT_ROOT = os.getcwd()

# Silence all warnings for the rest of the notebook.
# NOTE(review): this also hides deprecation and dtype warnings from pandas.
warnings.filterwarnings('ignore')


## Data scraping and collection pipeline

In [2]:
from scripts.createW_H_Impl import start_scraping


In [3]:
# Telegram channel usernames handed to the scraper, one per line for easy editing.
channel = [
    '@DoctorsET',
    '@lobelia4cosmetics',
    '@yetenaweg',
    '@EAHCI',
    '@CheMed123',
]


In [4]:
# Scrape the historical messages of every channel in `channel`.
# NOTE(review): per the captured log, after the last channel finishes the call
# logs "Listening for real-time messages..." and keeps emitting updates, so this
# cell appears to keep running until it is interrupted -- confirm before Run All.
start_scraping(channel)


2024-10-15 16:38:11,573 - INFO - Scrapping data...
2024-10-15 16:38:11,575 - INFO - Connecting to 149.154.167.92:443/TcpFull...


2024-10-15 16:38:11,677 - INFO - Connection to 149.154.167.92:443/TcpFull complete!
2024-10-15 16:38:12,591 - INFO - Finished scraping @DoctorsET


Scraping historical data from @DoctorsET (Doctors Ethiopia)...


2024-10-15 16:38:12,898 - INFO - Finished scraping @lobelia4cosmetics


Scraping historical data from @lobelia4cosmetics (Lobelia pharmacy and cosmetics)...
Scraping historical data from @yetenaweg (የጤና ወግ - የጤና መረጃ)...


2024-10-15 16:38:13,230 - INFO - Finished scraping @yetenaweg


Scraping historical data from @EAHCI (ETHIO-AMERICAN MEDICAL TRAININGS( CPD ) & HEALTH CONSULTANCY CENTER)...


2024-10-15 16:38:13,555 - INFO - Finished scraping @EAHCI


Scraping historical data from @CheMed123 (CheMed)...


2024-10-15 16:38:13,893 - INFO - Finished scraping @CheMed123
2024-10-15 16:38:13,894 - INFO - Listening for real-time messages...


2024-10-15 16:48:53,623 - INFO - Got difference for account updates
2024-10-15 16:52:56,535 - INFO - Got difference for channel 2197711891 updates
2024-10-15 16:56:13,588 - INFO - Got difference for channel 2197711891 updates
2024-10-15 16:56:42,431 - INFO - Got difference for channel 2197711891 updates
2024-10-15 16:57:51,201 - INFO - Got difference for channel 2197711891 updates
2024-10-15 16:59:11,211 - INFO - Got difference for channel 2197711891 updates
2024-10-15 17:01:13,721 - INFO - Got difference for channel 2197711891 updates
2024-10-15 17:03:13,923 - INFO - Got difference for channel 2197711891 updates
2024-10-15 17:09:06,604 - INFO - Got difference for channel 2197711891 updates
2024-10-15 17:14:01,229 - INFO - Got difference for channel 2197711891 updates
2024-10-15 17:19:25,821 - INFO - Got difference for channel 2197711891 updates
2024-10-15 17:26:36,483 - INFO - Got difference for account updates
2024-10-15 17:34:26,093 - INFO - Got difference for channel 2197711891 upd

### Read the scraped dataset

In [5]:
# Load the scraped messages from `telegram_data.csv` (path is relative to the
# current working directory; presumably written by the scraping step -- confirm).
scrappedData = pd.read_csv('telegram_data.csv')


In [6]:
# Preview the first 10 rows of the scraped messages.
scrappedData.head(10)


Unnamed: 0,message_date,message_id,message_description
0,2023-12-18 17:04:02,864,በቀን አንዴ ብቻ የሚባለው የቢዝነስ አማካሪ በ 10 000 ብር ብቻ የተጀ...
1,2023-11-03 16:14:39,863,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...
2,2023-10-02 16:37:39,862,ሞት በስኳር ለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀንሰው ይ...
3,2023-09-16 07:54:32,861,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ? ሙሉ ቪ...
4,2023-09-01 16:16:15,860,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ Homosex...
5,2023-08-29 17:20:05,859,ዶክተርስ ኢትዮጽያ በአዲስ ፕሮገራም ጀመረ ማረጥ ሜኖፖዝ ጋር ተያይዞ የሚ...
6,2022-08-02 17:42:08,848,ክረምቱን ስፖርት መስራት አስበው ጂም ለመግባት ካልቻሉ ባሉበት ቦታ ሆነው...
7,2022-06-12 17:15:47,847,ስፖርት የመስራት ሱስ ይኖር ይሆን? በአሁኑ ወቅት ብዙ የስፖርት መስሪያ ...
8,2022-05-31 17:51:13,846,ድንገተኛ አደጋ / የአጥንት ስብራት አያርገውና ድንገተኛ የሆነ አደጋ ቢደ...
9,2022-05-20 18:04:53,845,ከትንሽ ግዚያት በፊት ስፖርት መስራት እንደ ቅንጦት ይታይ ነበር አሁን ላ...


## Data Cleaning and Transformation

### Data Cleaning

In [7]:
# Count missing values per column.
scrappedData.isnull().sum()


message_date           0
message_id             0
message_description    0
dtype: int64

In [8]:
# `isna()` is an alias of `isnull()`, so this is the same per-column
# missing-value count expressed with the other spelling.
scrappedData.isna().sum()


message_date           0
message_id             0
message_description    0
dtype: int64

In [9]:
# Remove exact duplicate rows first, then any row that still has a missing value.
# Written as one chained expression instead of two in-place mutations; the
# resulting frame (rows, order, index labels) is the same.
scrappedData = scrappedData.drop_duplicates().dropna()


In [10]:
# Preview the last 10 rows after duplicate/NA removal.
scrappedData.tail(10)


Unnamed: 0,message_date,message_id,message_description
96,2023-01-04 05:58:02,77,Nature Made Vitamin supplements Che-Med በደምበኞቻ...
97,2023-01-03 17:49:48,76,በ መድሃኒትዎን የሚያዙበትን መንገድ የሚያሳይ tiktok ቪዲዮ።
98,2023-01-03 05:48:34,75,በChe-Med የጥሪ ማዕከል 9798 ላይ ደውለው በመመዝገብ በየወሩ የሚያ...
99,2022-12-30 15:45:35,71,Che-Med at Smart city symposium National scien...
100,2022-12-28 17:02:08,70,Buy you Power Plus supplement from Che-Med. Fo...
101,2022-12-28 06:31:50,67,Order your Power plus Vitamin and mineral supp...
102,2022-12-27 17:06:32,64,Che-Med Che-Med የመድሀኒትና የህክምና እቃዎች አፋላጊ እና አቅራ...
103,2022-12-25 15:46:05,62,የምስራች ከChe-Med ።።።።።።Che-Med አስታዋሽ።።።።።። Che-M...
104,2022-12-23 06:26:15,60,ቀጠሮ ያስይዙ በ ጥሪ ማዕከላችን 9798 በመደወል የዕንቅርት ህመም መቆጣ...
105,2022-12-22 06:40:25,59,ቀጠሮ ያስይዙ በ ጥሪ ማዕከላችን 9798 በመደወል የግፊት መድሃኒትዎን በ...


## DBT -> Data Build Tool


### Install Libraries

pip install dbt-core

pip install dbt-postgres

dbt init creating_data_ware_house


### Add the necessary files to the dbt project

cd creating_data_ware_house

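Add a `profiles.yml` file with the database connection settings.

Set the `profile` entry in `dbt_project.yml` to the profile defined in `profiles.yml`.

Create the SQL model files and the schema (`.yml`) file in the `models` folder.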

### Add the CSV file to the seeds folder and load it

dbt seed

### Check the configuration, then run and test the models

dbt debug

dbt run

dbt test

### Generate and Serve docs

dbt docs generate

dbt docs serve

# Object Detection Using YOLO

pip install opencv-python torch torchvision tensorflow


git clone https://github.com/ultralytics/yolov5.git
cd yolov5
pip install -r requirements.txt


python detect.py --source data/images/ --weights yolov5s.pt --conf 0.4 --save-txt --save-conf


In [11]:
import os
import pandas as pd

# Directory holding the per-image label files produced by the `detect.py` run
# above (`--save-txt --save-conf`). Defined once so the listing and the file
# opens cannot drift apart.
LABELS_DIR = 'yolov5/runs/detect/exp/labels'

# One CSV column per value stored for a detection, prefixed by the label file name.
DETECTION_COLUMNS = ['Image', 'Class', 'X_Center', 'Y_Center', 'Width', 'Height', 'Confidence']

results = []
for filename in os.listdir(LABELS_DIR):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(LABELS_DIR, filename)) as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # A blank or whitespace-only line (e.g. a trailing newline at the
                # end of the file) carries no detection; skip it instead of
                # crashing on the unpack below.
                continue
            if len(fields) != 6:
                # Fail with the offending file and line rather than a bare
                # "not enough values to unpack" error.
                raise ValueError(
                    f'{filename}:{line_number}: expected 6 fields '
                    f'(class x_center y_center width height conf), got {len(fields)}'
                )
            class_id, x_center, y_center, width, height, conf = fields
            results.append([filename, class_id, x_center, y_center, width, height, conf])

# Values are kept as the strings read from the label files; they are written
# unchanged to the CSV without the DataFrame index.
df = pd.DataFrame(results, columns=DETECTION_COLUMNS)
df.to_csv('detection_results.csv', index=False)


In [12]:
# Reload the detection results from the CSV file (relative to the working directory).
detectedImage= pd.read_csv('detection_results.csv')


In [13]:
# Preview the first 10 detections.
detectedImage.head(10)


Unnamed: 0,Image,Class,X_Center,Y_Center,Width,Height,Confidence
0,@lobelia4cosmetics_12471.txt,0,0.467553,0.517969,0.259578,0.376563,0.892511
1,@lobelia4cosmetics_12480.txt,73,0.506348,0.943359,0.516602,0.105469,0.575717
2,@CheMed123_71.txt,0,0.101562,0.403125,0.116319,0.1125,0.827497
3,@CheMed123_71.txt,0,0.496528,0.552344,0.416667,0.565625,0.93587
4,@lobelia4cosmetics_12479.txt,41,0.431525,0.79888,0.242894,0.250646,0.789149
5,@CheMed123_67.txt,11,0.145833,0.876852,0.152778,0.209259,0.859066
6,@lobelia4cosmetics_12470.txt,39,0.614453,0.573047,0.249219,0.727344,0.442317
7,@CheMed123_81.txt,39,0.518056,0.734259,0.219444,0.437037,0.483794
8,@CheMed123_81.txt,39,0.718518,0.758796,0.2,0.397222,0.655023
9,@CheMed123_81.txt,39,0.316667,0.755093,0.198148,0.386111,0.732168


In [14]:
# Turn the label file name into a bare image identifier by removing every
# literal '@' and every literal '.txt' (plain-text replacement, not regex).
detectedImage['Image'] = (
    detectedImage['Image']
    .str.replace('@', '', regex=False)
    .str.replace('.txt', '', regex=False)
)


In [16]:
from src.connectDatabase import export_detection_image_to_psql
# Hand the cleaned detection frame to the project's database export helper.
# NOTE(review): per the captured log it creates a `detected_image` table and
# inserts the rows one at a time; presumably the target is PostgreSQL -- confirm.
export_detection_image_to_psql(detectedImage)


2024-10-15 16:38:14,137 - INFO - Creating table detected_image...
2024-10-15 16:38:14,157 - INFO - Table creation attempted.
2024-10-15 16:38:14,158 - INFO - Processing row 0: lobelia4cosmetics_12471
2024-10-15 16:38:14,159 - INFO - Data inserted for row 0.
2024-10-15 16:38:14,161 - INFO - Processing row 1: lobelia4cosmetics_12480
2024-10-15 16:38:14,162 - INFO - Data inserted for row 1.
2024-10-15 16:38:14,163 - INFO - Processing row 2: CheMed123_71
2024-10-15 16:38:14,164 - INFO - Data inserted for row 2.
2024-10-15 16:38:14,165 - INFO - Processing row 3: CheMed123_71
2024-10-15 16:38:14,166 - INFO - Data inserted for row 3.
2024-10-15 16:38:14,167 - INFO - Processing row 4: lobelia4cosmetics_12479
2024-10-15 16:38:14,169 - INFO - Data inserted for row 4.
2024-10-15 16:38:14,170 - INFO - Processing row 5: CheMed123_67
2024-10-15 16:38:14,171 - INFO - Data inserted for row 5.
2024-10-15 16:38:14,172 - INFO - Processing row 6: lobelia4cosmetics_12470
2024-10-15 16:38:14,173 - INFO - Da

In [19]:
from src.connectDatabase import fetch_data_from_database
# Read the stored rows back from the database to check the export.
# NOTE(review): which table/query is used is decided inside the helper -- confirm.
dataFromDatabase=fetch_data_from_database()



In [20]:
# Preview the first 10 rows fetched from the database.
dataFromDatabase.head(10)


Unnamed: 0,id,image,class,x_center,y_center,width,height,confidence
0,1,lobelia4cosmetics_12471,0.0,0.467553,0.517969,0.259578,0.376563,0.892511
1,2,lobelia4cosmetics_12480,73.0,0.506348,0.943359,0.516602,0.105469,0.575717
2,3,CheMed123_71,0.0,0.101562,0.403125,0.116319,0.1125,0.827497
3,4,CheMed123_71,0.0,0.496528,0.552344,0.416667,0.565625,0.93587
4,5,lobelia4cosmetics_12479,41.0,0.431525,0.79888,0.242894,0.250646,0.789149
5,6,CheMed123_67,11.0,0.145833,0.876852,0.152778,0.209259,0.859066
6,7,lobelia4cosmetics_12470,39.0,0.614453,0.573047,0.249219,0.727344,0.442317
7,8,CheMed123_81,39.0,0.518056,0.734259,0.219444,0.437037,0.483794
8,9,CheMed123_81,39.0,0.718518,0.758796,0.2,0.397222,0.655023
9,10,CheMed123_81,39.0,0.316667,0.755093,0.198148,0.386111,0.732168
