#### Creating a data warehouse for Ethiopian medical business data scraped from Telegram channels

In [1]:
import os

# Work from the parent of the directory the kernel started in; the later
# relative import (`scripts...`) and relative CSV path presumably resolve
# from there (project layout is not visible in this notebook -- confirm).
# The start directory is remembered on first execution so that re-running
# this cell does not climb one more level each time, which a bare
# os.chdir('..') would do.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_START_DIR))

import warnings
# Silences every warning for the rest of the session (including any emitted
# while importing pandas below).
warnings.filterwarnings('ignore')
import pandas as pd


## Data scraping and collection pipeline

In [2]:
from scripts.createW_H_Impl import start_scraping


In [3]:
# Telegram channel handles handed to the scraper, one per line.
channel = [
    '@DoctorsET',
    '@lobelia4cosmetics',
    '@yetenaweg',
    '@EAHCI',
    '@CheMed123',
]


In [4]:
# Run the scraper over every handle in `channel`. The recorded log below
# shows historical data being scraped per channel, followed by
# "Listening for real-time messages..." and update lines for roughly another
# 50 minutes, so this call appears to keep running after the historical pass.
# NOTE(review): the cell probably has to be interrupted before the following
# cells can run -- confirm against scripts/createW_H_Impl.
start_scraping(channel)


2024-10-11 16:51:35,307 - INFO - Scrapping data...
2024-10-11 16:51:35,309 - INFO - Connecting to 149.154.167.92:443/TcpFull...
2024-10-11 16:51:35,442 - INFO - Connection to 149.154.167.92:443/TcpFull complete!


Scraping historical data from @DoctorsET (Doctors Ethiopia)...


2024-10-11 16:51:36,883 - INFO - Finished scraping @DoctorsET


Scraping historical data from @lobelia4cosmetics (Lobelia pharmacy and cosmetics)...


2024-10-11 16:51:37,423 - INFO - Finished scraping @lobelia4cosmetics


Scraping historical data from @yetenaweg (የጤና ወግ - የጤና መረጃ)...


2024-10-11 16:51:37,852 - INFO - Finished scraping @yetenaweg


Scraping historical data from @EAHCI (ETHIO-AMERICAN MEDICAL TRAININGS( CPD ) & HEALTH CONSULTANCY CENTER)...


2024-10-11 16:51:38,381 - INFO - Finished scraping @EAHCI


Scraping historical data from @CheMed123 (CheMed)...


2024-10-11 16:51:38,791 - INFO - Finished scraping @CheMed123
2024-10-11 16:51:38,791 - INFO - Listening for real-time messages...


2024-10-11 16:55:57,837 - INFO - Got difference for account updates
2024-10-11 17:00:37,828 - INFO - Got difference for channel 2197711891 updates
2024-10-11 17:02:38,115 - INFO - Got difference for channel 2197711891 updates
2024-10-11 17:10:58,245 - INFO - Got difference for channel 2197711891 updates
2024-10-11 17:16:21,373 - INFO - Got difference for channel 2197711891 updates
2024-10-11 17:22:19,322 - INFO - Got difference for channel 2197711891 updates
2024-10-11 17:29:08,418 - INFO - Got difference for channel 2197711891 updates
2024-10-11 17:36:39,670 - INFO - Closing current connection to begin reconnect...
2024-10-11 17:36:39,680 - INFO - Connecting to 149.154.167.92:443/TcpFull...
2024-10-11 17:36:39,869 - INFO - Connection to 149.154.167.92:443/TcpFull complete!
2024-10-11 17:37:38,867 - INFO - Got difference for channel 2197711891 updates
2024-10-11 17:38:40,651 - INFO - Got difference for channel 2197711891 updates
2024-10-11 17:39:40,001 - INFO - Got difference for chann

### Read scraped datasets

In [5]:
# Load the scraped messages from the CSV in the current working directory.
# NOTE(review): presumably this file is written by the scraping step above --
# confirm in scripts/createW_H_Impl.
scrappedData = pd.read_csv(filepath_or_buffer='telegram_data.csv')


In [6]:
# Preview the first ten rows of the loaded frame.
scrappedData.head(n=10)


Unnamed: 0,message_date,message_id,message_description
0,2023-12-18 17:04:02,864,በቀን አንዴ ብቻ የሚባለው የቢዝነስ አማካሪ በ 10 000 ብር ብቻ የተጀ...
1,2023-11-03 16:14:39,863,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...
2,2023-10-02 16:37:39,862,ሞት በስኳር ለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀንሰው ይ...
3,2023-09-16 07:54:32,861,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ? ሙሉ ቪ...
4,2023-09-01 16:16:15,860,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ Homosex...
5,2023-08-29 17:20:05,859,ዶክተርስ ኢትዮጽያ በአዲስ ፕሮገራም ጀመረ ማረጥ ሜኖፖዝ ጋር ተያይዞ የሚ...
6,2022-08-02 17:42:08,848,ክረምቱን ስፖርት መስራት አስበው ጂም ለመግባት ካልቻሉ ባሉበት ቦታ ሆነው...
7,2022-06-12 17:15:47,847,ስፖርት የመስራት ሱስ ይኖር ይሆን? በአሁኑ ወቅት ብዙ የስፖርት መስሪያ ...
8,2022-05-31 17:51:13,846,ድንገተኛ አደጋ / የአጥንት ስብራት አያርገውና ድንገተኛ የሆነ አደጋ ቢደ...
9,2022-05-20 18:04:53,845,ከትንሽ ግዚያት በፊት ስፖርት መስራት እንደ ቅንጦት ይታይ ነበር አሁን ላ...


## Data Cleaning and Transformation

### Data Cleaning

In [7]:
# Count missing values in each column.
pd.isnull(scrappedData).sum()


message_date           0
message_id             0
message_description    0
dtype: int64

In [8]:
# Per-column count of missing values. DataFrame.isna and DataFrame.isnull are
# aliases in pandas, so this repeats the check in the previous cell and
# produces the same counts.
scrappedData.isna().sum()


message_date           0
message_id             0
message_description    0
dtype: int64

In [9]:
# Drop rows that are exact duplicates across all columns, then drop any row
# that still contains a missing value. The result is rebound to the same name
# instead of using inplace=True, so the step reads as a single chain and does
# not silently mutate the frame that earlier cells displayed.
# The original index labels are kept (no reset_index), so gaps may appear
# where rows were removed.
scrappedData = scrappedData.drop_duplicates().dropna()


In [10]:
# Inspect the last ten rows after cleaning.
scrappedData.tail(n=10)


Unnamed: 0,message_date,message_id,message_description
96,2023-01-04 05:58:02,77,Nature Made Vitamin supplements Che-Med በደምበኞቻ...
97,2023-01-03 17:49:48,76,በ መድሃኒትዎን የሚያዙበትን መንገድ የሚያሳይ tiktok ቪዲዮ።
98,2023-01-03 05:48:34,75,በChe-Med የጥሪ ማዕከል 9798 ላይ ደውለው በመመዝገብ በየወሩ የሚያ...
99,2022-12-30 15:45:35,71,Che-Med at Smart city symposium National scien...
100,2022-12-28 17:02:08,70,Buy you Power Plus supplement from Che-Med. Fo...
101,2022-12-28 06:31:50,67,Order your Power plus Vitamin and mineral supp...
102,2022-12-27 17:06:32,64,Che-Med Che-Med የመድሀኒትና የህክምና እቃዎች አፋላጊ እና አቅራ...
103,2022-12-25 15:46:05,62,የምስራች ከChe-Med ።።።።።።Che-Med አስታዋሽ።።።።።። Che-M...
104,2022-12-23 06:26:15,60,ቀጠሮ ያስይዙ በ ጥሪ ማዕከላችን 9798 በመደወል የዕንቅርት ህመም መቆጣ...
105,2022-12-22 06:40:25,59,ቀጠሮ ያስይዙ በ ጥሪ ማዕከላችን 9798 በመደወል የግፊት መድሃኒትዎን በ...


## DBT -> Data Build Tool
