# Data Cleaning and Transformation


In [5]:
import pandas as pd
# Load scraped data
df = pd.read_csv('telegram_raw_data.csv')

## Remove rows with missing text
df = df.dropna(subset=['text'])

## Remove duplicates
df = df.drop_duplicates(subset=['text'])

# Standardize formats
df['text'] = df['text'].str.lower().str.strip()  # Lowercase and strip text
if 'date' in df.columns: df['date'] = pd.to_datetime(df['date'], errors='coerce')  # Normalize date format
if 'message_id' in df.columns: df['message_id'] = pd.to_numeric(df['message_id'], errors='coerce')  # Ensure numeric

# Data Validation
df = df.dropna(subset=['text']).drop_duplicates(subset=['text'])  # Remove missing and duplicate rows
df = df[df['text'].str.len() > 5]  # Filter entries with text length > 5

# Save cleaned data
df.to_csv('cleaned_telegram_data.csv', index=False)

# Monitoring and Logging:



### Setup the Logging Configuration

In [6]:
import logging
from datetime import datetime

# Configure the logging system
logging.basicConfig(
    filename=f'telegram_scraping_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filemode='w'  # 'w' to overwrite the file, 'a' to append
)

# Create logger (optional if you're using root logger directly)
logger = logging.getLogger(__name__)

###  Logging During the Scraping Process

In [None]:
try:
    # Start of the scraping process
    logger.info('Starting Telegram scraping process')
    
    # Authentication
    logger.info('Authenticating with Telegram API')
    client.start(phone_number)
    
    # Scraping each channel
    for channel in ['DoctorsET', 'Chemed', 'lobelia4cosmetics', 'yetenaweg', 'EAHCI']:
        logger.info(f'Starting to scrape channel: {channel}')
        try:
            messages = client(GetHistoryRequest(peer=channel, limit=500))
            logger.info(f'Successfully scraped {len(messages)} messages from {channel}')
            # Here you would process the messages
        except Exception as e:
            logger.error(f'Error while scraping {channel}: {e}', exc_info=True)
    
    logger.info('Scraping process completed successfully')

except Exception as e:
    logger.critical('Unexpected error occurred during scraping', exc_info=True)

finally:
    logger.info('Cleaning up resources')
    client.disconnect()  # Assuming 'client' is your TelegramClient instance

### Monitoring Progress

In [8]:
# Example if you're processing a large number of messages
total_messages = 0
for channel in channels:
    messages = client(GetHistoryRequest(peer=channel, limit=500))
    total_messages += len(messages)
    logger.info(f'Processed {total_messages} messages so far')

NameError: name 'channels' is not defined

In [10]:
from telethon import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
import logging

# Configure the logging system
logging.basicConfig(
    filename=f'telegram_scraping_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filemode='w'
)
logger = logging.getLogger(__name__)

# Define your TelegramClient here
api_id = '21423729'  # Replace with your actual API ID
api_hash = '60c830a40a5ff9cb549824b9a6399737'  # Replace with your actual API hash
phone_number = '+251986408483'  # Replace with your phone number
client = TelegramClient('session_name', api_id, api_hash)

# Authentication
client.start(phone_number)

# Define your list of channels
channels = ['DoctorsET', 'Chemed', 'lobelia4cosmetics', 'yetenaweg', 'EAHCI']

# Example if you're processing a large number of messages
total_messages = 0
for channel in channels:
    try:
        messages = client(GetHistoryRequest(peer=channel, limit=500))
        total_messages += len(messages)
        logger.info(f'Processed {total_messages} messages so far')
    except Exception as e:
        logger.error(f'Error processing channel {channel}: {e}', exc_info=True)

# Cleanup
client.disconnect()

OperationalError: database is locked