In [1]:
import asyncio
import os
import re
import nest_asyncio
from telethon import TelegramClient
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer  # Import VADER
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Patch the event loop to handle nested async calls (for Jupyter or running loops)
nest_asyncio.apply()

# Replace these with your actual credentials
api_id = YOUR_API_ID  # Replace with your Telegram API ID (integer)
api_hash = 'YOUR_API_HASH'  # Replace with your API Hash (string)
phone_number = 'YOUR_PHONE_NUMBER'  # Replace with your phone number including the country code
session_name = 'telegram_session'  # Name of the session file
channel_username = 'CHANNEL_NAME'  # Replace with the desired Telegram channel username

# Function to clean session file if needed
def clean_session_file():
    session_file = f"{session_name}.session"
    if os.path.exists(session_file):
        print(f"Deleting old session file: {session_file} to avoid database lock.")
        os.remove(session_file)

# Step 1: Function to scrape Telegram messages
async def scrape_telegram():
    clean_session_file()  # Clean up session to avoid lock errors

    try:
        # Create the TelegramClient instance with the session name, api_id, and api_hash
        async with TelegramClient(session_name, api_id, api_hash) as client:
            print("Connecting to Telegram...")

            # If you are logging in for the first time, use this to log in using the phone number
            if not await client.is_user_authorized():
                await client.send_code_request(phone_number)
                await client.sign_in(phone_number, input('Enter the code: '))

            # Fetch and print messages from the specified channel
            messages = []
            async for message in client.iter_messages(channel_username, limit=100):
                if message.text:  # Ensure message text exists
                    messages.append(message.text)

            if not messages:
                print("No messages found. Ensure the channel username is correct and accessible.")
            else:
                print(f"Scraped {len(messages)} messages.")  # Print total number of scraped messages
            return messages
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Step 2: Preprocess messages (remove unwanted characters, clean up text)
def preprocess_messages(messages):
    cleaned_messages = []
    for message in messages:
        # Remove URLs, hashtags, mentions, and emojis
        message = re.sub(r"http\S+|www\S+|https\S+", "", message)  # Remove URLs
        message = re.sub(r"#\S+", "", message)  # Remove hashtags
        message = re.sub(r"@\S+", "", message)  # Remove mentions
        message = re.sub(r"[^\w\s]", "", message)  # Remove punctuation and emojis
        
        # Convert to lowercase and remove extra spaces
        message = message.lower().strip()
        
        # Keep only stock-related messages (this can be done by filtering keywords like 'buy', 'sell', etc.)
        if 'stock' in message or 'buy' in message or 'sell' in message or 'price' in message:
            cleaned_messages.append(message)
    
    print(f"Cleaned Messages (Count: {len(cleaned_messages)}):", cleaned_messages[:5])  # Print cleaned messages
    return cleaned_messages

# Step 3: Perform sentiment analysis (using VADER for financial sentiment)
def perform_sentiment_analysis(cleaned_messages):
    analyzer = SentimentIntensityAnalyzer()
    sentiments = []
    for message in cleaned_messages:
        sentiment_score = analyzer.polarity_scores(message)['compound']
        sentiments.append(sentiment_score)
    
    print(f"Sentiment Scores (Count: {len(sentiments)}):", sentiments[:5])  # Print first 5 sentiment scores
    return sentiments

# Step 4: Feature engineering (e.g., extract sentiment and keyword count)
def extract_features_from_messages(cleaned_messages, sentiments):
    features = []
    for message, sentiment_score in zip(cleaned_messages, sentiments):
        keyword_count = message.count('stock')  # Example: count 'stock' in the message
        features.append([sentiment_score, keyword_count])
    
    # Convert features to DataFrame for easier manipulation
    feature_df = pd.DataFrame(features, columns=['sentiment', 'keyword_count'])
    print(f"Feature DataFrame (Count: {len(feature_df)}):\n", feature_df.head())  # Print the first few rows of the feature dataframe
    return feature_df

# Step 5: Build and evaluate the model (using Logistic Regression)
def build_and_evaluate_model(feature_df):
    if len(feature_df) < 2:
        print("Not enough data to build and evaluate the model.")
        return

    # Create a sample label (0 or 1) to predict stock movement (this would typically come from historical data)
    labels = [1 if x % 2 == 0 else 0 for x in range(len(feature_df))]

    X = feature_df[['sentiment', 'keyword_count']]  # Features
    y = labels  # Labels
    
    # Split data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Check if we have enough data after splitting
    if X_train.empty or X_test.empty:
        print("Insufficient data after splitting into train and test sets.")
        return
    
    # Build a logistic regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    # Make predictions and evaluate the model
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Main function to integrate everything
async def main():
    # Step 1: Scrape Telegram messages
    messages = await scrape_telegram()
    
    if not messages:
        print("No messages to process.")
        return
    
    # Step 2: Preprocess messages
    cleaned_messages = preprocess_messages(messages)
    
    # Step 3: Perform sentiment analysis
    sentiments = perform_sentiment_analysis(cleaned_messages)
    
    # Step 4: Feature engineering
    feature_df = extract_features_from_messages(cleaned_messages, sentiments)
    
    # Step 5: Build and evaluate the model
    build_and_evaluate_model(feature_df)

# Run the main function
await main()  # Use await to run the asynchronous function in Jupyter Notebooks


Deleting old session file: telegram_session.session to avoid database lock.


Attempt 1 at connecting failed: TimeoutError: 


Please enter your phone (or bot token):  +917829563977
Please enter the code you received:  69109


Signed in successfully as Kruthika sathyanarayana; remember to not break the ToS or you will risk an account ban!
Connecting to Telegram...
Scraped 44 messages.
Cleaned Messages (Count: 15): ['fundflow  net sell 184987 crore while  net buy 248181 crore in equities today provisional', 'us stock markets today\n\nsp 500 opens 033 down nasdaq falls 042', 'types of ev stocks', 'capital formation companies can raise capital by selling shares to investors this capital can be used to fund growth research and development and other business activities\n economic indicator the stock market is often seen as a barometer of the overall economy a rising market can signal economic growth while a falling market can indicate economic problems\n investment opportunities the stock market offers individuals and institutions the opportunity to invest in a wide range of companies and industries\n\ninvesting in the stock market\n\ninvesting in the stock market can be a rewarding experience but it also involve