# Task 1: Exploratory Data Analysis (EDA) - Text Analysis

In [None]:
# -----------------------------
# Imports
# -----------------------------
import sys, pathlib
# if notebook is in project_root/notebooks
sys.path.insert(0, str(pathlib.Path.cwd().parent))  # adds project_root
# or if you want to import modules inside src directly:
# sys.path.insert(0, str(pathlib.Path.cwd().parent / "src"))

import os
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords

# Import your EDA functions
from src.eda_functions import load_data, clean_data

# Download stopwords if necessary
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# -----------------------------
# 1. Text preprocessing
# -----------------------------
def preprocess_text(text):
    """
    Clean one piece of text: lowercase, remove numbers, punctuation, stopwords.

    Steps are applied in this order:
      1. lowercase the text
      2. delete every run of digits
      3. delete every ASCII punctuation character (string.punctuation)
      4. drop whitespace-separated tokens found in the module-level
         ``stop_words`` set

    text: value to clean. Non-string values are converted with str().
          Missing values (None or a float NaN) are treated as empty text.

    Returns the remaining tokens joined by single spaces ('' if none remain).
    """
    # A missing cell would otherwise be stringified into the literal token
    # "none" / "nan" and be counted as a real word downstream.
    # `text != text` is true only for NaN, so no extra import is needed.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'\d+', '', text)                 # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    # Stopwords are filtered last, so tokens are compared after digits and
    # punctuation have already been stripped from them.
    return ' '.join(w for w in text.split() if w not in stop_words)

# -----------------------------
# 2. Keyword / phrase extraction
# -----------------------------
def extract_common_keywords(texts, top_n=20):
    """
    Rank the most frequent unigrams and bigrams across a collection of texts.

    texts: iterable of raw strings; each one is cleaned with preprocess_text
    top_n: how many of the highest-count terms to return

    Returns a list of (term, total count) tuples, highest count first.
    """
    # Single words and two-word phrases, limited to 1000 vocabulary entries
    ngram_counter = CountVectorizer(ngram_range=(1,2), max_features=1000)
    doc_term = ngram_counter.fit_transform(
        [preprocess_text(raw) for raw in texts]
    )

    # Column sums give each term's total occurrences over all documents
    totals = doc_term.sum(axis=0)
    ranked = [
        (term, totals[0, col])
        for col, term in enumerate(ngram_counter.get_feature_names_out())
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return ranked[:top_n]

# -----------------------------
# 3. Topic extraction using LDA
# -----------------------------
def extract_topics(texts, n_topics=5, n_words=10):
    """
    Fit an LDA model on the texts and list the top words of every topic.

    texts: iterable of raw strings; each one is cleaned with preprocess_text
    n_topics: number of LDA components to fit
    n_words: number of highest-weighted words to report per topic

    Returns a list of (topic number starting at 1, list of words) tuples.
    """
    bag_of_words = CountVectorizer(max_features=2000, stop_words='english')
    doc_term = bag_of_words.fit_transform(
        [preprocess_text(raw) for raw in texts]
    )

    # random_state is pinned to 42, as in every call to this function
    model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    model.fit(doc_term)

    vocabulary = bag_of_words.get_feature_names_out()
    # argsort is ascending, so reverse it and keep the first n_words indices
    return [
        (number, [vocabulary[i] for i in weights.argsort()[::-1][:n_words]])
        for number, weights in enumerate(model.components_, start=1)
    ]

# -----------------------------
# 4. Example workflow
# -----------------------------
if __name__ == "__main__":
    # Hard-coded location of the raw analyst-ratings CSV
    raw_file = r"D:\Python\Week-1\Data-Week-1\raw_analyst_ratings.csv"

    # Read the file, then pass the result through the project's cleaning step
    df = clean_data(load_data(raw_file))

    # Text column analysed by the cells below; swap in another column if needed
    headlines = df['headline'].tolist()
 

## Extract top keywords/phrases

In [None]:
   
    # Extract top keywords/phrases
top_keywords = extract_common_keywords(headlines, top_n=20)
print("Top Keywords / Phrases:")
# One "term: count" line per entry, in the order returned
for kw, count in top_keywords:
    print(f"{kw}: {count}")
 

## Extract topics

In [None]:
   
    # Extract topics
topics = extract_topics(headlines, n_topics=5, n_words=5)
print("\nExtracted Topics:")
# One line per topic: its number followed by its top words
for topic_id, words in topics:
    print(f"Topic {topic_id}: {words}")
