# TF-IDF approach

In [0]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
import numpy as np
import string
import re

# Sentence tokenizer data: newer NLTK releases load the 'punkt_tab' package
# for sent_tokenize, older ones load 'punkt', so fetch both up front.
nltk.download('punkt')
nltk.download('punkt_tab')

def preprocess_text(text):
    """
    Normalize text for TF-IDF vectorization.

    Removes ASCII punctuation, collapses every run of whitespace into a
    single space, trims leading/trailing whitespace and lowercases the result.

    Args:
    - text: The raw string to clean.

    Returns:
    - The cleaned, lowercase string.
    """
    # Remove punctuation first: collapsing whitespace beforehand would leave a
    # double space wherever a standalone symbol (e.g. " - ") gets deleted.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse whitespace runs, then trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

def summarize_text(file_path, num_sentences=3):
    """
    Summarizes a text file by identifying the most important sentences using TF-IDF.

    Each sentence is scored by the sum of the TF-IDF weights of its terms
    (English stop words excluded); the highest-scoring sentences are returned
    in descending score order.

    Args:
    - file_path: Path to the input text file.
    - num_sentences: Number of sentences to include in the summary.

    Returns:
    - A summary containing the most relevant sentences, joined by spaces.
      An empty string if the file contains no sentences.

    Raises:
    - OSError: If the file cannot be opened.
    - ValueError: Propagated from TfidfVectorizer when no sentence contains a
      usable term (e.g. the text consists only of stop words).
    """
    # Explicit encoding keeps the read independent of the platform locale;
    # undecodable bytes are dropped rather than aborting the summary.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    # Tokenize into sentences
    sentences = sent_tokenize(content)
    if not sentences:
        # Nothing to rank; fitting the vectorizer on no documents would fail
        return ''

    # Preprocess sentences
    clean_sentences = [preprocess_text(sentence) for sentence in sentences]

    # Create the TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(clean_sentences)

    # Compute sentence scores (sum of TF-IDF values per row); asarray + ravel
    # gives a flat 1-D array whatever matrix/array type the row sum returns.
    sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    # Rank sentences by their scores, best first
    ranked_sentences = np.argsort(sentence_scores)[::-1]

    # Extract top-ranked sentences (original, unprocessed text)
    summary = [sentences[idx] for idx in ranked_sentences[:num_sentences]]
    return ' '.join(summary)

# Example Usage
file_path = ""  # Replace with path
if file_path:
    summary = summarize_text(file_path)
    print("Summary:")
    print(summary)
else:
    # An empty path cannot be opened, so skip the demo until a path is set
    print("Set file_path to a text file to generate a summary.")


In [ ]:
import zipfile
import os

# Where the uploaded archive lives and where its contents should go
zip_path = "/content/bbcnews.zip"  # Replace with the actual uploaded file name if different
extract_path = "/bbcnews"  # Path to extract the files

# Unpack the complete archive into extract_path
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_path)

# Articles and summaries sit side by side under the same dataset folder
dataset_root = os.path.join(extract_path, "BBC News Summary")
articles_path = os.path.join(dataset_root, "News Articles")
summaries_path = os.path.join(dataset_root, "Summaries")

# List both folders to confirm the extraction produced the expected layout
for label, folder in (
    ("News Articles Folders:", articles_path),
    ("Summaries Folders:", summaries_path),
):
    print(label, os.listdir(folder))

In [ ]:
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression
import numpy as np
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Define paths
# The dataset root and the category are each named once so the two folders
# below always refer to the same subset of the data.
dataset_root = "/bbcnews/BBC News Summary"
category = "sport"
articles_path = f"{dataset_root}/News Articles/{category}"  # Folder containing the articles
summaries_path = f"{dataset_root}/Summaries/{category}"  # Folder containing the corresponding summaries

# Preprocess text
_ENGLISH_STOPWORDS = None  # Filled on first use, once the NLTK corpus is available


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """
    Lowercase text, strip punctuation and drop English stop words.

    Args:
    - text: The raw string to clean.

    Returns:
    - The remaining words joined by single spaces (empty string if none).
    """
    # Remove special characters, convert to lowercase, and tokenize
    text = re.sub(r'[^\w\s]', '', text.lower())
    words = text.split()
    if not words:
        return ''
    # Remove stopwords; the set is looked up once per call instead of calling
    # stopwords.words() for every word and scanning the resulting list.
    stop_words = _english_stopwords()
    return ' '.join(word for word in words if word not in stop_words)

# Load data: parallel lists, one entry per article file
articles, summaries, file_ids = [], [], []


def _read_preprocessed(folder, name):
    """Read folder/name as UTF-8 (ignoring bad bytes) and return it preprocessed."""
    with open(os.path.join(folder, name), 'r', encoding='utf-8', errors='ignore') as handle:
        return preprocess_text(handle.read())


for filename in sorted(os.listdir(articles_path)):
    if not filename.endswith(".txt"):
        continue

    # Identifier is everything before the first dot of the file name
    file_id = filename.partition(".")[0]
    file_ids.append(file_id)

    # The summary file carries the same name as its article
    articles.append(_read_preprocessed(articles_path, filename))
    summaries.append(_read_preprocessed(summaries_path, filename))

# Create TF-IDF representations (vocabulary is learned from the articles)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(articles)

# Extract important sentences for summaries
def extract_summary(article, num_sentences=3):
    """
    Build an extractive summary from the sentences most similar to the article.

    Args:
    - article: Article text; it must still contain sentence punctuation,
      otherwise the tokenizer finds a single sentence.
    - num_sentences: Maximum number of sentences to keep.

    Returns:
    - The selected sentences in their original order, joined by spaces
      (empty string if the article has no sentences).
    """
    sentences = nltk.sent_tokenize(article)  # Split into sentences
    if not sentences:
        return ''

    sentence_vectors = tfidf_vectorizer.transform(sentences)
    # Score each sentence against this article only. Comparing against the
    # whole corpus matrix yields one column per training article, and the
    # flattened indices then run past the end of `sentences`.
    article_vector = tfidf_vectorizer.transform([article])
    similarity_scores = cosine_similarity(sentence_vectors, article_vector).ravel()

    # Sort sentences by importance, then restore document order
    top_indices = np.argsort(similarity_scores)[::-1][:num_sentences]
    return ' '.join(sentences[i] for i in sorted(top_indices))

# Fit a linear map from article TF-IDF vectors to summary TF-IDF vectors.
# Summaries are projected into the vocabulary learned from the articles and
# densified to serve as regression targets.
summary_tfidf = tfidf_vectorizer.transform(summaries)
summary_targets = summary_tfidf.toarray()
model = LinearRegression().fit(tfidf_matrix, summary_targets)

# Summarize Reddit post content with the trained model
def summarize_reddit_post(content, num_sentences=3):
    """
    Pick the sentences of a post that best match the model's predicted summary.

    Args:
    - content: Post text. Raw text works best; text whose punctuation was
      already stripped is treated as one single sentence.
    - num_sentences: Maximum number of sentences to return.

    Returns:
    - The selected sentences in their original order, joined by spaces, or
      "No summary could be generated." when the content has no sentences.
    """
    # Split before any cleaning: preprocess_text removes the punctuation that
    # sentence tokenization depends on.
    sentences = nltk.sent_tokenize(content)
    if not sentences:  # If content is empty or has no sentences
        return "No summary could be generated."

    # Clean each sentence the same way the training articles were cleaned
    cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]

    # TF-IDF representation of the whole post and its predicted summary vector
    post_tfidf = tfidf_vectorizer.transform([' '.join(cleaned_sentences)])
    predicted_summary_vector = model.predict(post_tfidf)

    # Compute similarity scores between sentences and predicted summary vector
    sentence_vectors = tfidf_vectorizer.transform(cleaned_sentences)
    similarity_scores = cosine_similarity(sentence_vectors, predicted_summary_vector).ravel()

    # Select the most relevant sentences and return them in document order
    top_indices = np.argsort(similarity_scores)[::-1][:num_sentences]
    return ' '.join(sentences[i] for i in sorted(top_indices))

# Fetching Reddit posts with PRAW

In [ ]:
!pip install praw

In [ ]:
import praw

# Credentials for the Reddit API; fill these in before running the cell
reddit_credentials = {
    "client_id": "",        # use user's id
    "client_secret": "",    # use user's secret
    "user_agent": "",       # use user's agent specification
}

# Initialize Reddit API client
reddit = praw.Reddit(**reddit_credentials)

In [ ]:
# Fetch posts from the "sports" subreddit
MAX_POSTS = 10             # Adjust the limit as needed
MAX_COMMENTS_PER_POST = 5  # Keep only the first few comments for brevity

subreddit = reddit.subreddit("sports")
posts = []
for post in subreddit.hot(limit=MAX_POSTS):
    # Drop the "load more comments" placeholders without fetching them, so
    # every remaining entry is a real comment with a .body attribute.
    post.comments.replace_more(limit=0)

    # Iterate the comment forest directly to get top-level comments only;
    # .list() would flatten replies into the result as well.
    comments = []
    for comment in post.comments:
        if len(comments) >= MAX_COMMENTS_PER_POST:
            break
        comments.append(comment.body)

    posts.append({
        "title": post.title,
        "selftext": post.selftext.strip(),
        "url": post.url,
        "comments": comments
    })

# Print every fetched post: title, body (or the comments when the post has
# no body text), and URL, followed by a blank separator line
for idx, post in enumerate(posts):
    body_text = post['selftext']
    print(f"Post {idx + 1}: {post['title']}")
    if not body_text:
        print("Content: [No text content available]")
        print(f"Using comments: {post['comments']}")
    else:
        print(f"Content: {body_text}")
    print(f"URL: {post['url']}")
    print()

# Test on Reddit

In [ ]:
# Summarize Reddit posts
import nltk
nltk.download('punkt_tab')
def process_reddit_posts(posts):
    """
    Summarize each fetched Reddit post.

    Args:
    - posts: Iterable of dicts with "title" and "url" keys and optional
      "selftext" (str) and "comments" (list of str) keys.

    Returns:
    - A list of dicts with "title", "summary" and "url" keys, one per post.
    """
    summarized_posts = []

    for post in posts:
        # Use content, comments, or title as input (first non-empty wins)
        content = (post.get("selftext") or "").strip()
        if not content:  # If no content, use comments
            content = ' '.join(post.get("comments", [])).strip()
        if not content:  # If no comments, use the title
            content = post["title"]

        # Hand over the raw text: stripping punctuation here would erase the
        # sentence boundaries the summarizer splits on. The TF-IDF vectorizer
        # lowercases and tokenizes by itself and ignores words that are not
        # in its vocabulary.
        summary = summarize_reddit_post(content)

        # Store the summarized post
        summarized_posts.append({
            "title": post["title"],
            "summary": summary,
            "url": post["url"]
        })

    return summarized_posts

# Run the summarizer over everything fetched from Reddit
summarized_reddit_posts = process_reddit_posts(posts)

# Show title, summary and URL per post, each followed by a blank line
for idx, post in enumerate(summarized_reddit_posts):
    print(
        f"Post {idx + 1}: {post['title']}",
        f"Summary: {post['summary']}",
        f"URL: {post['url']}",
        "",
        sep="\n",
    )