<a href="https://colab.research.google.com/github/Mercymerine/Capstone_Movie_Recommendation_System/blob/main/main_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install feedparser

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=9709d7ee0252bdab08dc3834565e67fee9fe100b969fc0b09c7b164ca4bf4128
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


In [4]:
!pip install schedule

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [9]:
import feedparser
import pandas as pd
import csv
import schedule
import time
from bs4 import BeautifulSoup
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# RSS Feed URL
RSS_URL = 'https://rss.app/feeds/atZtRJTsJwJI7KSQ.xml'

# Function to fetch RSS data
def fetch_rss():
    try:
        feed = feedparser.parse(RSS_URL)
        if not feed.entries:
            print("No entries found in the RSS feed.")
            return

        with open('rss_feed.csv', 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)

            # Write the header if the file is empty
            if csvfile.tell() == 0:
                writer.writerow(['Date', 'Title', 'Author', 'Summary', 'Category', 'Link'])

            for entry in feed.entries:
                date = entry.published.split('T')[0] if 'published' in entry else 'N/A'
                title = entry.title if 'title' in entry else 'N/A'
                author = entry.author if 'author' in entry else 'N/A'
                summary = entry.summary if 'summary' in entry else 'N/A'
                soup = BeautifulSoup(summary, 'html.parser')
                summary_text = soup.get_text()

                categories = ', '.join([cat.term for cat in entry.tags]) if 'tags' in entry else 'N/A'
                link = entry.link if 'link' in entry else 'N/A'

                writer.writerow([date, title, author, summary_text, categories, link])

        print(f"RSS feed fetched and written to CSV at {time.strftime('%Y-%m-%d %H:%M:%S')}")

    except Exception as e:
        print(f"An error occurred while fetching the RSS feed: {e}")

# Function to process the RSS data
def process_rss_data():
    rss = pd.read_csv('rss_feed.csv')  # Load the most recent data
    rss['Date'] = pd.to_datetime(rss['Date'], errors='coerce')

    # Clean and preprocess the DataFrame
    rss.dropna(subset=['Title'], inplace=True)
    rss['Summary'] = rss['Summary'].astype(str)
    rss['Text'] = rss['Summary'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
    rss.drop(['Summary'], axis=1, inplace=True)
    rss.drop_duplicates(inplace=True)

    # Replace missing values
    rss.fillna({
        'Author': 'Unknown',
        'Link': 'Not Found',
        'Text': 'Text Not Found',
        'Title': 'Title Not Found',
        'Date': 'Date Not Found'
    }, inplace=True)

    # Create a combined text column
    rss['combined_text'] = rss['Title'] + ' ' + rss['Text']

    # Process text for NLP
    rss['combined_text'] = rss['combined_text'].apply(lambda x: preprocess_text(x))

    return rss

# Text preprocessing function
def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens)


# Function to convert text to vectors using TF-IDF
def convert_to_vectors(df):
    """Converts the 'combined_text' column to TF-IDF vectors."""
    tfidf = TfidfVectorizer()
    vectors = tfidf.fit_transform(df['combined_text'])
    return vectors, tfidf

# Function to perform Faiss search (placeholder)
def faiss_search(vectors, query_vector, top_n=5):
    """Placeholder for Faiss search. Replace with actual Faiss implementation."""
    # This is a placeholder; you need to install Faiss and implement the search logic
    # For now, it returns dummy results
    D = np.array([[0.1, 0.2, 0.3, 0.4, 0.5]]) # Example distances
    I = np.array([[0, 1, 2, 3, 4]])  # Example indices
    return I, D

# Function to recommend articles based on processed data
def recommend_articles(df, query, top_n=5):
    # Assuming a function to convert to vectors and normalize recency is defined
    vectors, tfidf = convert_to_vectors(df)  # Implement this function as needed
    query_vector = tfidf.transform([query]).toarray()

    I, D = faiss_search(vectors, query_vector, top_n=top_n)  # Implement this function as needed

    results = []
    seen_titles = set()

    for idx in I[0]:
        row = df.iloc[idx]
        final_score = 1 / (1 + D[0][I[0].tolist().index(idx)])  # Example scoring
        title = row['combined_text']
        if title not in seen_titles:
            results.append((row['Link'], title, row['Author'], final_score))
            seen_titles.add(title)

    return sorted(results, key=lambda x: x[3], reverse=True)[:top_n]

# Main function to automate the process
def main():
    fetch_rss()  # Fetch new data
    processed_data = process_rss_data()  # Process the new data
    results = recommend_articles(processed_data, query='your_query_here', top_n=5)  # Example query

    # Print the results
    for link, text, author, score in results:
        print(f"Link: {link}, Title: {text}, Author: {author}")

# Schedule the pipeline to run every 5 minutes
schedule.every(5).minutes.do(main)

# Run the scheduled task
while True:
    schedule.run_pending()
    time.sleep(1)


RSS feed fetched and written to CSV at 2024-10-10 16:40:19
Link: https://abcnews.go.com/US/video/clearwater-fl-mayor-latest-concerns-amidst-hurricane-milton-114670000, Title: video clearwater fl mayor late concern amidst hurricane milton abc news linsey davis speak clearwater mayor bruce rector unpack big concern milton make landfall immediate step storm condition slow, Author: ABC News
Link: https://www.cnn.com/2024/10/10/politics/ethel-kennedy-dies/index.html, Title: ethel kennedy human right activist widow robert f. kennedy die 96 ethel kennedy widow robert f. kennedy die family announce thursday 96, Author: Karl de Vries
Link: https://www.washingtonpost.com/obituaries/2024/10/10/rfk-widow-ethel-kennedy-dies/, Title: ethel kennedy widow robert kennedy family matriarch die 96 scarred tragedy devote slain husband memory home salon official washington, Author: Matt Schudel
Link: https://www.nytimes.com/2024/10/10/us/politics/ethel-kennedy-dead.html, Title: ethel kennedy passionate supp

KeyboardInterrupt: 