<a href="https://colab.research.google.com/github/Mercymerine/Capstone_Movie_Recommendation_System/blob/main/main_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install feedparser

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=a04ed19df1e76ecbc1e6d737116eec31c8bea2cce6c2625bb00ddd3a400cfa91
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


In [4]:
!pip install schedule

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [8]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [9]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import feedparser
import pandas as pd
import csv
import schedule
import time
from bs4 import BeautifulSoup
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss

# Load the spaCy 'en_core_web_sm' model once at module level; preprocess_text()
# reuses this shared `nlp` object for every document and query it processes.
nlp = spacy.load('en_core_web_sm')

# URL of the RSS feed that fetch_rss() parses on every pipeline run.
RSS_URL = 'https://rss.app/feeds/atZtRJTsJwJI7KSQ.xml'

# Function to fetch RSS data
def fetch_rss(url=None, csv_path='rss_feed.csv'):
    """Download the RSS feed and append one CSV row per entry.

    Args:
        url: Feed URL to parse. Defaults to the module-level ``RSS_URL``.
        csv_path: CSV file to append to. The header row
            (Date, Title, Author, Summary, Category, Link) is written first
            when the file is empty.

    Returns:
        None. Progress and errors are reported with ``print``.

    Notes:
        This is deliberately best-effort: any exception is caught and printed
        so a scheduled run does not stop the scheduler. Rows are appended on
        every call, so the CSV can contain repeated entries across runs.
    """
    feed_url = RSS_URL if url is None else url
    try:
        feed = feedparser.parse(feed_url)
        if not feed.entries:
            print("No entries found in the RSS feed.")
            return

        with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)

            # Append mode starts at the end of the file, so offset 0 means the
            # file is new or empty and still needs its header.
            if csvfile.tell() == 0:
                writer.writerow(['Date', 'Title', 'Author', 'Summary', 'Category', 'Link'])

            for entry in feed.entries:
                # Prefer feedparser's normalised timestamp. Splitting the raw
                # string on 'T' only suits ISO-8601 and mangles RFC 822 dates
                # such as "Fri, 11 Oct 2024 16:00:00 GMT" (-> "... GM").
                published_parsed = entry.get('published_parsed')
                if published_parsed:
                    date = time.strftime('%Y-%m-%d', published_parsed)
                else:
                    date = entry.get('published', 'N/A')

                title = entry.get('title', 'N/A')
                author = entry.get('author', 'N/A')

                # Summaries may contain HTML; keep only the visible text.
                summary = entry.get('summary', 'N/A')
                summary_text = BeautifulSoup(summary, 'html.parser').get_text()

                # A tag without a term would make str.join raise and abort the
                # whole batch half-written, so empty terms are skipped.
                terms = [tag.get('term') for tag in (entry.get('tags') or [])]
                categories = ', '.join(term for term in terms if term) or 'N/A'
                link = entry.get('link', 'N/A')

                writer.writerow([date, title, author, summary_text, categories, link])

        print(f"RSS feed fetched and written to CSV at {time.strftime('%Y-%m-%d %H:%M:%S')}")

    except Exception as e:
        print(f"An error occurred while fetching the RSS feed: {e}")

# Function to process the RSS data
def process_rss_data(csv_path='rss_feed.csv'):
    """Load the fetched RSS rows and prepare them for text search.

    Args:
        csv_path: CSV file to read; it must contain the columns
            Date, Title, Author, Summary, Category and Link.

    Returns:
        pandas.DataFrame with ``Summary`` replaced by a plain-text ``Text``
        column, duplicate rows removed, missing values filled with
        placeholders, and a ``combined_text`` column holding the output of
        ``preprocess_text`` applied to "<Title> <Text>".

    Raises:
        FileNotFoundError: if ``csv_path`` does not exist.
    """
    raw = pd.read_csv(csv_path)  # Load the most recent data

    cleaned = (
        raw
        .assign(Date=pd.to_datetime(raw['Date'], errors='coerce'))
        .dropna(subset=['Title'])
    )

    # read_csv turns the 'N/A' placeholder (and empty cells) into NaN. Fill
    # those *before* casting to str; otherwise astype(str) yields the literal
    # text 'nan', which then pollutes the search corpus.
    summary = cleaned['Summary'].fillna('Text Not Found').astype(str)
    text = summary.apply(lambda html: BeautifulSoup(html, 'html.parser').get_text())

    cleaned = (
        cleaned
        .assign(Text=text)
        .drop(columns=['Summary'])
        .drop_duplicates()
        # Same placeholders as before. Note that filling 'Date' with a string
        # turns that column into mixed object dtype when any date is missing.
        .fillna({
            'Author': 'Unknown',
            'Link': 'Not Found',
            'Text': 'Text Not Found',
            'Title': 'Title Not Found',
            'Date': 'Date Not Found'
        })
    )

    # Titles that read_csv parsed as numbers are cast to str so the
    # concatenation below cannot fail.
    combined = cleaned['Title'].astype(str) + ' ' + cleaned['Text']

    # Lemmatise / strip stop words for the TF-IDF step.
    return cleaned.assign(combined_text=combined.apply(preprocess_text))

# Text preprocessing function
def preprocess_text(text):
    """Lower-case ``text``, run it through the shared spaCy pipeline and
    return the lemmas of all tokens that are neither stop words nor
    punctuation, joined by single spaces."""
    lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)


# Function to convert text to vectors using TF-IDF
def convert_to_vectors(df):
    """Fit a fresh TF-IDF vectorizer on ``df['combined_text']``.

    Returns:
        (vectors, tfidf): the TF-IDF matrix for the column and the fitted
        vectorizer, so that queries can later be transformed with it.
    """
    vectorizer = TfidfVectorizer()
    corpus = df['combined_text']
    matrix = vectorizer.fit_transform(corpus)
    return matrix, vectorizer

# Function to perform Faiss search
def faiss_search(vectors, query_vector, tfidf, top_n=10):
    """Find the documents closest to a text query using an exact L2 Faiss index.

    Args:
        vectors: TF-IDF matrix with one row per document. It is densified
            with ``.toarray()``, so memory grows with rows x vocabulary size.
        query_vector: Despite the name, this is the (already preprocessed)
            query *string*; it is vectorised here with ``tfidf``.
        tfidf: The fitted vectorizer that produced ``vectors``.
        top_n: Maximum number of neighbours to return.

    Returns:
        (I, D): arrays of shape (1, k) with k = min(top_n, number of rows).
        ``I`` holds row positions into ``vectors`` and ``D`` the matching
        distances reported by the index (smaller means closer).
    """
    dense = vectors.toarray().astype('float32')
    index = faiss.IndexFlatL2(dense.shape[1])
    index.add(dense)

    # Asking for more neighbours than there are rows makes Faiss pad the
    # result with label -1, which callers would misread as "last row".
    k = min(top_n, index.ntotal)
    if k <= 0:
        return np.empty((1, 0), dtype='int64'), np.empty((1, 0), dtype='float32')

    # Transform the query using the same TF-IDF vectorizer
    query = tfidf.transform([query_vector]).toarray().astype('float32')
    D, I = index.search(query, k)

    return I, D

# Function to recommend articles based on processed data
def recommend_articles(df, query, top_n=10):
    """Return the articles in ``df`` that best match ``query``.

    Args:
        df: Frame with ``combined_text``, ``Link`` and ``Author`` columns
            (as produced by ``process_rss_data``).
        query: Free-text query; it goes through ``preprocess_text`` just like
            the documents do.
        top_n: Maximum number of results.

    Returns:
        List of ``(link, combined_text, author, score)`` tuples sorted by
        descending score, where ``score = 1 / (1 + distance)``. Rows with
        identical ``combined_text`` are reported once. Returns an empty list
        when ``df`` has no rows or ``top_n`` is not positive.
    """
    # An empty corpus cannot be vectorised, and a non-positive k is not a
    # valid search size, so answer both cases directly.
    if df.empty or top_n <= 0:
        return []

    query_processed = preprocess_text(query)  # Preprocess the query
    vectors, tfidf = convert_to_vectors(df)

    I, D = faiss_search(vectors, query_processed, tfidf, top_n)

    results = []
    seen_titles = set()

    # Walk labels and distances together rather than looking each distance up
    # by label, which was quadratic and wrong for repeated labels.
    for idx, dist in zip(I[0], D[0]):
        # Faiss uses -1 for "no neighbour"; df.iloc[-1] would silently
        # return the last row instead.
        if idx < 0:
            continue
        row = df.iloc[idx]
        title = row['combined_text']
        if title in seen_titles:
            continue
        seen_titles.add(title)
        final_score = 1 / (1 + dist)  # Example scoring
        results.append((row['Link'], title, row['Author'], final_score))

    return sorted(results, key=lambda x: x[3], reverse=True)[:top_n]

# Main function to automate the process
def main(query='Kenya', top_n=5):
    """Run one pipeline pass: fetch the feed, process the CSV, print matches.

    Args:
        query: Search text used for the recommendations (example default).
        top_n: Maximum number of recommendations to print.

    Returns:
        None. One line is printed per recommendation.
    """
    fetch_rss()  # Fetch new data

    # fetch_rss swallows its own errors and writes nothing for an empty feed,
    # so the CSV may not exist yet; skip this pass instead of letting the
    # exception kill the scheduler loop.
    try:
        processed_data = process_rss_data()  # Process the new data
    except FileNotFoundError:
        print("No RSS data file found yet; skipping this run.")
        return

    results = recommend_articles(processed_data, query=query, top_n=top_n)

    # Print the results. The value shown as "Title" is the preprocessed
    # combined text returned by recommend_articles, not the raw headline.
    for link, text, author, _score in results:
        print(f"Link: {link}, Title: {text}, Author: {author}")

# Schedule the pipeline to run every 60 minutes
schedule.every(60).minutes.do(main)

# Run the scheduled task. The loop polls once per second and blocks this cell
# until the kernel is interrupted; the interrupt is caught so stopping the
# cell ends cleanly instead of dumping a traceback.
try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    print("Scheduler stopped.")


RSS feed fetched and written to CSV at 2024-10-11 16:22:39


  rss['Date'] = pd.to_datetime(rss['Date'], errors='coerce')


Link: https://www.usatoday.com/story/news/politics/elections/2024/10/11/obama-calls-out-black-men-kamala-harris-support/75625647007/, Title: obama call black man hesitance harris think sit president barack obama call oublack man reluctant support kamala harris suggest woman, Author: Joey Garrison
Link: https://www.theledger.com/story/weather/hurricane/2024/10/11/lakeland-electric-says-it-could-be-7-days-duke-outages-grow-in-polk/75625587007/, Title: power restore polk lakeland electric make steady progress restore power say restoration seven day duke power outage grow polk county, Author: The Ledger
Link: https://www.theguardian.com/us-news/live/2024/oct/11/trump-harris-us-elections-obama-latest-updates, Title: obama take trump lie fake strength urge man vote harris president scoff idea trump bullying show strength speech condemn hurricane lie ask ok, Author: the Guardian
Link: https://www.aljazeera.com/news/2024/10/11/israeli-forces-again-target-un-peacekeepers-in-southern-lebanon, Ti