# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
nasa = feedparser.parse('https://www.nasa.gov/rss/dyn/lg_image_of_the_day.rss')

In [3]:
type(nasa)

feedparser.util.FeedParserDict

### 2. Obtain a list of components (keys) that are available for this feed.

In [4]:
nasa.keys()

dict_keys(['bozo', 'entries', 'feed', 'headers', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [5]:
nasa['feed'].keys()

dict_keys(['language', 'title', 'title_detail', 'subtitle', 'subtitle_detail', 'links', 'link', 'authors', 'author', 'author_detail', 'publisher', 'publisher_detail', 'docs'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [6]:
# Definir la URL del feed
url = 'https://www.nasa.gov/rss/dyn/lg_image_of_the_day.rss'

# Descargar el feed y analizarlo
feed = feedparser.parse(url)

# Verificar si el atributo existe antes de imprimirlo
if 'title' in feed.feed:
    print('Título:', feed.feed.title)
else:
    print('El feed no tiene título.')

if 'subtitle' in feed.feed:
    print('Subtítulo:', feed.feed.subtitle)
else:
    print('El feed no tiene subtítulo.')

if 'author' in feed.feed:
    print('Autor:', feed.feed.author)
else:
    print('El feed no tiene autor.')

if 'link' in feed.feed:
    print('Enlace:', feed.feed.link)
else:
    print('El feed no tiene enlace.')

Título: NASA Image of the Day
Subtítulo: The latest NASA "Image of the Day" image.
Autor: brian.dunbar@nasa.gov
Enlace: http://www.nasa.gov/


### 5. Count the number of entries that are contained in this RSS feed.

In [7]:
len(nasa["feed"])

13

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [8]:
# Download and parse the feed
feed = feedparser.parse(url)

# Get one of the entries in the feed
entry = feed.entries[0]

# Get the list of keys for the entry
keys = entry.keys()

# Print the list of keys
print("Keys available for the entry:", keys)

Keys available for the entry: dict_keys(['title', 'title_detail', 'links', 'link', 'summary', 'summary_detail', 'id', 'guidislink', 'published', 'published_parsed', 'source'])


### 7. Extract a list of entry titles.

In [9]:
# Extract the list of entry titles
titles = []
for entry in feed.entries:
    titles.append(entry.title)
    
titles

['Astronaut Kjell Lindgren',
 'Hubble Captures an Elusive Galaxy Cluster',
 'The Cygnus space freighter is released from the Canadarm2 robotic arm',
 'Celebrating Earth Day',
 'SuperBIT Sees Colliding Antennae Galaxies',
 "NASA's New 3D-Printed Superalloy Can Take the Heat",
 'Martian Milestone for Ingenuity',
 'Katmai National Park',
 'Astronaut Jessica Watkins',
 'The Himalayas and Mount Everest in Nepal',
 'Bald Eagle Lands at Kennedy Space Center',
 'Hubble Spots a Galaxy with Tendrils',
 'Heavy Rain, Snow Revive Tulare Lake',
 'Celebrating Science at the White House Easter Egg Roll',
 'X-59 Gets Its Tail',
 'Schulz, Snoopy Visit NASA Headquarters',
 'Battling Bots',
 'Artemis II Crew Appears on "The Late Show"',
 'Artemis II Crew Revealed',
 'Patagonian Plankton Swirls',
 'NASA’s Crawler Transporter 2 Sets Record',
 'Crew-4’s Museum Field Trip',
 'Earth’s Radiant Atmosphere',
 'Working on Artemis II',
 'Celebrating Women in STEM',
 'Space Station Star Trail',
 'Photo Archivist and

### 8. Calculate the percentage of "Four short links" entry titles.

In [10]:
# Count the number of "Four short links" entry titles
num_fsl = 0
for entry in feed.entries:
    if "Four short links" in entry.title:
        num_fsl += 1

# Calculate the percentage of "Four short links" entry titles
percent_fsl = num_fsl / len(feed.entries) * 100

# Print the result
print("Percentage of 'Four short links' entry titles: {:.2f}%".format(percent_fsl))

# No se puede obtener

Percentage of 'Four short links' entry titles: 0.00%


### 9. Create a Pandas data frame from the feed's entries.

In [11]:
import pandas as pd

In [12]:
# Extract the desired attributes from the entries
data = []
for entry in feed.entries:
    title = entry.title
    link = entry.link
    date = entry.published
    data.append([title, link, date])

# Create a Pandas data frame
df = pd.DataFrame(data, columns=['Title', 'Link', 'Published'])

df.head()

Unnamed: 0,Title,Link,Published
0,Astronaut Kjell Lindgren,http://www.nasa.gov/faces-of-nasa/kjell-lindgren,"Mon, 01 May 2023 10:50 EDT"
1,Hubble Captures an Elusive Galaxy Cluster,http://www.nasa.gov/image-feature/goddard/2023...,"Fri, 28 Apr 2023 07:00 EDT"
2,The Cygnus space freighter is released from th...,http://www.nasa.gov/image-feature/the-cygnus-s...,"Thu, 27 Apr 2023 10:07 EDT"
3,Celebrating Earth Day,http://www.nasa.gov/image-feature/celebrating-...,"Mon, 24 Apr 2023 12:55 EDT"
4,SuperBIT Sees Colliding Antennae Galaxies,http://www.nasa.gov/image-feature/superbit-see...,"Thu, 20 Apr 2023 12:07 EDT"


### 10. Count the number of entries per author and sort them in descending order.

In [13]:
count_entries = df["Title"].value_counts().sort_values()
count_entries.head()
# al no tenner "author" en mi df elijo filtrar por "Title"

Astronaut Kjell Lindgren                                                 1
X-59 Gets Its Tail                                                       1
The Cygnus space freighter is released from the Canadarm2 robotic arm    1
Celebrating Earth Day                                                    1
SuperBIT Sees Colliding Antennae Galaxies                                1
Name: Title, dtype: int64

### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [14]:
# Create a list of dictionaries that contain the data for each entry
data = []
for entry in feed.entries:
    data.append({
        'title': entry.get('title', 'Unknown'),
        'author': entry.get('author', 'Unknown'),
        'title_length': len(entry.get('title', 'Unknown'))
    })

# Convert the list of dictionaries to a Pandas data frame
df = pd.DataFrame(data)

# Sort the data frame by the title length in descending order
df = df.sort_values(by='title_length', ascending=False)

df.head()

Unnamed: 0,title,author,title_length
44,NASA Celebrates National Engineers Week: Persp...,Unknown,72
2,The Cygnus space freighter is released from th...,Unknown,69
37,Deputy Associate Administrator for STEM Engage...,Unknown,61
13,Celebrating Science at the White House Easter ...,Unknown,54
5,NASA's New 3D-Printed Superalloy Can Take the ...,Unknown,50


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [15]:
# Create an empty list to hold the titles
machine_learning_titles = []

# Loop over the entries and check if the phrase "machine learning" is in the summary
for entry in feed.entries:
    if 'machine learning' in entry.summary.lower():
        machine_learning_titles.append(entry.title)

machine_learning_titles

[]