# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser

### 1. Use feedparser to parse the following RSS feed URL.

In [3]:
NASA = feedparser.parse('https://www.nasa.gov/rss/dyn/earth.rss')

### 2. Obtain a list of components (keys) that are available for this feed.

In [4]:
NASA.keys()

dict_keys(['bozo', 'entries', 'feed', 'headers', 'href', 'status', 'encoding', 'version', 'namespaces'])

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [5]:
NASA.feed.keys()

dict_keys(['language', 'title', 'title_detail', 'subtitle', 'subtitle_detail', 'links', 'link', 'authors', 'author', 'author_detail', 'publisher', 'publisher_detail', 'docs'])

### 4. Extract and print the feed title, subtitle, author, and link.

In [11]:

print(NASA.feed.title + "\n" + NASA.feed.subtitle + "\n" + NASA.feed.author + "\n" + NASA.feed.link)


Earth News
A RSS news feed containing the latest NASA press releases on Earth-observing missions.
jim.wilson@nasa.gov
http://www.nasa.gov/


### 5. Count the number of entries that are contained in this RSS feed.

In [12]:
len(NASA.keys())

9

### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [25]:
NASA.entries[8].keys()

dict_keys(['title', 'title_detail', 'links', 'link', 'summary', 'summary_detail', 'id', 'guidislink', 'published', 'published_parsed', 'source'])

### 7. Extract a list of entry titles.

In [31]:
NASA_titles =[entry.title for entry in NASA.entries]
list(NASA_titles)

['NASA Experts Available for Interviews About Sea and Sky Campaign',
 'NASA, Rocket Lab Set Coverage for Tropical Cyclones Mission',
 'NASA’s High-Resolution Air Quality Control Instrument Launches',
 'NASA Sets Coverage for Air Quality Instrument Launch',
 'NASA to Host Media Call on Upcoming Air Quality Satellite Launch',
 'NASA Selects L3Harris to Develop Imager for NOAA Satellite',
 'NASA to Participate in Aerospace Conference, Discuss New Collaboration',
 'La NASA afirma que 2022 es el quinto año más cálido registrado',
 'NASA Says 2022 Fifth Warmest Year on Record, Warming Trend Continues',
 'NASA, NOAA to Announce 2022 Global Temperatures, Climate Conditions']

### 8. Calculate the percentage of "Four short links" entry titles.

In [97]:
FSL_count = 0
total_entries = len(NASA.entries)

for entry in NASA.entries:
     if "Four Short Links" in entry.title:
        FSL_count += 1

percentage = (FSL_count/total_entries) *100

percentage

0.0

### 9. Create a Pandas data frame from the feed's entries.

In [56]:
import pandas as pd


NASAframe = pd.DataFrame(NASA.entries)

NASAframe.head(5)

Unnamed: 0,title,title_detail,links,link,summary,summary_detail,id,guidislink,published,published_parsed,source
0,NASA Experts Available for Interviews About Se...,"{'type': 'text/plain', 'language': 'en', 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://www.nasa.gov/press-release/nasa-experts...,"This spring, NASA’s S-MODE (Sub-Mesoscale Ocea...","{'type': 'text/html', 'language': 'en', 'base'...",http://www.nasa.gov/press-release/nasa-experts...,False,"Wed, 26 Apr 2023 14:09 EDT","(2023, 4, 26, 18, 9, 0, 2, 116, 0)",{'href': 'http://www.nasa.gov/rss/dyn/earth.rs...
1,"NASA, Rocket Lab Set Coverage for Tropical Cyc...","{'type': 'text/plain', 'language': 'en', 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://www.nasa.gov/press-release/nasa-rocket-...,"NASA and Rocket Lab are targeting 9 p.m. EDT, ...","{'type': 'text/html', 'language': 'en', 'base'...",http://www.nasa.gov/press-release/nasa-rocket-...,False,"Mon, 24 Apr 2023 15:18 EDT","(2023, 4, 24, 19, 18, 0, 0, 114, 0)",{'href': 'http://www.nasa.gov/rss/dyn/earth.rs...
2,NASA’s High-Resolution Air Quality Control Ins...,"{'type': 'text/plain', 'language': 'en', 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://www.nasa.gov/press-release/nasa-s-high-...,A NASA instrument to provide unprecedented res...,"{'type': 'text/html', 'language': 'en', 'base'...",http://www.nasa.gov/press-release/nasa-s-high-...,False,"Fri, 07 Apr 2023 00:30 EDT","(2023, 4, 7, 4, 30, 0, 4, 97, 0)",{'href': 'http://www.nasa.gov/rss/dyn/earth.rs...
3,NASA Sets Coverage for Air Quality Instrument ...,"{'type': 'text/plain', 'language': 'en', 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://www.nasa.gov/press-release/nasa-sets-co...,NASA and SpaceX are targeting no earlier than ...,"{'type': 'text/html', 'language': 'en', 'base'...",http://www.nasa.gov/press-release/nasa-sets-co...,False,"Thu, 06 Apr 2023 08:24 EDT","(2023, 4, 6, 12, 24, 0, 3, 96, 0)",{'href': 'http://www.nasa.gov/rss/dyn/earth.rs...
4,NASA to Host Media Call on Upcoming Air Qualit...,"{'type': 'text/plain', 'language': 'en', 'base...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://www.nasa.gov/press-release/nasa-to-host...,Media are invited to a NASA media teleconferen...,"{'type': 'text/html', 'language': 'en', 'base'...",http://www.nasa.gov/press-release/nasa-to-host...,False,"Fri, 31 Mar 2023 10:04 EDT","(2023, 3, 31, 14, 4, 0, 4, 90, 0)",{'href': 'http://www.nasa.gov/rss/dyn/earth.rs...


In [68]:
NASAframe.columns






Index(['title', 'title_detail', 'links', 'link', 'summary', 'summary_detail',
       'id', 'guidislink', 'published', 'published_parsed', 'source'],
      dtype='object')

In [None]:
#NASAframe['date_column'] = pd.to_datetime(NASAframe['date_column'].str.replace(' EDT', ''), format='%a, %d %b %Y %H:%M')


In [72]:
#from dateutil import parser

In [71]:
#NASAframe['published'] = NASAframe['published'].apply(parser.parse)

In [73]:
s = 'Wed, 26 Apr 2023 14:09 EDT'

In [79]:
l= list(s.split()[1:-1])

type(l)

list

In [86]:
' '.join(l)

'26 Apr 2023 14:09'

In [91]:
NASAframe['published'] = NASAframe['published'].apply(lambda x: ' '.join(x.split()[:-1]))

AttributeError: 'Timestamp' object has no attribute 'split'

### 10. Count the number of entries per author and sort them in descending order.

In [92]:
published = NASAframe.groupby('published', as_index=False).agg({'title':'count'})


published.columns = ['published', 'entries']
published.sort_values('published', ascending=False)

Unnamed: 0,published,entries
9,2023-04-26 14:09:00,1
8,2023-04-24 15:18:00,1
7,2023-04-07 00:30:00,1
6,2023-04-06 08:24:00,1
5,2023-03-31 10:04:00,1
4,2023-03-13 14:46:00,1
3,2023-01-19 16:50:00,1
2,2023-01-12 10:35:00,1
1,2023-01-12 09:46:00,1
0,2023-01-10 10:22:00,1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [93]:
NASAframe['title_length'] = NASAframe['title'].apply(len)
NASAframe[['title', 'title_length', 'published']].sort_values('title_length', ascending=False).head()

Unnamed: 0,title,title_length,published
6,"NASA to Participate in Aerospace Conference, D...",70,2023-01-19 16:50:00
8,"NASA Says 2022 Fifth Warmest Year on Record, W...",68,2023-01-12 09:46:00
9,"NASA, NOAA to Announce 2022 Global Temperature...",67,2023-01-10 10:22:00
0,NASA Experts Available for Interviews About Se...,64,2023-04-26 14:09:00
4,NASA to Host Media Call on Upcoming Air Qualit...,64,2023-03-31 10:04:00


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [101]:
Air_count = []
total_entries = len(NASA.entries)

for entry in NASA.entries:
    if "Air Quality" in entry.title:
        Air_count.append(entry.title)

Air_count

['NASA’s High-Resolution Air Quality Control Instrument Launches',
 'NASA Sets Coverage for Air Quality Instrument Launch',
 'NASA to Host Media Call on Upcoming Air Quality Satellite Launch']