In [None]:
# Fetching Data From an API
# --------------------------------------------
#
# 1. What is an API?
# - API stands for Application Programming Interface.
# - It allows two software systems to communicate and share data.
# - APIs provide structured and real-time data access from web services.
# - Example: Booking train tickets, checking availability via railway API.
#
# 2. When to use APIs for Data:
# - APIs offer reliable, structured data sources.
# - Preferred over web scraping when available.
# - If no API is available, fall back to web scraping.
#
# 3. Example API used: TMDB (The Movie Database) API
# - Used to fetch movie data such as titles, ratings, release dates.
# - Registration is required to get an API key.
#
# 4. Steps to Fetch Data Using an API:
# - Register and get an API key.
# - Understand API endpoints (URLs for specific data requests).
# - Use Python libraries like 'requests' to send HTTP GET requests.
# - Parse JSON responses to extract needed data fields.
# - Convert the extracted data into a pandas DataFrame for analysis.
#
# 5. Code Demonstration Highlights:
# - Sending requests to TMDB API to get top-rated movies.
# - Extracting specific fields like movie title, release date, popularity, vote count.
# - Handling pagination by looping through multiple pages to collect larger datasets.
# - Creating a consolidated DataFrame combining all fetched data.
#
# 6. Uploading Data:
# - After fetching and storing, datasets can be uploaded to platforms like Kaggle.
# - Enables sharing data professionally for collaboration and competitions.
#
# 7. Practical Tips:
# - Always check API limits and usage policies.
# - Use error handling for failed requests or API timeouts.
# - Validate and clean data after extraction.
# - Explore API documentation thoroughly for available endpoints and parameters.
#
# 8. Summary:
# - APIs are a crucial and efficient data source for machine learning.
# - Using APIs saves time and usually yields cleaner, more consistent data than scraping.
# - Learning to work with APIs prepares practitioners for real-world data collection.
#
# -- End of Notes --


In [38]:
import os

import pandas as pd
import requests

In [41]:
# Request page 1 of TMDB's top-rated movies.
#
# The API key is read from the TMDB_API_KEY environment variable so a real key never
# has to be saved in the notebook; the placeholder is used only if the variable is unset.
# Query parameters are passed via `params=` so requests builds and URL-encodes the
# query string (the hand-written URL contained a stray "&&").
conn = requests.get(
    "https://api.themoviedb.org/3/movie/top_rated",
    params={
        "api_key": os.environ.get("TMDB_API_KEY", "<<your api key>>"),
        "language": "en-US",
        "page": 1,
    },
    timeout=30,  # requests has no default timeout; without one a stalled connection can hang the cell
)
# Fail here on an HTTP error status (bad key, rate limit, server error) instead of
# hitting a confusing KeyError on 'results' in a later cell.
conn.raise_for_status()

In [42]:
# Display the Response object; its repr shows the HTTP status code of the page-1 request.
conn

<Response [200]>

In [43]:
# Tabulate the 'results' list from the page-1 JSON payload and preview its first two rows.
pd.DataFrame(data=conn.json()['results']).iloc[:2]

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/pNjh59JSxChQktamG3LMp9ZoQzp.jpg,"[18, 80]",278,en,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,31.3131,/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg,1994-09-23,The Shawshank Redemption,False,8.712,28861
1,False,/tmU7GeKVybMWFButWEGl2M4GeiP.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",26.8325,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-03-14,The Godfather,False,8.685,21818


In [44]:
# Build the page-1 frame again, this time keeping only the seven fields used for analysis.
df = pd.DataFrame(data=conn.json()['results']).loc[
    :,
    ['id', 'title', 'overview', 'release_date', 'popularity', 'vote_average', 'vote_count'],
]

In [45]:
# Preview the first five rows of the trimmed page-1 frame.
df.head(n=5)

Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
0,278,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,1994-09-23,31.3131,8.712,28861
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,26.8325,8.685,21818
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,1974-12-20,16.5703,8.571,13178
3,424,Schindler's List,The true story of how businessman Oskar Schind...,1993-12-15,12.6319,8.566,16688
4,389,12 Angry Men,The defense and the prosecution have rested an...,1957-04-10,13.1417,8.5,9383


In [46]:
# Start over with an empty frame, discarding the page-1 preview previously assigned to df.
df = pd.DataFrame(data=None)

In [47]:
# Inspect df after it was reset to an empty DataFrame.
df

In [48]:
# Download every page of TMDB's top-rated list and combine the pages into one DataFrame.
#
# - Each page's frame is collected in a list and concatenated ONCE at the end;
#   calling pd.concat inside the loop re-copies all accumulated rows on every iteration.
# - df is rebuilt from scratch, so re-running this cell does not append duplicate rows.
# - Every response is checked with raise_for_status() so an HTTP error (bad key,
#   rate limit, server error) stops the loop with a clear message.
TOP_RATED_URL = "https://api.themoviedb.org/3/movie/top_rated"
MOVIE_COLUMNS = ['id', 'title', 'overview', 'release_date', 'popularity', 'vote_average', 'vote_count']
LAST_PAGE = 428  # pages 1..428 inclusive; 428 pages x 20 rows = the 8560 rows seen in the saved run

# Read the key from the environment so it is never saved in the notebook;
# the placeholder is used only if TMDB_API_KEY is unset.
api_key = os.environ.get("TMDB_API_KEY", "<<your api key>>")

page_frames = []
with requests.Session() as session:  # one Session reuses the underlying connection across requests
    for page in range(1, LAST_PAGE + 1):
        conn = session.get(
            TOP_RATED_URL,
            params={"api_key": api_key, "language": "en-US", "page": page},
            timeout=30,  # requests has no default timeout; avoid hanging on a stalled connection
        )
        conn.raise_for_status()
        page_frames.append(pd.DataFrame(conn.json()['results'])[MOVIE_COLUMNS])

df = pd.concat(page_frames, ignore_index=True)

In [50]:
# (rows, columns) of the combined frame; the saved run reports (8560, 7).
df.shape

(8560, 7)

In [51]:
# Persist the combined dataset to disk.
# index=True is the pandas default, spelled out here: the row index is written as an
# unnamed first column of the CSV.
df.to_csv('movies1.csv', index=True)