## Imports

In [1]:
from bs4 import BeautifulSoup as bs
import requests

import pandas as pd

## Getting urls and genres to search

To restrict the results to a single genre, set the corresponding genre variable below (leave it as an empty string to apply no genre filter).
Example: 

movie_genre = 'comedy'

In [2]:
# Genre filters. An empty string leaves the `genres` query parameter blank.
movie_genre = ''
serie_genre = ''
game_genre = ''

# All three searches share one URL shape: only the title type, the end of the
# release-date window and the genre differ. Results are sorted by vote count
# (descending), 250 per page, starting from the first result.
search_url_template = (
    'https://www.imdb.com/search/title/?title_type={title_type}'
    '&release_date=2015-01-01,{release_end}'
    '&genres={genre}'
    '&sort=num_votes,desc&count=250&start=0'
)

url_serie = search_url_template.format(
    title_type='tv_series', release_end='2020-12-31', genre=serie_genre)
url_movie = search_url_template.format(
    title_type='feature', release_end='2020-12-31', genre=movie_genre)
# NOTE: the games window ends one year earlier (2019) than the other two.
url_game = search_url_template.format(
    title_type='video_game', release_end='2019-12-31', genre=game_genre)

## Requests and parsing with BeautifulSoup

In [3]:
# Download the three IMDb search pages.
# requests applies no timeout unless one is given, so a stalled connection
# would otherwise block the notebook indefinitely.
request_timeout = 30  # seconds

movies_req = requests.get(url_movie, timeout=request_timeout)
series_req = requests.get(url_serie, timeout=request_timeout)
games_req = requests.get(url_game, timeout=request_timeout)

# Fail loudly on an HTTP error (4xx/5xx) rather than handing an error page to
# the parser as if it were a page of search results.
for response in (movies_req, series_req, games_req):
    response.raise_for_status()

In [4]:
# Parse the body of each response with the lxml parser; one soup per title
# type, in the order movies, series, games.
soup_movie, soup_serie, soup_game = [
    bs(response.text, 'lxml')
    for response in (movies_req, series_req, games_req)
]

In [5]:
# Each search result is looked up as a <div> carrying this class string.
item_class = 'lister-item mode-advanced'

movie_contents = soup_movie.find_all('div', class_=item_class)
serie_contents = soup_serie.find_all('div', class_=item_class)
game_contents = soup_game.find_all('div', class_=item_class)

Here I inspect a single result item to find the names of the classes I am going to search for, and to check what each lookup returns.

In [6]:
# Grab the first game result only, as a sample item for the probes below
# (find() returns None when nothing matches).
test = soup_game.find(name='div', attrs={'class': 'lister-item mode-advanced'})

In [7]:
# Scratch cell: probes the sample item `test` to confirm which tags/classes
# hold each field. These are bare expressions, so the notebook displays only
# the value of the last one (the genres).
# Title: text of the link inside the <h3> header
test.h3.a.text
# Year: text of the year span inside the <h3> header
test.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
# Rating: text of the first <strong> tag in the item
test.strong.text
# Vote count: text of the span whose name attribute is "nv"
test.find('span', attrs = {'name':'nv'}).text
# Age rating: text of the "certificate" span
test.find('span', attrs = {'class':'certificate'}).text
# Genres: text of the "genre" span, with surrounding whitespace removed
test.find('span', attrs = {'class':'genre'}).text.strip()

'Action, Adventure, Drama'

## Extracting the data from the responses and appending it to lists named after the kind of entertainment being collected

In [8]:
# Column lists for the movies dataframe; each kept row appends one value to
# every list that is filled below.
movies_names = []
movies_years = []
movies_imdb_ratings = []
movies_metascores = []  # declared but never filled in this cell
movies_votes = []
movies_ages = []
movies_runtimes = []
movies_genres = []

for content in movie_contents:
    # Look up every tag the row needs before touching any list. A title that
    # lacks one of them is skipped as a whole, instead of raising halfway
    # through the row and leaving the lists with different lengths.
    title_tag = content.h3.a
    year_tag = content.h3.find('span', class_ = 'lister-item-year')
    rating_tag = content.strong
    votes_tag = content.find('span', attrs = {'name':'nv'})
    certificate_tag = content.find('span', attrs = {'class':'certificate'})
    runtime_tag = content.find('span', attrs = {'class':'runtime'})
    genre_tag = content.find('span', attrs = {'class':'genre'})
    required_tags = (title_tag, year_tag, rating_tag, votes_tag,
                     certificate_tag, runtime_tag, genre_tag)
    if any(tag is None for tag in required_tags):
        continue

    # The name of the movie
    movies_name = title_tag.text
    # The year of the movie: keep only the last parenthesised group, so a
    # disambiguation prefix such as "(I) " or "(V) " is dropped whatever
    # numeral it holds, then remove the surrounding parentheses.
    movies_year = year_tag.text.rsplit('(', 1)[-1].strip('()')
    # The rating of the movie
    movies_imdb = float(rating_tag.text)
    # The number of votes of the movie (raw count from the data-value attribute)
    movies_vote = votes_tag['data-value']
    # The age rating of the movie
    movies_age_rating = certificate_tag.text
    # The runtime of the movie, kept as text with the " min" suffix removed
    movies_runtime = runtime_tag.text.strip(' min')
    # The genres of the movie
    movies_genre = genre_tag.text.strip()

    # Append the complete row in one go so all lists stay aligned.
    movies_names.append(movies_name)
    movies_years.append(movies_year)
    movies_imdb_ratings.append(movies_imdb)
    movies_votes.append(int(movies_vote))
    movies_ages.append(movies_age_rating)
    movies_runtimes.append(movies_runtime)
    movies_genres.append(movies_genre)

In [9]:
# Column lists for the series dataframe.
series_names, series_years, series_imdb_ratings, series_metascores = [], [], [], []
series_votes, series_ages, series_runtimes, series_genres = [], [], [], []

for content in serie_contents:
    certificate_tag = content.find('span', attrs = {'class':'certificate'})
    runtime_tag = content.find('span', attrs = {'class':'runtime'})
    # Only series that list both an age rating and a runtime are kept.
    if certificate_tag is None or runtime_tag is None:
        continue

    # Title: text of the link inside the <h3> header.
    series_name = content.h3.a.text
    series_names.append(series_name)

    # Year span, stored exactly as shown on the page (parentheses included).
    series_year = content.h3.find('span', class_ = 'lister-item-year').text
    series_years.append(series_year)

    # IMDb rating, converted to a float.
    series_imdb = float(content.strong.text)
    series_imdb_ratings.append(series_imdb)

    # Vote count, read from the data-value attribute and stored as an int.
    series_vote = content.find('span', attrs = {'name':'nv'})['data-value']
    series_votes.append(int(series_vote))

    # Age rating.
    series_age_rating = certificate_tag.text
    series_ages.append(series_age_rating)

    # Runtime, kept as text with the " min" suffix characters stripped.
    series_runtime = runtime_tag.text.strip(' min')
    series_runtimes.append(series_runtime)

    # Genres, with surrounding whitespace removed.
    series_genre = content.find('span', attrs = {'class':'genre'}).text.strip()
    series_genres.append(series_genre)

In [10]:
# Column lists for the games dataframe; each kept row appends one value to
# every list that is filled below.
game_names = []
game_years = []
game_imdb_ratings = []
game_metascores = []  # declared but never filled in this cell
game_votes = []
game_ages = []
game_runtimes = []  # declared but never filled in this cell
game_genres = []

for content in game_contents:
    # Look up every tag the row needs before touching any list. A game that
    # lacks one of them is skipped as a whole, instead of raising halfway
    # through the row and leaving the lists with different lengths.
    title_tag = content.h3.a
    year_tag = content.h3.find('span', class_ = 'lister-item-year')
    rating_tag = content.strong
    votes_tag = content.find('span', attrs = {'name':'nv'})
    certificate_tag = content.find('span', attrs = {'class':'certificate'})
    genre_tag = content.find('span', attrs = {'class':'genre'})
    required_tags = (title_tag, year_tag, rating_tag, votes_tag,
                     certificate_tag, genre_tag)
    if any(tag is None for tag in required_tags):
        continue

    # The name of the game
    game_name = title_tag.text
    # The year of the game: keep only the last parenthesised group, so a
    # disambiguation prefix such as "(I) " is dropped, then strip the
    # parentheses and the " Video Game" label characters around the year.
    game_year = year_tag.text.rsplit('(', 1)[-1].strip('( Video Game)')
    # The rating of the game
    game_imdb = float(rating_tag.text)
    # The number of votes of the game (raw count from the data-value attribute)
    game_vote = votes_tag['data-value']
    # The age rating of the game
    game_age_rating = certificate_tag.text
    # The genres of the game
    game_genre = genre_tag.text.strip()

    # Append the complete row in one go so all lists stay aligned.
    game_names.append(game_name)
    game_years.append(game_year)
    game_imdb_ratings.append(game_imdb)
    game_votes.append(int(game_vote))
    game_ages.append(game_age_rating)
    game_genres.append(game_genre)

## Creating a dataframe for movies, series and games

In [11]:
# One dataframe per title type, built column by column from the lists filled
# in the extraction cells. Column order follows the keyword order below.
movie_dataframe = pd.DataFrame(dict(
    movie=movies_names,
    year=movies_years,
    imdb=movies_imdb_ratings,
    number_votes=movies_votes,
    age_rating=movies_ages,
    runtime=movies_runtimes,
    genres=movies_genres,
))

series_dataframe = pd.DataFrame(dict(
    serie=series_names,
    year=series_years,
    imdb=series_imdb_ratings,
    number_votes=series_votes,
    age_rating=series_ages,
    runtime=series_runtimes,
    genres=series_genres,
))

# Games have no runtime column; the title column is capitalised ('Game').
games_dataframe = pd.DataFrame(dict(
    Game=game_names,
    year=game_years,
    imdb=game_imdb_ratings,
    number_votes=game_votes,
    age_rating=game_ages,
    genres=game_genres,
))

## Inspecting the contents of each dataframe

In [12]:
# Preview the first five movie rows.
movie_dataframe.head(n=5)

Unnamed: 0,movie,year,imdb,number_votes,age_rating,runtime,genres
0,Deadpool,2016,8.0,852465,16,108,"Action, Adventure, Comedy"
1,Star Wars: O Despertar da Força,2015,7.9,817641,12,138,"Action, Adventure, Sci-Fi"
2,Mad Max: Estrada da Fúria,2015,8.1,812677,16,120,"Action, Adventure, Sci-Fi"
3,Vingadores: Guerra Infinita,2018,8.5,736936,12,149,"Action, Adventure, Sci-Fi"
4,Perdido em Marte,2015,8.0,708099,12,144,"Adventure, Drama, Sci-Fi"


In [13]:
# Preview the first five series rows.
series_dataframe.head(n=5)

Unnamed: 0,serie,year,imdb,number_votes,age_rating,runtime,genres
0,Stranger Things,(2016– ),8.8,693450,14,51,"Drama, Fantasy, Horror"
1,Westworld,(2016– ),8.7,370915,16,62,"Drama, Mystery, Sci-Fi"
2,Demolidor,(2015–2018),8.6,340702,18,54,"Action, Crime, Drama"
3,Narcos,(2015–2017),8.8,314157,16,49,"Biography, Crime, Drama"
4,Mr. Robot: Sociedade Hacker,(2015–2019),8.5,300530,16,49,"Crime, Drama, Thriller"


In [14]:
# Preview the first five game rows.
games_dataframe.head(n=5)

Unnamed: 0,Game,year,imdb,number_votes,age_rating,genres
0,Uncharted 4: A Thief's End,2016,9.5,17644,14,"Action, Adventure, Drama"
1,Wiedzmin 3: Dziki Gon,2015,9.7,15903,16,"Action, Adventure, Drama"
2,Red Dead Redemption II,2018,9.7,15329,18,"Action, Adventure, Crime"
3,God of War,2018,9.6,12691,18,"Action, Adventure, Drama"
4,Batman: Arkham Knight,2015,9.0,12512,16,"Action, Adventure, Crime"


## And finally, saving each dataframe to a CSV file

In [16]:
# Write each dataframe to its own CSV file in the working directory, using
# to_csv's default settings.
csv_exports = (
    (movie_dataframe, 'imdb_movies.csv'),
    (series_dataframe, 'imdb_series.csv'),
    (games_dataframe, 'imdb_games.csv'),
)
for frame, csv_name in csv_exports:
    frame.to_csv(csv_name)