### https://youtu.be/0ws5tsRBgL8

# https://www.kinopoisk.ru/lists/top250 web-scraping

### Using the <i>requests</i> library to get the HTML code of this website

In [1]:
import requests

In [34]:
# First page of the Kinopoisk top-250 list.
url = "https://www.kinopoisk.ru/lists/top250/"

# requests has no default timeout: without one, an unresponsive server would
# block this cell forever.
r = requests.get(url, timeout=30)
# Stop here with a clear HTTP error instead of failing later while parsing
# an error page.
r.raise_for_status()


### Using the <i>bs4</i> library to work with the collected code

In [4]:
from bs4 import BeautifulSoup

In [35]:
# Parse the downloaded HTML with the lxml backend so it can be searched by tag and class
soup = BeautifulSoup(markup=r.text, features="lxml")


#### All film cards share a dedicated tag <br>
<font color="green"> div class="desktop-rating-selection-film-item" </font>

In [36]:
#soup.find("div", class_ ="desktop-rating-selection-film-item")

#### Looking for the film's link (tag "a" with class "selection-film-item-meta__link", according to the collected HTML code)

In [6]:
# First film card on the page -> the anchor that wraps its link and metadata
(
    soup
    .find("div", class_="desktop-rating-selection-film-item")
    .find("a", class_="selection-film-item-meta__link")
)

<a class="selection-film-item-meta__link" href="/film/435/"><p class="selection-film-item-meta__name">Зеленая миля</p><p class="selection-film-item-meta__original-name">Green Mile, The, 1999</p><p class="selection-film-item-meta__meta-additional"><span class="selection-film-item-meta__meta-additional-item">США</span><span class="selection-film-item-meta__meta-additional-item">драма, криминал</span></p></a>

In [7]:
# The film's relative URL is the href attribute of the card's anchor
(
    soup
    .find("div", class_="desktop-rating-selection-film-item")
    .find("a", class_="selection-film-item-meta__link")
    .get("href")
)

'/film/435/'

In [8]:
# Prefix the relative href of the first card with the site's domain
link = "kinopoisk.ru" + (
    soup
    .find("div", class_="desktop-rating-selection-film-item")
    .find("a", class_="selection-film-item-meta__link")
    .get("href")
)

link

'kinopoisk.ru/film/435/'

#### Looking for the film's name (in Russian)

In [9]:
# Russian title: the <p> with the "name" class inside the first card's anchor
russian_name = (
    soup
    .find("div", class_="desktop-rating-selection-film-item")
    .find("a", class_="selection-film-item-meta__link")
    .find("p", class_="selection-film-item-meta__name")
    .text
)
russian_name

'Зеленая миля'

#### Looking for the film's original name

In [10]:
# Original title: the <p> with the "original-name" class inside the first card's anchor
original_name = (
    soup
    .find("div", class_="desktop-rating-selection-film-item")
    .find("a", class_="selection-film-item-meta__link")
    .find("p", class_="selection-film-item-meta__original-name")
    .text
)

original_name

'Green Mile, The, 1999'

#### Looking for the film's country and genre

In [11]:
# find() returns only the first matching <span>, which holds the country
country = (
    soup
    .find("div", class_="desktop-rating-selection-film-item")
    .find("a", class_="selection-film-item-meta__link")
    .find("span", class_="selection-film-item-meta__meta-additional-item")
    .text
)

country

'США'

#### Both the country and the genre have the same tag and class. We will use <i>findAll</i> to get the list of all elements with this class and then pick them apart by index

In [12]:
# All "additional item" spans of the first card, in document order
(
    soup
    .find("div", class_="desktop-rating-selection-film-item")
    .find("a", class_="selection-film-item-meta__link")
    .find_all("span", class_="selection-film-item-meta__meta-additional-item")
)


[<span class="selection-film-item-meta__meta-additional-item">США</span>,
 <span class="selection-film-item-meta__meta-additional-item">драма, криминал</span>]

#### The element at index 0 is the country, while the element at index 1 is the genre

In [13]:
# Genre: the second of the card's "additional item" spans (index 1)
film_type = (
    soup
    .find("div", class_="desktop-rating-selection-film-item")
    .find("a", class_="selection-film-item-meta__link")
    .find_all("span", class_="selection-film-item-meta__meta-additional-item")[1]
    .text
)

film_type

'драма, криминал'

#### Looking for the film's rating

In [14]:
# The rating sits in a <span> directly under the card, outside the link anchor
rate = (
    soup
    .find("div", class_="desktop-rating-selection-film-item")
    .find("span", class_="rating__value rating__value_positive")
    .text
)

rate

'9.0'

## Collecting data from all films' cards on the page

In [22]:
# Every film card found in the parsed page
films = soup.find_all("div", class_="desktop-rating-selection-film-item")
len(films)

50

#### The page contains information about only 50 films, but we are working with the top-250 list, so the list is split across several pages

In [23]:
# One row per film card: [link, russian_name, original_name, country, film_type, rate]
data = []

for film in films:
    # Link, both titles, country and genre all live inside this anchor
    meta_link = film.find("a", class_="selection-film-item-meta__link")

    link = "kinopoisk.ru" + meta_link.get("href")
    russian_name = meta_link.find("p", class_="selection-film-item-meta__name").text
    original_name = meta_link.find("p", class_="selection-film-item-meta__original-name").text
    # First "additional item" span is the country, the second one is the genre
    country = meta_link.find("span", class_="selection-film-item-meta__meta-additional-item").text
    film_type = meta_link.find_all("span", class_="selection-film-item-meta__meta-additional-item")[1].text
    # The rating is a sibling of the anchor, so search the whole card for it
    rate = film.find("span", class_="rating__value rating__value_positive").text

    data.append([link, russian_name, original_name, country, film_type, rate])




In [37]:
#data

## My simple plan - switching pages by changing the page number in the URL with
```python
for p in range(1, 6):
    url = f"https://www.kinopoisk.ru/lists/top250/?page={p}&tab=all"
```
## - doesn't work, because this code triggers a captcha


## We have to use a more complicated way of web scraping: the <i>Selenium</i> library together with its dedicated browser driver

In [19]:
from time import sleep

from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as wait
from tqdm import tqdm

In [25]:
# Page that the automated browser opens first
url = "https://www.kinopoisk.ru/lists/top250/"

# Start Chrome through the chromedriver binary at the relative path ./chromedriver
s = Service("./chromedriver")
browser = Chrome(service=s)

browser.get(url)

In [29]:
# Scrape every page of the list through the Selenium-driven browser.
# Each film becomes one row:
#   [link, russian_name, original_name, country, film_type, rate]
# A field that is missing from a card is stored as "" so one odd card does not
# abort the whole run.

N_PAGES = 5            # pages to scrape (5 pages x 50 cards = 250 films)
PAGE_LOAD_SECONDS = 7  # fixed pause that lets the current page finish rendering


def _text_or_empty(node, tag, css_class, index=0):
    """Return the text of the index-th `tag` with class `css_class` under `node`.

    Returns "" when `node` is None or when fewer than `index + 1` elements
    match, instead of raising.
    """
    if node is None:
        return ""
    matches = node.find_all(tag, class_=css_class)
    return matches[index].text if index < len(matches) else ""


data = []

for page_idx in tqdm(range(N_PAGES)):
    sleep(PAGE_LOAD_SECONDS)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    films = soup.find_all("div", class_="desktop-rating-selection-film-item")

    for film in films:
        # Link, both titles, country and genre all live inside this one anchor,
        # so look it up once instead of once per field.
        meta_link = film.find("a", class_="selection-film-item-meta__link")

        href = meta_link.get("href") if meta_link is not None else None
        link = "kinopoisk.ru" + href if href is not None else ""

        russian_name = _text_or_empty(meta_link, "p", "selection-film-item-meta__name")
        original_name = _text_or_empty(meta_link, "p", "selection-film-item-meta__original-name")
        # Country and genre share a tag and class: index 0 is the country,
        # index 1 is the genre.
        country = _text_or_empty(meta_link, "span", "selection-film-item-meta__meta-additional-item", 0)
        film_type = _text_or_empty(meta_link, "span", "selection-film-item-meta__meta-additional-item", 1)
        # The rating is outside the anchor, directly under the card.
        rate = _text_or_empty(film, "span", "rating__value rating__value_positive")

        data.append([link, russian_name, original_name, country, film_type, rate])

    if page_idx == N_PAGES - 1:
        # The last requested page has been scraped; clicking a paginator button
        # now would only navigate away from it.
        continue

    try:
        # On all pages except the first one there are two buttons with this
        # class, "Previous" and "Next". "Next" is always the last one:
        # 1 of 1 on the first page and 2 of 2 on the others.
        browser.find_elements(By.CLASS_NAME, "paginator__page-relative")[-1].click()
    except (IndexError, WebDriverException) as exc:
        # No paginator button found, or the click failed: stop paginating and
        # keep what was collected, but say so instead of failing silently.
        tqdm.write(f"Stopped after page {page_idx + 1}: cannot open the next page ({exc!r})")
        break


100%|██████████| 5/5 [00:38<00:00,  7.67s/it]


In [30]:
# Number of film rows gathered by the scraping loop
len(data)

250

In [31]:
import pandas as pd

In [32]:
# Column names, in the same order as the values appended to each row of `data`
header = [
    "link",
    "russian_name",
    "original_name",
    "country",
    "film_type",
    "rate",
]

In [33]:
# Build the table from the scraped rows and save it as a semicolon-separated CSV file
df = pd.DataFrame(data=data, columns=header)
df.to_csv(
    "./kinopoisk_data.csv",
    sep=";",
    encoding="utf8",
)