<a href="https://colab.research.google.com/github/Klaudia2022/Obliczenianaukowe/blob/main/Aspekty_informatyczne_analizy_danych.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 244.3/244.3 kB 4.3 MB/s eta 0:00:00
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [2]:
import pandas as pd
import polars as ps
import re
import requests

from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from tqdm import tqdm

In [42]:
class Filmy:
    """Scrape the Academy Award for Best Picture list from Wikipedia and export it.

    On construction the Wikipedia article is downloaded and parsed into
    ``self.df``, a pandas or polars ``DataFrame`` (selected by ``biblioteka``)
    with the columns ``Rok`` (year label), ``Tytuł`` (title taken from the
    link's ``title`` attribute) and ``Link`` (absolute URL of the film article).
    The ``*_save`` methods write that frame to fixed file names in the current
    working directory.
    """

    _WIKI_ROOT = 'https://en.wikipedia.org'
    _LIST_URL = 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture'
    # Seconds to wait for a response; requests has no timeout by default.
    _TIMEOUT = 30

    def __init__(self, biblioteka: str = 'pandas'):
        """Download the award list and build ``self.df``.

        Args:
            biblioteka: Library used to hold the data, ``'pandas'`` (default)
                or ``'polars'``.

        Raises:
            ValueError: If ``biblioteka`` is neither ``'pandas'`` nor
                ``'polars'``.
            requests.HTTPError: If Wikipedia answers with an error status.
        """
        # Validate first so a bad argument fails before any network traffic.
        if biblioteka not in ('pandas', 'polars'):
            raise ValueError('Podaj bibliotekę pandas lub polars')

        self.biblioteka = biblioteka
        self.website = requests.get(self._LIST_URL, timeout=self._TIMEOUT)
        self.website.raise_for_status()

        soup = BeautifulSoup(self.website.text, 'html.parser')

        years = []
        movies = []
        links = []

        for table_row in tqdm(soup.find_all('tr')):
            for header in table_row.find_all('th', rowspan=True):
                # The year label is the row text up to the first '('.
                year_text, paren, _ = table_row.text.partition('(')

                # The entry is read from the next sibling row that carries a
                # style attribute; its first link holds title and href.
                winner_row = header.parent.find_next_sibling('tr', style=True)
                anchor = winner_row.a if winner_row is not None else None
                if not paren or anchor is None:
                    # Not a year header followed by a linked entry: skip it
                    # instead of crashing on unrelated table rows.
                    continue

                title = anchor.get('title')
                href = anchor.get('href')
                if title is None or href is None:
                    continue

                # Strip whitespace the markup may leave around the label.
                years.append(year_text.strip())
                movies.append(title)
                links.append(self._WIKI_ROOT + href)

        data = {'Rok': years, 'Tytuł': movies, 'Link': links}
        frame_type = pd.DataFrame if biblioteka == 'pandas' else ps.DataFrame
        self.df = frame_type(data)

    def _rows(self):
        """Return an iterator over the rows of ``self.df`` as plain tuples."""
        if self.biblioteka == 'pandas':
            return self.df.itertuples(index=False, name=None)
        return self.df.iter_rows()

    def csv_save(self) -> None:
        """Write the data to ``Academy_Awards.csv`` with a header row.

        The library's own CSV writer is used so that values containing commas,
        quotes or line breaks are quoted instead of corrupting the file.
        """
        if self.biblioteka == 'pandas':
            self.df.to_csv('Academy_Awards.csv', index=False)
        else:
            self.df.write_csv('Academy_Awards.csv')

    def json_save(self) -> None:
        """Write the data to ``Academy_Awards.json``.

        With pandas the file holds one JSON record per line; with polars the
        layout is whatever ``DataFrame.write_json`` produces by default.
        """
        if self.biblioteka == 'pandas':
            self.df.to_json('Academy_Awards.json', orient='records', lines=True)
        else:
            self.df.write_json('Academy_Awards.json')

    def docx_save(self) -> None:
        """Write the data to ``Academy_Awards.docx``.

        The document starts with a centred title; each row then becomes one
        paragraph per column (bold column name followed by the value) and is
        followed by a page break.
        """
        document = Document()
        heading = document.add_heading('Academy Award for Best Picture', 0)
        heading.alignment = WD_ALIGN_PARAGRAPH.CENTER

        columns = list(self.df.columns)
        for row in self._rows():
            for column, value in zip(columns, row):
                paragraph = document.add_paragraph()
                paragraph.add_run(column + ': ').bold = True
                # Convert to str so non-string cell values can be written too.
                paragraph.add_run(str(value))

            document.add_page_break()

        document.save('Academy_Awards.docx')

    @staticmethod
    def movie_detail_one(url: str) -> tuple[str, str, str]:
        """Fetch one film article and extract its plot and director.

        Args:
            url: Absolute URL of the film's Wikipedia article.

        Returns:
            A ``(plot, director, url)`` tuple. ``plot`` and ``director`` are
            empty strings when the page has no matching section or row.
        """
        movie_website = requests.get(url, timeout=Filmy._TIMEOUT)
        movie_soup = BeautifulSoup(movie_website.text, 'html.parser')

        # Default to '' so a page without a "Directed by" row does not leave
        # the name unbound.
        director = ''
        for table_row in movie_soup.find_all('tr'):
            if table_row.text.startswith('Directed by'):
                director = table_row.text.split('Directed by')[1]

        # Collect the run of consecutive <p> siblings that follows the element
        # wrapping the <h2 id="Plot"> heading.
        paragraphs = []
        for heading in movie_soup.find_all('h2', {'id': 'Plot'}):
            sibling = heading.parent.find_next_sibling('p')
            # find_next_sibling returns None once there are no more siblings.
            while sibling is not None and sibling.name == 'p':
                paragraphs.append(sibling.text)
                sibling = sibling.find_next_sibling()

        return ''.join(paragraphs), director, url

    def movie_detail(self, max_workers: int = 5) -> None:
        """Add plot and director columns fetched from each film's article.

        The pages are downloaded concurrently. Afterwards ``self.df`` gains the
        columns ``Fabuła`` (plot) and ``Reżyser`` (director) and loses ``Link``.

        Args:
            max_workers: Number of threads used for the downloads.
        """
        # Fetch every distinct link once; unique keys also guarantee that the
        # left join below cannot multiply rows.
        unique_links = list(dict.fromkeys(self.df['Link']))

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(self.movie_detail_one, link)
                for link in unique_links
            ]
            # Progress advances as downloads complete, not as they are queued.
            results = [future.result() for future in tqdm(futures)]

        details = {
            'Fabuła': [plot for plot, _, _ in results],
            'Reżyser': [director for _, director, _ in results],
            'Link': [link for _, _, link in results],
        }

        if self.biblioteka == 'pandas':
            df2 = pd.DataFrame(details)

            self.df = pd.merge(self.df, df2, how='left', on='Link')
            self.df = self.df.drop(columns=['Link'])
        else:
            df2 = ps.DataFrame(details)

            self.df = self.df.join(df2, on='Link', how='left')
            self.df = self.df.drop('Link')