In [6]:
pip install --upgrade pip

Collecting pip
  Using cached pip-24.3.1-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.2.3
    Uninstalling pip-20.2.3:
      Successfully uninstalled pip-20.2.3
Successfully installed pip-24.3.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install zeep

Note: you may need to restart the kernel to use updated packages.


In [29]:
import csv
import io
import random
import re
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup
from zeep import Client

In [3]:
def check_webpage(url, timeout=10):
    """Open ``url`` with ``urlopen`` and describe the outcome as a string.

    Args:
        url: Address to open.
        timeout: Seconds handed to ``urlopen`` so that an unresponsive server
            cannot block the notebook indefinitely.

    Returns:
        ``"Page found with status: <status>"`` when the page opens; otherwise
        a message starting with ``"HTTPError:"``, ``"URLError:"`` or
        ``"Unexpected error:"``. Failures are reported in the returned string
        rather than raised.
    """
    try:
        # The context manager closes the connection as soon as we are done.
        with urlopen(url, timeout=timeout) as response:
            return f"Page found with status: {response.status}"
    # HTTPError is a subclass of URLError, so it has to be handled first.
    except HTTPError as e:
        return f"HTTPError: {e.code} - {e.reason}"
    except URLError as e:
        return f"URLError: {e.reason}"
    except Exception as e:
        return f"Unexpected error: {str(e)}"

In [4]:
def check_ssl(url, timeout=10):
    """Report whether ``url`` can be fetched with certificate verification on.

    Args:
        url: Address to request; it must use the ``https`` scheme for the
            check to be meaningful.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        ``"SSL Certificate is valid. Status Code: <code>"`` when the verified
        request succeeds, ``"SSL Certificate is not valid."`` when requests
        raises ``SSLError``, and ``"Error: <details>"`` for anything else,
        including URLs that do not use HTTPS. Nothing is raised.
    """
    try:
        # A plain-HTTP request never performs a TLS handshake, so a successful
        # response would say nothing about any certificate.
        if not url.lower().startswith("https://"):
            return "Error: URL does not use HTTPS, so there is no certificate to validate."
        response = requests.get(url, verify=True, timeout=timeout)
        return f"SSL Certificate is valid. Status Code: {response.status_code}"
    except requests.exceptions.SSLError:
        return "SSL Certificate is not valid."
    except Exception as e:
        return f"Error: {str(e)}"

In [5]:
def fetch_site_info(url, timeout=10):
    """GET ``url`` and collect the main attributes of the response in a dict.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the notebook.

    Returns:
        A dict with the keys ``"Status Code"``, ``"Headers"``, ``"URL"``,
        ``"History"``, ``"Encoding"``, ``"Reason"``, ``"Cookies"``,
        ``"Elapsed Time"``, ``"Request"`` and ``"Content"``. ``"Content"``
        holds only the first 500 bytes of the body.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    response = requests.get(url, timeout=timeout)
    return {
        "Status Code": response.status_code,
        "Headers": response.headers,
        "URL": response.url,
        "History": response.history,
        "Encoding": response.encoding,
        "Reason": response.reason,
        "Cookies": response.cookies.get_dict(),
        "Elapsed Time": response.elapsed,
        "Request": response.request,
        "Content": response.content[:500]  # Trim content for readability
    }

In [6]:
def fetch_robots_txt(url, timeout=10):
    """Download the ``robots.txt`` that belongs to the site hosting ``url``.

    The request always goes to ``/robots.txt`` at the root of the host, so a
    trailing slash or a deeper page path in ``url`` does not produce a wrong
    address.

    Args:
        url: Any absolute URL on the target site.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The response body when the status code is 200, otherwise the string
        ``"robots.txt not found."``.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    # The leading slash makes urljoin replace the whole path of ``url``.
    robots_url = urljoin(url, "/robots.txt")
    response = requests.get(robots_url, timeout=timeout)
    return response.text if response.status_code == 200 else "robots.txt not found."


In [7]:
def fetch_h1(url, timeout=10):
    """Return the stripped text of the first ``<h1>`` element found at ``url``.

    Args:
        url: Address of the HTML page to open with ``urlopen``.
        timeout: Seconds passed to ``urlopen`` as its ``timeout``.

    Returns:
        The heading text with surrounding whitespace removed, or the string
        ``"No h1 tag found."`` when the parsed page has no ``<h1>``.

    Exceptions raised by ``urlopen`` are not caught here.
    """
    # Parse inside the ``with`` block so the connection is closed afterwards
    # instead of being left open.
    with urlopen(url, timeout=timeout) as response:
        soup = BeautifulSoup(response, "html.parser")
    h1_tag = soup.find("h1")
    return h1_tag.text.strip() if h1_tag else "No h1 tag found."

In [8]:
def fetch_headers(url, timeout=10):
    """Return the text of every ``<h1>``–``<h6>`` element on the page at ``url``.

    Despite the name, this collects HTML heading elements from the document,
    not HTTP response headers.

    Args:
        url: Address of the HTML page to open with ``urlopen``.
        timeout: Seconds passed to ``urlopen`` as its ``timeout``.

    Returns:
        A list with the stripped text of each matching element, in the order
        ``find_all`` returns them (empty when there is none).

    Exceptions raised by ``urlopen`` are not caught here.
    """
    # Parse inside the ``with`` block so the connection is closed afterwards
    # instead of being left open.
    with urlopen(url, timeout=timeout) as response:
        soup = BeautifulSoup(response, "html.parser")
    # The anchored pattern matches exactly the tag names h1 through h6.
    headers = soup.find_all(re.compile("^h[1-6]$"))
    return [header.text.strip() for header in headers]

In [9]:
def fetch_links(url, timeout=10):
    """Collect the ``href`` value of every anchor on the page at ``url``.

    Args:
        url: Address of the HTML page to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the notebook.

    Returns:
        A list of ``href`` strings exactly as they appear in the markup
        (they are not resolved against ``url``), in the order ``find_all``
        returns them. Anchors without an ``href`` attribute are skipped.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    # ``href=True`` restricts the search to <a> tags that carry the attribute.
    return [a['href'] for a in soup.find_all('a', href=True)]

In [10]:
def count_csv_rows(url, timeout=10):
    """Download the CSV document at ``url`` and count the rows it contains.

    Every row produced by ``csv.reader`` is counted, so a header line, if the
    file has one, is included in the total.

    Args:
        url: Address of the CSV resource.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The number of rows as an ``int``.

    The HTTP status code is not inspected: whatever body comes back is parsed
    as CSV. Exceptions raised by ``requests.get`` are not caught here.
    """
    response = requests.get(url, timeout=timeout)
    # Feed the text through a file-like object opened with newline="" as the
    # csv module expects. ``str.splitlines`` would also break rows on form
    # feeds and Unicode line separators inside a field, inflating the count.
    reader = csv.reader(io.StringIO(response.text, newline=""))
    return sum(1 for _ in reader)

In [35]:
def fetch_imdb_random_movies(url, count=10, timeout=10):
    """Pick ``count`` random title/description pairs from an IMDb chart page.

    Args:
        url: Address of the chart page to scrape.
        count: Number of entries to sample.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the notebook.

    Returns:
        On success, a list of ``count`` dicts with the keys ``"title"`` and
        ``"description"``, chosen with ``random.sample``. On failure, a plain
        string describing the problem (non-200 status, nothing matched by the
        selectors, or fewer than ``count`` entries parsed). Callers must
        therefore check the type of the result.

    NOTE(review): the selectors look for ``td.titleColumn`` cells. When a page
    has none, the parse-failure string is returned — confirm the selectors
    against the markup of the page being scraped.
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        )
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return f"Failed to fetch IMDb page. Status Code: {response.status_code}"

    soup = BeautifulSoup(response.text, "html.parser")

    # Check the selectors: both must match something for the page to be usable.
    movies = soup.select("td.titleColumn a")
    descriptions = soup.select("td.titleColumn span.secondaryInfo")

    if not movies or not descriptions:
        return "Failed to parse IMDb page. No movies found."

    # zip pairs titles with descriptions by position and stops at the shorter
    # of the two lists.
    movie_list = [
        {"title": movie.text.strip(), "description": desc.text.strip()}
        for movie, desc in zip(movies, descriptions)
    ]

    if len(movie_list) < count:
        return f"Requested {count} movies, but only {len(movie_list)} are available."

    return random.sample(movie_list, count)

In [12]:
def soap_service_example(wsdl_url, method_name, **kwargs):
    """Invoke one operation of a SOAP service described by a WSDL document.

    A ``Client`` is built from ``wsdl_url``, the attribute named
    ``method_name`` is looked up on its ``service`` object, and that attribute
    is called with ``kwargs`` as keyword arguments.

    Returns whatever the looked-up operation returns; no exception is caught.
    """
    service = Client(wsdl_url).service
    operation = getattr(service, method_name)
    result = operation(**kwargs)
    return result

In [16]:
# Smoke-test check_webpage against a neutral documentation domain.
print(check_webpage("http://www.example.com"))

Page found with status: 200


In [18]:
# Check that https://www.python.org can be fetched with certificate verification enabled.
print(check_ssl("https://www.python.org"))

SSL Certificate is valid. Status Code: 200


In [20]:
# Print the response metadata dict that fetch_site_info builds for python.org.
print(fetch_site_info("https://www.python.org"))

{'Status Code': 200, 'Headers': {'Connection': 'keep-alive', 'Content-Length': '12095', 'content-type': 'text/html; charset=utf-8', 'x-frame-options': 'SAMEORIGIN', 'via': '1.1 varnish, 1.1 varnish, 1.1 varnish', 'content-encoding': 'gzip', 'Accept-Ranges': 'bytes', 'Date': 'Mon, 25 Nov 2024 15:34:32 GMT', 'Age': '3586', 'X-Served-By': 'cache-iad-kiad7000114-IAD, cache-iad-kiad7000114-IAD, cache-ams2100117-AMS', 'X-Cache': 'MISS, HIT, HIT', 'X-Cache-Hits': '0, 41, 4', 'X-Timer': 'S1732548872.210428,VS0,VE0', 'Vary': 'Cookie', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload'}, 'URL': 'https://www.python.org/', 'History': [], 'Encoding': 'utf-8', 'Reason': 'OK', 'Cookies': {}, 'Elapsed Time': datetime.timedelta(microseconds=401662), 'Request': <PreparedRequest [GET]>, 'Content': b'<!doctype html>\n<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->\n<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->\n<!

In [21]:
# Print the robots.txt served by en.wikipedia.org.
print(fetch_robots_txt("https://en.wikipedia.org"))

﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: 

In [22]:
# Print the text of the first <h1> element on example.com.
print(fetch_h1("http://www.example.com"))

Example Domain


In [30]:
# List the text of every h1-h6 heading on Wikipedia's main page (HTML headings, not HTTP headers).
print(fetch_headers("https://en.wikipedia.org/wiki/Main_Page"))

['Main Page', 'Welcome to Wikipedia', "From today's featured article", 'Did you know\xa0...', 'In the news', 'On this day', "From today's featured list", "Today's featured picture", 'Other areas of Wikipedia', "Wikipedia's sister projects", 'Wikipedia languages']


In [24]:
# Print the href of every <a> tag on the Wikipedia "Python" page, as written in the markup.
print(fetch_links("https://en.wikipedia.org/wiki/Python"))

['#bodyContent', '/wiki/Main_Page', '/wiki/Wikipedia:Contents', '/wiki/Portal:Current_events', '/wiki/Special:Random', '/wiki/Wikipedia:About', '//en.wikipedia.org/wiki/Wikipedia:Contact_us', '/wiki/Help:Contents', '/wiki/Help:Introduction', '/wiki/Wikipedia:Community_portal', '/wiki/Special:RecentChanges', '/wiki/Wikipedia:File_upload_wizard', '/wiki/Main_Page', '/wiki/Special:Search', 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en', '/w/index.php?title=Special:CreateAccount&returnto=Python', '/w/index.php?title=Special:UserLogin&returnto=Python', 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en', '/w/index.php?title=Special:CreateAccount&returnto=Python', '/w/index.php?title=Special:UserLogin&returnto=Python', '/wiki/Help:Introduction', '/wiki/Special:MyContributions', '/wiki/Special:MyTalk'

In [25]:
# Count the rows of the USGS 4.5_month.csv earthquake feed; every row the CSV
# reader yields is counted, so any header line is part of the total.
print(count_csv_rows("http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/4.5_month.csv"))

416


In [36]:
# Ask for ten random entries from the IMDb chart page at /chart/top.
imdb_url = "https://www.imdb.com/chart/top"
print("\nRandom IMDb Movies:")
imdb_movies = fetch_imdb_random_movies(imdb_url, count=10)
# fetch_imdb_random_movies returns a plain string on failure and a list of
# dicts on success, so branch on the type before formatting.
if isinstance(imdb_movies, str):
    print(imdb_movies)
else:
    for movie in imdb_movies:
        print(f"{movie['title']}: {movie['description']}")


Random IMDb Movies:
Failed to parse IMDb page. No movies found.


In [28]:
# Call the FullCountryInfo operation of the CountryInfoService WSDL, passing
# sCountryISOCode="US" as its keyword argument, and print the result.
wsdl = "http://webservices.oorsprong.org/websamples.countryinfo/CountryInfoService.wso?WSDL"
print(soap_service_example(wsdl, "FullCountryInfo", sCountryISOCode="US"))

{
    'sISOCode': 'US',
    'sName': 'United States',
    'sCapitalCity': 'Washington',
    'sPhoneCode': '1',
    'sContinentCode': 'AM',
    'sCurrencyISOCode': 'USD',
    'sCountryFlag': 'http://www.oorsprong.org/WebSamples.CountryInfo/Flags/USA.jpg',
    'Languages': {
        'tLanguage': [
            {
                'sISOCode': 'eng',
                'sName': 'English'
            }
        ]
    }
}
