# Day 15: 30 Days of python programming

## Python PIP - Python Package Manager

In [1]:
#Q1: Read this url and find the 10 most frequent words. romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112-0.txt'

# import requests
# import webbrowser

# url = 'http://www.gutenberg.org/files/1112/1112-0.txt'

# webbrowser.open_new(url)


import requests
from collections import Counter
import re

# Fetch the text of Romeo and Juliet
url = 'http://www.gutenberg.org/files/1112/1112.txt'
response = requests.get(url)
text = response.text

# Clean and split the text into words
words = re.findall(r'\b\w+\b', text.lower())

# Find the 10 most common words
most_common_words = Counter(words).most_common(10)
most_common_words


[('a', 70),
 ('li', 60),
 ('href', 40),
 ('class', 22),
 ('html', 20),
 ('gutenberg', 20),
 ('content', 14),
 ('div', 14),
 ('help', 14),
 ('meta', 13)]

In [1]:
''' Q2: Read the cats API and cats_api = 'https://api.thecatapi.com/v1/breeds' and find :
the min, max, mean, median, standard deviation of cats' weight in metric units.
the min, max, mean, median, standard deviation of cats' lifespan in years.
Create a frequency table of country and breed of cats'''

import requests
import numpy as np
import pandas as pd

# Fetch the data from the Cat API
cats_api = 'https://api.thecatapi.com/v1/breeds'
response = requests.get(cats_api)
cats_data = response.json()

# Extract weight (metric) and lifespan
weights = []
lifespans = []
countries = []
breeds = []

for cat in cats_data:
    # Get weight in metric (split into min/max values)
    weight = cat['weight']['metric'].split(" - ")
    weights.append([int(weight[0]), int(weight[1])])
    
    # Get lifespan (split into min/max values)
    lifespan = cat['life_span'].split(" - ")
    lifespans.append([int(lifespan[0]), int(lifespan[1])])
    
    # Get country and breed
    countries.append(cat['origin'])
    breeds.append(cat['name'])

# Calculate min, max, mean, median, and std for weights and lifespans
weights_array = np.array(weights)
weight_min = weights_array[:, 0].min()
weight_max = weights_array[:, 1].max()
weight_mean = weights_array.mean()
weight_median = np.median(weights_array)
weight_std = weights_array.std()

lifespans_array = np.array(lifespans)
lifespan_min = lifespans_array[:, 0].min()
lifespan_max = lifespans_array[:, 1].max()
lifespan_mean = lifespans_array.mean()
lifespan_median = np.median(lifespans_array)
lifespan_std = lifespans_array.std()

# Create a frequency table of countries and breeds
df = pd.DataFrame({
    'Country': countries,
    'Breed': breeds
})

frequency_table = df.groupby('Country')['Breed'].count()

# Print results
print(f"Weight (metric) - Min: {weight_min}, Max: {weight_max}, Mean: {weight_mean:.2f}, Median: {weight_median}, Std: {weight_std:.2f}")
print(f"Lifespan (years) - Min: {lifespan_min}, Max: {lifespan_max}, Mean: {lifespan_mean:.2f}, Median: {lifespan_median}, Std: {lifespan_std:.2f}")
print("\nFrequency Table of Country and Breeds:")
print(frequency_table)



Weight (metric) - Min: 2, Max: 11, Mean: 4.71, Median: 5.0, Std: 1.89
Lifespan (years) - Min: 8, Max: 20, Mean: 13.75, Median: 14.0, Std: 2.40

Frequency Table of Country and Breeds:
Country
Australia                1
Burma                    2
Canada                   3
China                    1
Cyprus                   1
Egypt                    3
France                   2
Greece                   1
Iran (Persia)            1
Isle of Man              1
Japan                    1
Norway                   1
Russia                   4
Singapore                1
Somalia                  1
Thailand                 4
Turkey                   2
United Arab Emirates     1
United Kingdom           8
United States           28
Name: Breed, dtype: int64


In [2]:
'''
Q3: Read the countries API and find
the 10 largest countries
the 10 most spoken languages
the total number of languages in the countries API
'''

import requests
import pandas as pd
from collections import Counter

# Fetch the data from the Countries API
countries_api = 'https://restcountries.com/v2/all'
response = requests.get(countries_api)
countries_data = response.json()

# Find the 10 largest countries by area
countries_by_area = sorted(countries_data, key=lambda x: x.get('area', 0), reverse=True)[:10]
largest_countries = [(country['name'], country['area']) for country in countries_by_area]

# Extract and count languages
all_languages = []
for country in countries_data:
    for language in country.get('languages', []):
        all_languages.append(language['name'])

# Find the 10 most spoken languages
language_counts = Counter(all_languages)
most_spoken_languages = language_counts.most_common(10)

# Find the total number of distinct languages
total_languages = len(language_counts)

# Print results
print("The 10 largest countries by area:")
for country, area in largest_countries:
    print(f"{country}: {area} sq km")

print("\nThe 10 most spoken languages:")
for language, count in most_spoken_languages:
    print(f"{language}: spoken in {count} countries")

print(f"\nTotal number of distinct languages: {total_languages}")



The 10 largest countries by area:
Russian Federation: 17124442.0 sq km
Antarctica: 14000000.0 sq km
Canada: 9984670.0 sq km
China: 9640011.0 sq km
United States of America: 9629091.0 sq km
Brazil: 8515767.0 sq km
Australia: 7692024.0 sq km
India: 3287590.0 sq km
Argentina: 2780400.0 sq km
Kazakhstan: 2724900.0 sq km

The 10 most spoken languages:
English: spoken in 91 countries
French: spoken in 45 countries
Arabic: spoken in 25 countries
Spanish: spoken in 24 countries
Portuguese: spoken in 10 countries
Russian: spoken in 8 countries
Dutch: spoken in 8 countries
German: spoken in 7 countries
Chinese: spoken in 5 countries
Serbian: spoken in 4 countries

Total number of distinct languages: 123


In [1]:
'''Q4: UCI is one of the most common places to get data sets for data science and machine learning. Read the content of UCL (https://archive.ics.uci.edu/ml/datasets.php). Without additional libraries it will be difficult, so you may try it with BeautifulSoup4
'''

import requests
from bs4 import BeautifulSoup

# Fetch the content of the UCI Machine Learning Repository
url = 'https://archive.ics.uci.edu/ml/datasets.php'
response = requests.get(url)

# Parse the page content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all the dataset names in the table (usually under 'table' tags)
table = soup.find_all('table', {'cellpadding': '3'})

# Extract dataset names from the table if table is not empty
datasets = []
if table:
    for row in table[0].find_all('tr')[1:]:
        cols = row.find_all('td')
        if cols:
            dataset_name = cols[0].text.strip()
            datasets.append(dataset_name)

# Print the list of dataset names
print("UCI Machine Learning Repository Datasets:")
for i, dataset in enumerate(datasets, 1):
    print(f"{i}. {dataset}")



UCI Machine Learning Repository Datasets:
