# What is Statistical Inference and Why do Digital Humanists need it?

**Statistical Inference:** the theory, methods, and practice of forming judgements about the parameters of a population and the reliability of statistical relationships, typically on the basis of <u> random sampling </u>. (Oxford Dictionary)

**Another definition:** guessing the real value of something based only on a limited sample of observations. 

One common use of statistical inference is to examine which features have the strongest impact on a given target variable. 

The dataset we are going to analyze was downloaded from IMDb. We want to see which features have the biggest impact on a movie's rating. 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [7]:
def _tag_text(tag):
    """Return the stripped text of a BeautifulSoup tag, or None if the tag is missing."""
    return tag.text.strip() if tag is not None else None


# Function to scrape individual movie details
def scrape_movie_details(movie_url, headers=None, timeout=10):
    """Fetch one movie page and extract its basic details.

    Args:
        movie_url: Absolute URL of the movie page.
        headers: Optional dict of HTTP request headers. When omitted, a
            browser-like ``User-Agent`` is sent.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``title``, ``year``, ``actors`` (at most five
        names), ``genre`` and ``rating``. ``title``, ``year`` and ``rating``
        are ``None`` when the corresponding element is not found on the
        page. Returns ``None`` when the request fails or the response status
        is not 200.
    """
    if headers is None:
        # NOTE(review): the run recorded in this notebook received HTTP 403
        # with the default client headers; a browser-like User-Agent is the
        # usual remedy, but confirm it is sufficient for the target site.
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/120.0 Safari/537.36'
            ),
        }

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(movie_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("Request failed:", exc)
        return None

    print("Access status:", response.status_code)
    if response.status_code != 200:
        print("Access denied. Scraping failed.1")
        return None

    print("Access granted. Scraping successful.")
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): these class names look auto-generated and are likely to
    # change when the site is redeployed; verify them against the live page.
    # find() returns None for a missing element, so go through _tag_text
    # instead of calling .text directly.
    title = _tag_text(soup.find('h1'))
    year = _tag_text(soup.find('span', class_='TitleBlockMetaData__ListItemText-sc-12ein40-2 jedhex'))
    actors = [actor.text.strip() for actor in soup.find_all('a', class_='StyledComponents__ActorName-y9ygcu-1 eyqFnv')]
    genre = [genre.text.strip() for genre in soup.find_all('a', class_='GenresAndPlot__GenreChip-cum89p-3 fzmeux ipc-chip ipc-chip--on-baseAlt')]
    rating = _tag_text(soup.find('span', class_='AggregateRatingButton__RatingScore-sc-1ll29m0-1 iTLWoV'))

    return {
        'title': title,
        'year': year,
        'actors': actors[:5],  # Select first 5 actors
        'genre': genre,
        'rating': rating
    }

In [8]:
# Function to scrape the list of movies
def scrape_movie_list(url, headers=None, timeout=10):
    """Fetch a search-results page and scrape every movie it lists.

    Args:
        url: Absolute URL of the search-results page.
        headers: Optional dict of HTTP request headers for the list request.
            When omitted, a browser-like ``User-Agent`` is sent.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one details dict per movie that ``scrape_movie_details``
        could scrape (possibly empty), or ``None`` when the list request
        fails or the response status is not 200.
    """
    if headers is None:
        # NOTE(review): the run recorded in this notebook received HTTP 403
        # with the default client headers; a browser-like User-Agent is the
        # usual remedy, but confirm it is sufficient for the target site.
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/120.0 Safari/537.36'
            ),
        }

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("Request failed:", exc)
        return None

    print("Access status:", response.status_code)
    if response.status_code != 200:
        print("Access denied. Scraping failed.2")
        return None

    print("Access granted. Scraping successful.")
    soup = BeautifulSoup(response.text, 'html.parser')

    movie_details = []
    for movie in soup.find_all('div', class_='lister-item-content'):
        link = movie.find('a', href=True)
        if link is None:
            # An item without a link cannot be followed; skip it rather
            # than failing the whole scrape.
            continue
        # The href is appended to the site root as-is.
        # (assumes it is site-relative, e.g. "/title/..." — TODO confirm)
        movie_url = f"https://www.imdb.com{link['href']}"
        movie_info = scrape_movie_details(movie_url)
        if movie_info:
            movie_details.append(movie_info)

    return movie_details

In [9]:
# Main function to scrape IMDb
def scrape_imdb(url):
    """Scrape the search page at ``url`` and return whatever ``scrape_movie_list`` yields."""
    return scrape_movie_list(url)

In [10]:
# The search URL selects titles whose country of origin is the US and whose
# release date falls between 2000-01-01 and 2000-01-15 (the first half of
# January 2000).
url= 'https://www.imdb.com/search/title/?release_date=2000-01-01,2000-01-15&country_of_origin=US'

# Run the scraper. The result is a list of per-movie dicts, or None when the
# search page could not be fetched.
imdb_data = scrape_imdb(url)
# Bare expression: presumably here so the notebook displays the value.
imdb_data

# Print the data
# for movie in imdb_data:
#     print(movie)

Access status: 403
Access denied. Scraping failed.2
