In [58]:
from bs4 import BeautifulSoup
import re
import csv

def clean_title(title):
    return ' '.join(title.split()).strip()

def extract_movie_details(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    movies = soup.find_all('tr')[1:]  # Skipping the header row
    movie_list = []

    for movie in movies:
        try:
            info_cell = movie.find_all('td')[1]
            full_text = info_cell.get_text(separator=' ').strip()

            # Improved extraction of title and year
            title_year_match = re.search(r'([^\(]+)\((\d{4})', full_text)
            if title_year_match:
                title = clean_title(title_year_match.group(1))
                year = title_year_match.group(2).strip()
            else:
                continue  # Skip the movie if title/year not found

            # Extract runtime
            runtime_match = re.search(r'(\d+)\s+minutes', full_text)
            runtime = runtime_match.group(1) if runtime_match else "Runtime not found"
        
            movie_list.append((title, year, runtime))
        except Exception as e:
            print(f"An error occurred while processing a movie: {e}")

    return movie_list


In [59]:
# List of filenames from 1950 to 2023
years = range(1940, 2024)  # From 1950 to 2023 inclusive
directory = "html/"
filenames = [f"{directory}{year}.html" for year in years]

# CSV filename
csv_filename = 'movies.csv'

# Append to the csv file
with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Process each HTML file
    for filename in filenames:
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                html_content = file.read()

            # Extract movie details
            movie_details = extract_movie_details(html_content)

            # Write each movie's details to the CSV
            for movie in movie_details:
                csvwriter.writerow(movie)

        except Exception as e:
            print(f"An error occurred while processing file {filename}: {e}")

print(f"Movie details have been appended to {csv_filename}")

Movie details have been appended to movies.csv


In [60]:
with open('html/1979.html', 'r', encoding='utf-8') as file:
    html_content = file.read()
# Extract movie details
movie_details = extract_movie_details(html_content)
for movie in movie_details:
    print(movie)


('Alien', '1979', '116')
('All That Jazz', '1979', '123')
('...And Justice For All', '1979', '119')
('Apocalypse Now', '1979', '139')
('Being There', '1979', '130')
('The Black Stallion', '1979', '118')
('Breaking Away', '1979', '100')
('The China Syndrome', '1979', '122')
('The Jerk', '1979', '94')
('Kramer vs. Kramer', '1979', '104')
('Mad Max', '1979', '93')
('Manhattan', '1979', '96')
('The Marriage of Maria Braun', '1979', '120')
("Monty Python's) Life of Brian", '1979', '93')
('Moscow Does Not Believe in Tears', '1979', '140')
('The Muppet Movie', '1979', '95')
('Norma Rae', '1979', '110')
('North Dallas Forty', '1979', '119')
('Nosferatu: The Vampyr', '1979', '107')
('The Onion Field', '1979', '126')
('Real Life', '1979', '99')
('Star Trek - The Motion Picture', '1979', '132')
('10', '1979', '121')
('Tess', '1979', '170')
('The Tin Drum', '1979', '142')
('The Warriors', '1979', '90')
