# Scraping Wikipedia pages to build a dataset

##### Import libraries

In [3]:
import random
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs # for scrape

In [4]:
def get_content_value(row_data):
    """Extract the text of one infobox value cell.

    Args:
        row_data: a parsed element exposing ``find``, ``find_all`` and
            ``get_text`` (here: the ``<td>`` of an infobox row).

    Returns:
        A list of strings (one per ``<li>``) when the cell contains list
        items, otherwise a single string with the cell's text. In both cases
        non-breaking spaces are replaced by ordinary spaces.
    """
    def _clean_text(node):
        # Join nested text fragments with a space and normalise '\xa0'.
        return node.get_text(' ', strip=True).replace('\xa0', ' ')

    if not row_data.find('li'):
        return _clean_text(row_data)
    return [_clean_text(item) for item in row_data.find_all('li')]

def get_info_box(url, timeout=10):
    """Download a page and return the contents of its film infobox.

    Args:
        url: address of the article to scrape.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scraping run.

    Returns:
        A dict with a ``'title'`` entry (text of the first row's header) plus
        one entry per infobox row that has both a ``<th>`` label and a
        ``<td>`` value; values come from ``get_content_value``. Returns
        ``None`` when the page has no infobox or when the download/parsing
        fails (the reason is printed).
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Treat HTTP errors (404, 429, 5xx ...) as failures instead of
        # parsing an error page.
        r.raise_for_status()

        # Name the parser explicitly: otherwise bs4 picks whichever parser is
        # installed (and warns), which makes results machine-dependent.
        soup = bs(r.content, 'html.parser')

        # CSS selector matches elements carrying both classes, regardless of
        # their order or of extra classes in the attribute.
        info_box = soup.select_one('.infobox.vevent')
        if info_box is None:
            print(f"  No infobox found for {url}")
            return None

        movie_info = {}

        for index, row in enumerate(info_box.find_all('tr')):
            header = row.find('th')
            if header is None:
                # Rows without a label (typically the poster/image row)
                # carry no key/value pair; skipping them here replaces the
                # old unconditional "skip row 1", which lost a real field on
                # pages that have no image row.
                continue

            if index == 0:
                movie_info['title'] = header.get_text(' ', strip=True)
                continue

            cell = row.find('td')
            if cell is None:
                # Section-heading rows have a <th> but no value cell.
                continue

            content_key = header.get_text(' ', strip=True)
            movie_info[content_key] = get_content_value(cell)

        return movie_info
    except Exception as e:
        # Best effort by design: one bad page must not stop the crawl.
        print(f"Can't check this URL {url}: {e}")
        return None


In [5]:
# Crawl the yearly "<year> in film" pages and collect the infobox of the
# first few film links found in each page's sortable tables.

# Scrape configuration -- tune these instead of editing the loop body.
FIRST_YEAR = 1950
LAST_YEAR = 2024          # inclusive
MOVIES_PER_YEAR = 10      # only the first N film links of each year page
REQUEST_TIMEOUT = 10      # seconds to wait for a year page

base_path = 'https://en.wikipedia.org/'

all_movie_info = []

for year in range(FIRST_YEAR, LAST_YEAR + 1):
    print(f"Collecting data for {year} year...")

    url = f'https://en.wikipedia.org/wiki/{year}_in_film'

    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as e:
        # A single failed year page should not abort the whole crawl.
        print(f"  Warning: Could not load the page for {year}: {e}")
        time.sleep(random.uniform(2, 5))
        continue

    # Explicit parser keeps results identical across machines.
    soup = bs(r.content, 'html.parser')

    # Italicised links inside sortable wikitables; require an href so the
    # lookup below cannot fail on anchors without one.
    movies = soup.select(".wikitable.sortable i a[href]")

    for movie in movies[:MOVIES_PER_YEAR]:
        relative_path = movie['href']
        # urljoin avoids the double slash produced by plain concatenation
        # ('https://en.wikipedia.org/' + '/wiki/...').
        full_path = urljoin(base_path, relative_path)
        # Fall back to the link text when the anchor has no title attribute.
        title = movie.get('title') or movie.get_text(' ', strip=True)

        print(f"Working with: {title}")

        movie_info = get_info_box(full_path)
        if movie_info is not None:
            movie_info['year'] = year
            all_movie_info.append(movie_info)
        else:
            print(f"  Warning: Could not retrieve info for {title}")

        # Randomised pause between article requests to stay polite.
        time.sleep(random.uniform(1, 3))

    # Longer pause between year pages.
    time.sleep(random.uniform(2, 5))

Collecting data for 1950 year...
Working with: Samson and Delilah (1949 film)
Working with: King Solomon's Mines (1950 film)
Working with: Annie Get Your Gun (film)
Working with: Cheaper by the Dozen (1950 film)
Working with: Cinderella (1950 film)
Working with: Born Yesterday (1950 film)
Working with: Father of the Bride (1950 film)
Working with: Broken Arrow (1950 film)
Working with: All About Eve
Working with: Three Little Words (film)
Collecting data for 1951 year...
Working with: Quo Vadis (1951 film)
Working with: Show Boat (1951 film)
Working with: David and Bathsheba (film)
Working with: The Great Caruso
Working with: A Streetcar Named Desire (1951 film)
Working with: The African Queen (film)
Working with: That's My Boy (1951 film)
Working with: An American in Paris (film)
Working with: A Place in the Sun (1951 film)
Working with: At War with the Army
Collecting data for 1952 year...
Working with: The Greatest Show on Earth (film)
Working with: This Is Cinerama
Working with: Th

In [8]:
# Tabulate the scraped records: one row per film, one column per infobox key.
all_movie_info_copy = pd.DataFrame(data=all_movie_info)


In [12]:
# Persist the scraped table. to_csv does not create missing folders, so make
# sure ./data exists first -- failing here would throw away the whole crawl.
output_path = Path('./data/All_Movies_Info')
output_path.parent.mkdir(parents=True, exist_ok=True)
all_movie_info_copy.to_csv(output_path, index=False)