### Web scrapping for [Sayarh Website](https://syarah.com)

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import requests


## Page Title

In [3]:
url = "https://www.imdb.com/list/ls057823854/"  

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    page_title = soup.title.string.strip()  
    print("Page Title:", page_title)
else:
    print(f"Request failed with status code: {response.status_code}")

Page Title: All U.S. Released Movies: 1972-2016


## loop


In [4]:
url = 'https://www.imdb.com/list/ls057823854/'
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")

#### Try selenium

In [7]:
driver = webdriver.Chrome()
# request the page 
driver.get(url)

        # extract the body and parse it
soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()

In [9]:


# Initialize WebDriver
driver = webdriver.Chrome()
base_url = 'https://www.imdb.com/list/ls057823854/?page='  # Pagination URL
columns_ = ['Title', 'Year', 'Duration', 'Age Category', 'IMDb Rating', 'Votes', 'Metascore']
df2 = pd.DataFrame(columns=columns_)

# Loop through pages (adjust range if needed)
for i in range(1,41):  # Fetch pages 1 to 2 (Increase if needed)
    url = base_url + str(i)  # Modify URL for pagination
    driver.get(url)  # Load the new page

    # Wait for content to load
    WebDriverWait(driver, 13).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "h3.ipc-title__text"))
    )

    # Extract the body and parse it
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Extract movie titles
    titles = soup.select("h3.ipc-title__text")

    # Extract metadata block
    metadata_blocks = soup.select("div[class*='dli-title-metadata']")

    # Extract IMDb ratings and vote counts
    rating_blocks = soup.select("span.ipc-rating-star--rating")
    vote_blocks = soup.select("span.ipc-rating-star--voteCount")

    # Extract Metascore
    metascore_blocks = soup.select("span[class*='metacritic-score-box']")  # Adjust this if necessary

    clean_data = []

    for title, metadata, rating, vote, metascore in zip(
        titles, metadata_blocks, rating_blocks, vote_blocks, metascore_blocks
    ):
        year, duration, category = "Unknown", "Unknown", "Unknown"
        imdb_rating, votes, metascore_value = "N/A", "N/A", "N/A"

        # Extract metadata values
        metadata_items = metadata.select("span[class*='dli-title-metadata-item']")
        if len(metadata_items) > 0:
            year = metadata_items[0].text.strip()
        if len(metadata_items) > 1:
            duration = metadata_items[1].text.strip()
        if len(metadata_items) > 2:
            category = metadata_items[2].text.strip()

        # Extract IMDb rating and votes
        imdb_rating = rating.text.strip() if rating else "N/A"
        votes = vote.text.strip() if vote else "N/A"

        # Extract Metascore
        metascore_value = metascore.text.strip() if metascore else "N/A"

        clean_data.append([title.text.strip(), year, duration, category, imdb_rating, votes, metascore_value])

    new_rows_df2 = pd.DataFrame(clean_data, columns=columns_)
    df2 = pd.concat([df2, new_rows_df2], ignore_index=True)

# Close the WebDriver
driver.quit()



In [11]:
df2

Unnamed: 0,Title,Year,Duration,Age Category,IMDb Rating,Votes,Metascore
0,1. Pulp Fiction,1994,2h 34m,R,8.9,(2.3M),95
1,2. The Amazing Spider-Man 2,2014,2h 22m,PG-13,6.6,(559K),53
2,3. The Shawshank Redemption,1994,2h 22m,R,9.3,(3M),82
3,4. Star Wars: Episode IV - A New Hope,1977,2h 1m,PG,8.6,(1.5M),90
4,5. Back to the Future,1985,1h 56m,PG,8.5,(1.4M),87
...,...,...,...,...,...,...,...
6349,9810. Pee-wee's Big Holiday,2015,1h 53m,R,5.9,(151K),41
6350,9811. London Fields,2014,2h 5m,R,6.4,(11K),23
6351,9812. Untitled Marion Barry Biopic,2015,2h 1m,R,5.3,(153K),38
6352,9813. One Last Thing Before I Go,2014,2h 5m,PG,7.1,(129K),77


In [13]:
df2.duplicated().sum()

0

In [63]:
df.to_csv("imdb_movie_list2.csv", index=False)

In [17]:
df2.to_csv("imdb_movie_list3.csv", index=False)

In [10]:
movie = pd.read_csv(r"C:\Users\rahaf\Camp1\Movies\imdb_movie_list2.csv")

In [12]:
# Assuming 'movie' is your DataFrame
movie["Title"] = movie["Title"].str.replace(r"^\d+\.\s*", "", regex=True)

# Display the first few rows to verify
print(movie.head())


                                           Title  Year Duration Age Category  \
0                                  The Last Song  2010   1h 47m           PG   
1  Legend of the Guardians: The Owls of Ga'Hoole  2010   1h 37m           PG   
2                                     The Losers  2010   1h 37m        PG-13   
3                                      MacGruber  2010   1h 30m            R   
4                                       Repo Men  2010   1h 51m            R   

   IMDb Rating   Votes  Metascore  
0          6.0   (94K)         33  
1          6.9   (89K)         53  
2          6.2  (109K)         44  
3          5.6   (48K)         43  
4          6.3  (112K)         32  


In [24]:
movie.head(30)

Unnamed: 0,Title,Year,Duration,Age Category,IMDb Rating,Votes,Metascore
0,The Last Song,2010,1h 47m,PG,6.0,(94K),33
1,Legend of the Guardians: The Owls of Ga'Hoole,2010,1h 37m,PG,6.9,(89K),53
2,The Losers,2010,1h 37m,PG-13,6.2,(109K),44
3,MacGruber,2010,1h 30m,R,5.6,(48K),43
4,Repo Men,2010,1h 51m,R,6.3,(112K),32
5,Skyline,2010,1h 32m,PG-13,4.5,(93K),26
6,Takers,2010,1h 47m,PG-13,6.2,(67K),45
7,When in Rome,2010,1h 31m,PG-13,5.5,(66K),25
8,Youth in Revolt,2009,1h 30m,R,6.4,(77K),63
9,The Debt,2010,1h 53m,R,6.8,(73K),65


In [42]:
movie["Year"].unique()

array(['2010', '2009', '2011', '2012', '2013', '2006', '2007', '1999',
       '2002', '2014', '1974', '1973', '1976', '1977', '1979', '1982',
       '1983', '1985', '1987', '1989', '1990', '1991', '1993', '1994',
       '1995', '1996', '1997', '1998', '2000', '2001', '2003', '2004',
       '2005', '2008', '1981', '1975', '1978', '1984', '1986', '1988',
       '1992', '1980', '1972', '2024', '2016', '2015', '1971', '2017',
       '1970', '1967–2003', '2018', '2022', '2023', '2019', '2021'],
      dtype=object)

In [46]:
ind=movie[movie["Year"]=='1967–2003'].index
movie.drop(ind, inplace=True)

In [48]:
movie.shape

(5316, 7)

In [50]:
movie["Year"] = movie["Year"].astype(int)

In [52]:
movie["Votes"].unique()

array(['(94K)', '(89K)', '(109K)', '(48K)', '(112K)', '(93K)', '(67K)',
       '(66K)', '(77K)', '(73K)', '(58K)', '(76K)', '(70K)', '(49K)',
       '(7.8K)', '(110K)', '(46K)', '(41K)', '(53K)', '(879K)', '(17K)',
       '(9.7K)', '(5.4K)', '(21K)', '(12K)', '(9.4K)', '(16K)', '(26K)',
       '(29K)', '(69K)', '(35K)', '(32K)', '(65K)', '(50K)', '(45K)',
       '(101K)', '(96K)', '(195K)', '(80K)', '(63K)', '(92K)', '(9.3K)',
       '(34K)', '(24K)', '(20K)', '(103K)', '(411K)', '(168K)', '(175K)',
       '(144K)', '(177K)', '(224K)', '(283K)', '(115K)', '(167K)',
       '(287K)', '(40K)', '(104K)', '(100K)', '(189K)', '(234K)', '(51K)',
       '(169K)', '(173K)', '(106K)', '(157K)', '(161K)', '(225K)',
       '(201K)', '(265K)', '(111K)', '(152K)', '(202K)', '(131K)',
       '(78K)', '(208K)', '(217K)', '(164K)', '(238K)', '(184K)',
       '(117K)', '(128K)', '(361K)', '(185K)', '(194K)', '(61K)',
       '(248K)', '(187K)', '(148K)', '(140K)', '(159K)', '(97K)',
       '(146K)', '(13

In [54]:
def convert_votes(vote):
    vote = vote.strip('()')  # Remove parentheses
    if vote.startswith('-'):  # If the value starts with a minus sign, return 0
        return 0
    elif 'K' in vote:  # If the value contains 'K', multiply by 1000
        return int(float(vote.replace('K', '').strip()) * 1000)
    elif 'M' in vote:  # If the value contains 'M', multiply by 1,000,000
        return int(float(vote.replace('M', '').strip()) * 1000000)
    else:
        return int(vote)  # If no 'K' or 'M', return the number as is

# Apply the function to the 'Votes' column
movie['Votes'] = movie['Votes'].apply(convert_votes)

In [56]:
movie.dtypes

Title            object
Year              int32
Duration         object
Age Category     object
IMDb Rating     float64
Votes             int64
Metascore         int64
dtype: object

In [58]:
movie.head(30)

Unnamed: 0,Title,Year,Duration,Age Category,IMDb Rating,Votes,Metascore
0,The Last Song,2010,1h 47m,PG,6.0,94000,33
1,Legend of the Guardians: The Owls of Ga'Hoole,2010,1h 37m,PG,6.9,89000,53
2,The Losers,2010,1h 37m,PG-13,6.2,109000,44
3,MacGruber,2010,1h 30m,R,5.6,48000,43
4,Repo Men,2010,1h 51m,R,6.3,112000,32
5,Skyline,2010,1h 32m,PG-13,4.5,93000,26
6,Takers,2010,1h 47m,PG-13,6.2,67000,45
7,When in Rome,2010,1h 31m,PG-13,5.5,66000,25
8,Youth in Revolt,2009,1h 30m,R,6.4,77000,63
9,The Debt,2010,1h 53m,R,6.8,73000,65
