In [30]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

### Fetching the data from IMDB website

In [32]:
# Define User-Agent header to mimic a web browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Initialize lists to store scraped data
Rank = []
Movie_Name = []
IMDB_Rating = []

# Try to fetch data from the IMDb website
try:
    # Send a GET request to the IMDb website with specified headers
    source = requests.get("https://m.imdb.com/india/top-rated-indian-movies/", headers=headers)
    source.raise_for_status()  # Raise an exception for bad responses

    # Parse the HTML content of the webpage
    soup = BeautifulSoup(source.text, 'html.parser')

    # Find the section containing the list of top-rated Indian movies
    movies = soup.find('ul', class_="ipc-metadata-list ipc-metadata-list--dividers-all ipc-metadata-list--compact sc-3c7febe6-1 hPvgeC ipc-metadata-list--base")

    # Find all individual movie entries within the list
    movies = movies.find_all('li', class_="ipc-metadata-list__item ipc-metadata-list__item--inline ipc-metadata-list-item--link")

    # Iterate through each movie entry
    for movie in movies:
        # Extract the rank of the movie
        rank = movie.find('span', class_="sc-9910edf6-5 dsVoCd").text

        # Extract the combined data containing both rank and movie name
        data = movie.find('span', class_="sc-9910edf6-4 eyrnkD").text

        # Remove the rank from the combined data to get the movie name
        if rank in data:
            name = data.replace(rank, '', 1)

        # Extract the IMDb rating of the movie
        rating = movie.find('span', class_="ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating").text

        # Append the extracted data to the respective lists
        Rank.append(rank)
        Movie_Name.append(name)
        IMDB_Rating.append(rating)

# Catch any exceptions that might occur during the execution of the try block
except Exception as e:
    print(e)


### Transforming IMDb data into a DataFrame structure

In [33]:
Top_movies={
            "Rank":Rank,
            "Movie_Name":Movie_Name,
            "IMDB_Rating":IMDB_Rating
           }

In [34]:
Top_Rated_Indian_movies=pd.DataFrame(Top_movies)

### Data Transformation

In [36]:
Top_Rated_Indian_movies.dtypes

Rank           object
Movie_Name     object
IMDB_Rating    object
dtype: object

In [37]:
Top_Rated_Indian_movies['Rank']=pd.to_numeric(Top_Rated_Indian_movies['Rank'])
Top_Rated_Indian_movies['IMDB_Rating']=pd.to_numeric(Top_Rated_Indian_movies['IMDB_Rating'])

In [38]:
Top_Rated_Indian_movies.dtypes

Rank             int64
Movie_Name      object
IMDB_Rating    float64
dtype: object

### Overview of Dataset

In [43]:
Top_Rated_Indian_movies.info

<bound method DataFrame.info of      Rank                           Movie_Name  IMDB_Rating
0       1                            12th Fail          9.2
1       2  Ramayana: The Legend of Prince Rama          9.2
2       3                              Nayakan          8.7
3       4                             Gol Maal          8.5
4       5           Rocketry: The Nambi Effect          8.7
..    ...                                  ...          ...
245   246                     Stanley Ka Dabba          7.8
246   247                           Happy Days          7.9
247   248                           Sonchiriya          7.9
248   249                 Nayak: The Real Hero          7.8
249   250             Hazaaron Khwaishein Aisi          7.9

[250 rows x 3 columns]>

In [39]:
Top_Rated_Indian_movies.head(10)

Unnamed: 0,Rank,Movie_Name,IMDB_Rating
0,1,12th Fail,9.2
1,2,Ramayana: The Legend of Prince Rama,9.2
2,3,Nayakan,8.7
3,4,Gol Maal,8.5
4,5,Rocketry: The Nambi Effect,8.7
5,6,Anbe Sivam,8.6
6,7,777 Charlie,8.8
7,8,Pariyerum Perumal,8.7
8,9,Manichitrathazhu,8.8
9,10,#Home,8.8


In [40]:
Top_Rated_Indian_movies.tail(10)

Unnamed: 0,Rank,Movie_Name,IMDB_Rating
240,241,RangiTaranga,8.2
241,242,Poove Unakkaga,8.6
242,243,Rise Roar Revolt,7.8
243,244,Angamaly Diaries,7.9
244,245,Vicky Donor,7.8
245,246,Stanley Ka Dabba,7.8
246,247,Happy Days,7.9
247,248,Sonchiriya,7.9
248,249,Nayak: The Real Hero,7.8
249,250,Hazaaron Khwaishein Aisi,7.9


In [51]:
Top_Rated_Indian_movies["IMDB_Rating"].describe().round(2)

count    250.00
mean       8.20
std        0.26
min        7.80
25%        8.00
50%        8.20
75%        8.30
max        9.20
Name: IMDB_Rating, dtype: float64