In [1]:
import requests
from bs4 import BeautifulSoup
from warnings import warn
from time import sleep
from random import randint
import numpy as np, pandas as pd
import seaborn as sns

In [2]:
pages = np.arange(1, 2801, 50) 
headers = {'Accept-Language': 'en-US,en;q=0.8'} # If this is not specified, the default language is Mandarin

#initialize empty lists to store the variables scraped
titles = []
years = []
ratings = []
genres = []
runtimes = []
imdb_ratings = []
votes = []
directors= []
stars= []

for page in pages:
  
   #get request for movie
    response = requests.get("https://www.imdb.com/search/title/?title_type=feature,tv_movie,tv_series,tv_miniseries,documentary,short&release_date=2000-01-01,2022-12-31&countries=id&languages=id&"
                  + "start="
                  + str(page)
                  + "&ref_=adv_nxt", headers=headers)
    
    sleep(randint(8,15))
   
   #throw warning for status codes that are not 200
    if response.status_code != 200:
       warn('Request: {}; Status code: {}'.format(requests, response.status_code))

   #parse the content of current iteration of request
    page_html = BeautifulSoup(response.text, 'html.parser')
    
    movie_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
    
    #extract the 50 movies for that page
    for container in movie_containers:

       #conditional for all with metascore
       if container.find('div', class_ = 'ratings-metascore') is not None:
            #title
            title = container.h3.a.text
            titles.append(title)
            
            if container.h3.find('span', class_= 'lister-item-year text-muted unbold') is not None:
                #year released
                year = container.h3.find('span', class_= 'lister-item-year text-muted unbold').text # remove the parentheses around the year and make it an integer
                years.append(year)
                else:
                    years.append(None) # each of the additional if clauses are to handle type None data, replacing it with an empty string so the arrays are of the same length at the end of the scraping

            if container.p.find('span', class_ = 'certificate') is not None:
            
                #rating
                rating = container.p.find('span', class_= 'certificate').text
                ratings.append(rating)

            else:
                ratings.append("")

            if container.p.find('span', class_ = 'genre') is not None:
            
                #genre
                genre = container.p.find('span', class_ = 'genre').text.replace("\n", "").rstrip().split(',') # remove the whitespace character, strip, and split to create an array of genres
                genres.append(genre)
          
            else:
                genres.append("")

            if container.p.find('span', class_ = 'runtime') is not None:

                #runtime
                time = int(container.p.find('span', class_ = 'runtime').text.replace(" min", "")) # remove the minute word from the runtime and make it an integer
                runtimes.append(time)

            else:
                runtimes.append(None)

            if float(container.strong.text) is not None:

                #IMDB ratings
                imdb = float(container.strong.text) # non-standardized variable
                imdb_ratings.append(imdb)

            else:
                imdb_ratings.append(None)

            if container.find('span', class_ = 'metascore').text is not None:

                #Metascore
                m_score = int(container.find('span', class_ = 'metascore').text) # make it an integer
                metascores.append(m_score)

            else:
                metascores.append(None)

            if container.find('span', attrs = {'name':'nv'})['data-value'] is not None:

                #Number of votes
                vote = int(container.find('span', attrs = {'name':'nv'})['data-value'])
                votes.append(vote)

            else:
                votes.append(None)

            else:
                votes.append(None)

In [3]:
print(len(titles), len(years), len(ratings), len(genres), len(runtimes), len(imdb_ratings), len(votes), len(directors), len(stars))

2777 2777 2777 2777 2777 2777 2777 2777 2777


In [11]:
movie_df = pd.DataFrame({'Title': titles,
                         'Year': years,
                         'Rated': ratings,
                         'Genre': genres,
                         'Runtime': runtimes,
                         'Rating': imdb_ratings,
                         'Votes': votes,
                         'Director': directors,
                         'Star': stars}
                      )
# movie_df = pd.DataFrame.from_dict(a, orient='index')
# movie_df = df.transpose()
movie_df.loc[:, 'Year'] = movie_df['Year'].str[-5:-1]
final_df = movie_df.loc[movie_df['Year'] != 'ovie'] # One small issue with the scrape on these two movies so just dropping those ones.
# Dropping
# final_df = movie_df[movie_df['Year'].str.contains("19- ") == False]

# final_df.loc[:, 'Year'] = pd.to_numeric(final_df['Year'])
final_df['Year']= pd.to_numeric(final_df.Year, errors='coerce').fillna(2000).astype(int)
final_df = final_df.replace(r'\n',' ', regex=True) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
result = final_df.dtypes
print (result)

Title        object
Year          int32
Rated        object
Genre        object
Runtime      object
Rating      float64
Votes        object
Director     object
Star         object
dtype: object


In [16]:
final_df.sample(20)

Unnamed: 0,Title,Year,Rated,Genre,Runtime,Rating,Votes,Director,Star
317,Sunshine Becomes You,2015,-,"Comedy, Drama, Romance",126 min,6.3,95.0,Rocky Soraya,"[Herjunot Ali, Sam Brodie, Nabilah Ratna Ayu A..."
95,Juara,2016,13+,Action,106 min,8.2,31.0,Charles Gozali,"[Bisma Karisma, Ciccio Manassero, Cecep Arif R..."
1199,Roh,2007,-,Horror,,5.0,15.0,Atok Suharto,"[Angel Bella, Ryan Delon, Zaenal Abidin Domba,..."
1368,Di bawah langit,2010,-,Drama,90 min,,,Opick,"[Inneke Koesherawati, Agus Kuncoro, Dolly Mart..."
166,Antares,2022,Approved,"Action, Drama, Romance",30 min,6.6,2381.0,Angga Yunanda,"[Beby Tsabina, Irzan Faiq, Maudy Effrosina]"
1397,7 Hari 24 Jam,2014,-,"Drama, Romance",97 min,6.9,99.0,Fajar Nugros,"[Lukman Sardi, Dian Sastrowardoyo, Henky Solai..."
303,Malapataka,2020,-,"Short, Horror",,7.7,150.0,Rizal Mantovani,"[Sonia Alyssa, Masayu Anastasia, Tissa Biani A..."
89,Pretty Boys,2019,13+,"Comedy, Drama",100 min,7.1,493.0,Tompi,"[Vincent Ryan Rompies, Deddy Mahendra Desta, D..."
2725,Deadlock,2015,-,"Short, Action",2 min,,,Jossen Eliata Thenario,"[Lesley Kurniawan, Denilson Susanto]"
1583,MLD Spot: Stage Bus Jazz Tour,2022,-,"Comedy, Drama",,,,Ali Seggaf,"[Windy Apsari, Khiva Iskak, Ruth Marini]"


In [31]:
final_df.isnull().sum()

Title          0
Year           0
Rated          0
Genre         30
Runtime      638
Rating      1159
Votes       1159
Director      22
Star           0
dtype: int64

In [42]:
final_df.sample(20)

Unnamed: 0,Title,Year,Rated,Genre,Runtime,Rating,Votes,Director,Star
1758,Punchline! : Surat untuk Pak Raam,2012,-,"Short, Comedy",,,,Adhyatmika,"[Rangga Djoned, Muhammad Fahri, Audevian Monda..."
1708,Nazar,2009,-,Comedy,,2.8,7.0,Sofyan D. Surza,"[Jessica Iskandar, Ben Joshua, Renata Kusmanto..."
1605,Hantu Cantik Kok Ngompol,2016,-,Horror,,,,Emil G. Hampp,"[Sarah Azhari, Nana Mirdad]"
31,Stealing Raden Saleh,2022,TV-14,"Action, Drama",154 min,8.1,958.0,Angga Dwimas Sasongko,"[Iqbaal Dhiafakhri Ramadhan, Angga Yunanda, Ra..."
2185,Joko & Putra,2022,-,"Documentary, Short",15 min,,,Jeissy Trompiz,[]
1857,Love like the Falling Rain,2020,TV-14,"Adventure, Drama, Fantasy",86 min,4.7,206.0,Lasja Fauzia,"[Jefri Nichol, Aurora Ribero, Axel Matthew Tho..."
1445,Anda Puas Saya Loyo,2008,-,Comedy,82 min,2.9,8.0,K.K. Dheeraj,"[Komeng, Ruben Onsu, Bedu, Ryan Syehan]"
1387,19 Letters,2000,-,"Drama, Music",,8.0,8.0,Diandra Agatha,"[Rifqi Sukmoutomo, Syifa Salsabila, Vidan Mart..."
1547,Monographs,2020,-,,152 min,,,Kush Badhwar,"[Saodat Ismailova, Maja Korbecka, Raya Martin,..."
2616,Shoes,2018,-,"Short, Drama",7 min,,,Richard James Halstead,"[Irfan Thamrin, Puja Astawa, Devy Gita, Indra ..."


In [45]:
final_df['Stars'] = [', '.join(map(str, l)) for l in final_df['Star']]

In [46]:
final_df.sample(10)

Unnamed: 0,Title,Year,Rated,Genre,Runtime,Rating,Votes,Director,Star,Stars
1219,Once Upon a Time in Indonesia,2020,17+,"Action, Crime, Drama",159 min,2.4,27.0,Asun Mawardi,"[Franki Darmawan, Djaitov Tigor, Ryana Dea, Ma...","Franki Darmawan, Djaitov Tigor, Ryana Dea, Mar..."
982,Pengantin Pantai Biru,2010,D,Horror,74 min,2.6,30.0,Nayato Fio Nuala,"[Cathrine Wilson, Keith Foo, Uli Auliani, Cynt...","Cathrine Wilson, Keith Foo, Uli Auliani, Cynth..."
1041,Denting Kematian,2020,TV-MA,"Horror, Mystery, Thriller",86 min,3.9,47.0,Rudy Soedjarwo,"[Brisia Jodie, Rangga Azof, Ayu Dyah Pasha, Sa...","Brisia Jodie, Rangga Azof, Ayu Dyah Pasha, San..."
2772,Joni Sok Jagoan,2016,-,"Short, Comedy",6 min,,,Mustafa Mustafa,[],
2154,Minor,2019,-,"Documentary, Short",39 min,,,Vena Besta Klaudina,"[Takziyatun Nufus, Vena Besta Klaudina]","Takziyatun Nufus, Vena Besta Klaudina"
2153,Ojike,2020,-,"Short, Horror",5 min,,,Yudhistira Bayu,"[Rangga Eka Nanda, Andhika Dwi Prakoso]","Rangga Eka Nanda, Andhika Dwi Prakoso"
1630,Night and Day,2021,-,"Short, Drama, Thriller",,,,Patrick Joshua,"[Andri Mashadi, Dimaz Andrean, Widiyashara S.,...","Andri Mashadi, Dimaz Andrean, Widiyashara S., ..."
2348,Ariel & Raja Langit,2005,-,"Adventure, Drama",,,,Harry Suharyadi,"[Sulton Max, Ariel Tatum, Indy Barens, Sissy P...","Sulton Max, Ariel Tatum, Indy Barens, Sissy Pr..."
2655,Bandung 25-03,2017,-,"Short, Crime, Mystery",1 min,,,Abdalah Gifar,[],
1242,Star Stealer,2020,-,"Comedy, Crime, Drama",30 min,8.2,6.0,Syafira Haddad,"[Roy Sungkono, Clairine Clay, Shenina Cinnamon]","Roy Sungkono, Clairine Clay, Shenina Cinnamon"


In [49]:
final_df_new = final_df.drop(['Star'], axis=1)

final_df_new.sample(10)

Unnamed: 0,Title,Year,Rated,Genre,Runtime,Rating,Votes,Director,Stars
1283,The Maling Kuburans,2009,-,Comedy,90 min,7.2,12.0,Dwi Ilalang,"Indra Birowo, Donita, Heri Savalas, Syahrini"
1321,Kembang perawan,2009,-,Comedy,90 min,,,Joko Nugroho,"Dimas Anggara, Kris Anjar, Ryana Dea, Adly Fayruz"
176,Patience is the test,2020,TV-14,"Adventure, Comedy, Drama",126 min,7.4,384.0,Anggy Umbara,"Vino G. Bastian, Luna Maya, Ananda Omesh, Este..."
469,The Man from the Sea,2018,13+,"Drama, Mystery",107 min,5.8,212.0,Kôji Fukada,"Dean Fujioka, Mayu Tsuruta, Taiga Nakano, Junk..."
32,Message Man,2018,21+,"Action, Crime, Thriller",91 min,5.9,5602.0,Corey Pearson,"Paul O'Brien, Aji Santosa, Verdi Solaiman, Mar..."
2648,Iman,2014,-,"Short, Comedy",4 min,,,Nurul Ibrahim,Jean Marais
953,Rumah Bekas Kuburan,2012,-,Horror,79 min,3.2,13.0,Irwan Siregar,"Fifi Buntaran, Julia Perez, Vikri Rahmat, Diah..."
471,What Lies Within,2018,TV-14,"Drama, Romance",108 min,7.2,102.0,Rudi Aryanto,"Dimas Anggara, Amanda Rawles, Maxime Bouttier,..."
1565,Rumah Malaikat,2016,-,Horror,,5.6,18.0,Billy Christian,"Mentari De Marelle, Agung Saga, Rowiena Umboh,..."
967,Angkerbatu,2007,-,Horror,,5.0,34.0,Jose Poernomo,"Mieke Amalia, Susilo Badar, Yama Carlos, Imelda"


In [65]:
stars_df = final_df_new.copy(deep=True)

def text_split(string, sep, index=None):
    '''
    Splits a string and returns a list.
    If an index value is specified, then returns that element.
    If the specified index is out of range, then returns "N/A".
    
    Keyword arguments:
    string -- The string to be splitted.
    sep -- The separator.
    index -- The 
    '''
    try:
        return string.split(sep)[index] if index or index == 0 else string.split(sep)
    except:
        return 'N/A'

    
splitcount = max(stars_df['Stars'].apply(lambda x: len(text_split(string=x, sep=', ')))) #Returns the maximum length of a movie genres' list to be used to define the amount of individual Genre columns to be created. 
for i in range(splitcount):
    stars_df[f'Star {i + 1}'] = stars_df['Stars'].apply(lambda x: text_split(string=x, sep=', ', index=i))

sorted_by_star_df = stars_df.sort_values(by=['Star 1', 'Star 2', 'Star 3', 'Star 4']) #Sorting the DataFrame by genre.
sorted_by_star_df

# remove star nan from star 5 to 42

df_star_new = sorted_by_star_df.drop(sorted_by_star_df.loc[:, 'Star 5':'Star 42'].columns, axis=1)
df_star_new = df_star_new.drop(['Stars'], axis=1)

df_star_new.head(10)


#sorted_by_star_df[['Stars','Star 1', 'Star 2', 'Star 3', 'Star 4']].sample(20)

Unnamed: 0,Title,Year,Rated,Genre,Runtime,Rating,Votes,Director,Star 1,Star 2,Star 3,Star 4
103,Kisah Tanah Jawa: Merapi,2000,18+,Horror,,7.8,37.0,,,,,
409,Negeri Dongeng,2017,-,Documentary,98 min,8.9,8.0,Anggi Frisca,,,,
502,Workingman's Death,2005,Not Rated,Documentary,122 min,7.9,1757.0,Michael Glawogger,,,,
585,Magic Hour,2019,-,Drama,,8.7,6.0,,,,,
593,Jakarta Love Story,2015,-,Romance,50 min,,,,,,,
735,Homebound,2022,-,"Documentary, Short",10 min,,,Ismail Fahmi Lubish,,,,
748,Cherrybelle: Chibi Chibi Burger,2000,-,Adventure,,,,,,,,
900,RONG (Hole),2018,-,"Short, Drama, Family",35 min,,,Sri Nugroho,,,,
909,Srimulat: Hil yang Mustahal - Babak Kedua,2022,-,"Biography, Comedy, Drama",,,,Fajar Nugros,,,,
933,Before You Eat,2022,18+,Documentary,96 min,,,Kasan Kurdi,,,,


In [64]:
genres_df = final_df_new.copy(deep=True)

def text_split(string, sep, index=None):
    '''
    Splits a string and returns a list.
    If an index value is specified, then returns that element.
    If the specified index is out of range, then returns "N/A".
    
    Keyword arguments:
    string -- The string to be splitted.
    sep -- The separator.
    index -- The 
    '''
    try:
        return string.split(sep)[index] if index or index == 0 else string.split(sep)
    except:
        return 'N/A'

    
splitcount = max(genres_df['Genre'].apply(lambda x: len(text_split(string=x, sep=', ')))) #Returns the maximum length of a movie genres' list to be used to define the amount of individual Genre columns to be created. 
for i in range(splitcount):
    genres_df[f'Genre {i + 1}'] = genres_df['Genre'].apply(lambda x: text_split(string=x, sep=', ', index=i))

sorted_by_genre_df = genres_df.sort_values(by=['Genre 1', 'Genre 2', 'Genre 3']) #Sorting the DataFrame by genre.
sorted_by_genre_df

Unnamed: 0,Title,Year,Rated,Genre,Runtime,Rating,Votes,Director,Stars,Genre 1,Genre 2,Genre 3
71,212 Warrior,2018,TV-14,"Action, Adventure, Comedy",123 min,6.9,1533,Angga Dwimas Sasongko,"Vino G. Bastian, Yayan Ruhian, Fariz Alfarazi,...",Action,Adventure,Comedy
142,Comic 8: Casino Kings Part 1,2015,13+,"Action, Adventure, Comedy",104 min,6.1,397,Anggy Umbara,"Hannah Al Rashid, Donny Alamsyah, Dhea Ananda,...",Action,Adventure,Comedy
247,Ashiap Man,2022,TV-14,"Action, Adventure, Comedy",102 min,,,Atta Halilintar,"Herdanius Larobu, Atta Halilintar, Aurel Herma...",Action,Adventure,Comedy
325,Rafathar,2017,13+,"Action, Adventure, Comedy",91 min,1.4,961,Bounty Umbara,"Rafathar Malik Ahmad, Raffi Ahmad, Babe Cabita...",Action,Adventure,Comedy
611,Comic 8: Casino Kings Part 2,2016,13+,"Action, Adventure, Comedy",95 min,5.8,372,Anggy Umbara,"Hannah Al Rashid, Donny Alamsyah, Ence Bagus, ...",Action,Adventure,Comedy
...,...,...,...,...,...,...,...,...,...,...,...,...
2297,Tato,2003,-,,120 min,,,Hanny Saputra,,,,
2300,Viva Indonesia,2001,-,,90 min,7.2,10,Ravi L. Bharwani,"Aryo Danusiri, Asep Kusdinar, Lianto Luseno, N...",,,
2301,D'Girlz Begins,2006,-,,,,,Tengku Firmansyah,"Andhika, Meriam Bellina, Dhena, Disa",,,
2415,Oi! Jaga Lambe,2019,-,,,,,,,,,


In [62]:
#save dataframe to excel
sorted_by_genre_df.to_excel('D:\Documents\Portofolio\linkedinJobs\imdb indonesia genre.xlsx', index = False)
sorted_by_genre_df.to_csv('D:\Documents\Portofolio\linkedinJobs\imdb indonesia genre.csv', index = False)
print("data successfully stored ") 

data successfully stored 
