# Web scraping Bollywood songs from 1947 to 2018

We'll be scraping https://www.lyricsia.com/

## Phase 1

In [1]:
import requests
import bs4 as bs
import requests
import csv
import time

In [2]:
# Opening the csv file in write mode (this overwrites any earlier run) and adding a header.
# newline='' hands line-ending control to the csv module; without it an extra '\r'
# is written on platforms that use '\r\n' line endings.
# The explicit encoding keeps the file independent of the platform default.
with open(r'lyricsia_phase1.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['SNo', 'Songpage_URL', 'Movie'])

In [3]:
# Site root; it is prepended to the song links scraped from the listing tables
root_url = 'https://www.lyricsia.com'
# Prefix of the per-year listing pages; the year is appended to it
year_page_url = f'{root_url}/hindi-songs/'

In [4]:
%%time
# Number of song rows saved to the csv so far; used to assign the SNo column
rowcount = 0

# The category pages are ordered and range from 1947 to 2018
for year in range(1947, 2019):

    url = year_page_url + str(year)

    try:
        # Fetching the page. The timeout stops a stalled connection from hanging
        # the whole run, and raise_for_status() turns an HTTP error page into an
        # exception instead of letting it be parsed as a year without songs.
        source = requests.get(url, timeout=30)
        source.raise_for_status()

        # Parsing the html
        webpage = bs.BeautifulSoup(source.content, features='html.parser')

        listofsongs = []
        # The first row contains the header, so it is skipped
        for row in webpage.find_all('tr')[1:]:
            cells = row.select('td')

            # Skip a row without a link/movie cell instead of losing the whole year
            if len(cells) < 3 or cells[1].a is None or not cells[1].a.get('href'):
                continue

            # Only the part of the href before the first ';' is kept
            songurl = root_url + cells[1].a['href'].split(';')[0]
            moviename = cells[2].text

            # Serial numbers continue from the last row that was actually saved
            listofsongs.append([rowcount + len(listofsongs) + 1, songurl, moviename])

        # Opening the file in append mode and adding the data for the entire page i.e. each year
        # (newline='' and the explicit encoding keep the csv output platform independent)
        with open(r'lyricsia_phase1.csv', 'a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerows(listofsongs)

        # Advance the counter only after the rows are written, so a failed year
        # does not leave a gap in the SNo column
        rowcount += len(listofsongs)

        # Printing the progress
        print(f'Data Scraped for year: {year}')
    except Exception as e:
        # Best effort: report which year failed and continue with the next one
        print(f'Failed to scrape year {year} ({url}): {e!r}')

Data Scraped for year: 1947
Data Scraped for year: 1948
Data Scraped for year: 1949
Data Scraped for year: 1950
Data Scraped for year: 1951
Data Scraped for year: 1952
Data Scraped for year: 1953
Data Scraped for year: 1954
Data Scraped for year: 1955
Data Scraped for year: 1956
Data Scraped for year: 1957
Data Scraped for year: 1958
Data Scraped for year: 1959
Data Scraped for year: 1960
Data Scraped for year: 1961
Data Scraped for year: 1962
Data Scraped for year: 1963
Data Scraped for year: 1964
Data Scraped for year: 1965
Data Scraped for year: 1966
Data Scraped for year: 1967
Data Scraped for year: 1968
Data Scraped for year: 1969
Data Scraped for year: 1970
Data Scraped for year: 1971
Data Scraped for year: 1972
Data Scraped for year: 1973
Data Scraped for year: 1974
Data Scraped for year: 1975
Data Scraped for year: 1976
Data Scraped for year: 1977
Data Scraped for year: 1978
Data Scraped for year: 1979
Data Scraped for year: 1980
Data Scraped for year: 1981
Data Scraped for yea

## Phase 2

In [5]:
import requests
import bs4 as bs
import requests
import csv
import time
import pandas as pd

In [6]:
# Opening the data scraped in phase 1
# (the file was written with the columns SNo, Songpage_URL and Movie, one row per song page)
df = pd.read_csv('lyricsia_phase1.csv')

In [7]:
# Preview the first rows to check that the phase 1 data loaded as expected
df.head()

Unnamed: 0,SNo,Songpage_URL,Movie
0,1,https://www.lyricsia.com/lyricid/5902/yahaan-b...,Jugnu
1,2,https://www.lyricsia.com/lyricid/5098/chand-si...,Elan
2,3,https://www.lyricsia.com/lyricid/5050/wo-apani...,Jugnu
3,4,https://www.lyricsia.com/lyricid/5048/yahan-ba...,Jugnu
4,5,https://www.lyricsia.com/lyricid/5082/khayegi-...,Mirza Sahiban


In [8]:
# Header row of the phase 2 csv, one entry per column
tableheaders = [
    'Song',
    'Lyrics',
    'Movie',
    'Year',
    'Singer',
    'Writer',
    'Composer',
]

In [9]:
# Creating a new file by opening it in write mode (this overwrites any earlier run).
# newline='' hands line-ending control to the csv module, which is required for
# fields with embedded newlines to round-trip correctly and avoids an extra '\r'
# on platforms that use '\r\n' line endings.
# The explicit encoding keeps the file independent of the platform default.
with open(r'lyricsia_phase2.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(tableheaders)

In [None]:
%%time

def _meta_value(paragraphs, position):
    """Return the text after the first ':' in the paragraph at ``position``.

    Only the first colon is used as the separator, so a value that itself
    contains a colon is kept whole. Leading whitespace of the value is removed.

    Raises:
        IndexError: if there is no paragraph at ``position`` or its text has no ':'.
    """
    return paragraphs[position].text.split(':', 1)[1].lstrip()


total_songs = len(df)

# Each row item contains the link to a song page
for i, songurl in enumerate(df['Songpage_URL']):
    try:
        # Getting the song page and parsing the html. The timeout stops a stalled
        # connection from hanging the run; raise_for_status() makes an HTTP error
        # page fail here with a clear message instead of failing later while parsing.
        source = requests.get(songurl, timeout=30)
        source.raise_for_status()
        webpage = bs.BeautifulSoup(source.content, features='html.parser')

        # Finding various information about the song from the html
        # The heading text is cut at the first occurrence of 'Lyrics'
        song = webpage.find('h1').text.split('Lyrics')[0]

        lyrics = webpage.find('pre').text

        metacontainer = webpage.find('div', attrs={'class':'col-sm-4 col-md-4'})
        paragraphs = metacontainer.select('p')

        movie = _meta_value(paragraphs, 1)
        singer = _meta_value(paragraphs, 2)
        # Named 'lyricist' so it cannot be confused with the csv writer below
        lyricist = _meta_value(paragraphs, 3)
        composer = _meta_value(paragraphs, 4)
        year = _meta_value(paragraphs, 7)

        songdata = [song, lyrics, movie, year, singer, lyricist, composer]

        # Writing each row of song with its lyrics to csv file
        # Note that the file was opened in append mode, so every finished song is
        # saved even if the run stops later. newline='' is needed because the
        # lyrics field contains embedded newlines.
        with open(r'lyricsia_phase2.csv', 'a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(songdata)

        # Printing progress
        print(f'Added song {i+1}/{total_songs}: {song}')

    except Exception as e:
        # Best effort: report which song failed and why, then continue with the next one
        print(f'Skipped song {i+1}/{total_songs} ({songurl}): {e!r}')

Added song 1/7363: Yahaan Badala Wafa Ka 
Added song 2/7363: Chand Si Soorat 
Added song 3/7363: Wo Apani Yaad Dilaane Ko 
Added song 4/7363: Yahan Badla Wafaa Ka 
Added song 5/7363: Khayegi Thokare Ye Jawani 
Added song 6/7363: Jai Krisna Hare Shre Krishna Hare 
Added song 7/7363: Hamare Angana Aaj Baje Shehnai 
Added song 8/7363: Betaab Hai Dil Dard 
Added song 9/7363: Aaja Tujhe Afsana 
Added song 10/7363: Chale Dil Ki Duniya Barbaad Kar Ke 
Added song 11/7363: Humdard Ka Afsana 
Added song 12/7363: Meri Jaan Sunday Ke Sunday 
Added song 13/7363: Umangen Dil Ki Machli Muskuraye Zindagi Apni 
Added song 14/7363: Mera Sundar Sapna Beet Gaya 
Added song 15/7363: Desh Ki Purkaif Rangi Si Fizao Me Kahi 
Added song 16/7363: Maar Katari Mar Jaana 
Added song 17/7363: Chhuk Chhuk Chhaiya Chhaiya 
Added song 18/7363: Allah Nigehbaan Tera 
Added song 19/7363: Ha Bachh Ke Rehna Ji 
Added song 20/7363: Kya Yehi Tera Pyar Tha 
Added song 21/7363: Insaan Ki Tehzeeb Pe 
Added song 22/7363: Ye Kaun