<a href="https://colab.research.google.com/github/GiridharIITM/alumni_pathfinder/blob/main/extract_alumni_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import bs4 as bs
import urllib.request
 
# Getting soup object
source = urllib.request.urlopen('https://www.alumni.iitb.ac.in/en/award-list/distinguished-alumnus').read()
soup = bs.BeautifulSoup(source,'lxml')

In [None]:
name_list = []
qualification_list = []
urls_list = []
 
for bsname in soup.find_all('td', class_='views-field views-field-title'):
  name_list.append(bsname.text.strip())
  urls_list.append(bsname.a.get('url'))
 
for bsqual in soup.find_all('td', class_='views-field views-field-field-iitb-passingout'):
  qualification_list.append(bsqual.text.strip())
 
next_page_url = soup.find('a', title='Go to next page').get('href')

In [None]:
urls_list

[None, None, None, None, None, None, None, None, None, None]

Now that we have soup object, will get data and store into pandas data frame.


Description of data:

Name : alumni_history

Columns:
* name: 
* education_start:
* education_institution:
* education_branch:
* education_start:
* education_end: 
* position_company:
* position_title:
* position_pay:
* position_start:
* position_end:


In [None]:
print(soup.find_all('p'))

[<p><a href="https://acr.iitb.ac.in/gift_student_ITsupport_onlinelearning/" target="_blank" title="Supporting Students with IT Hardware for Online Learning at IIT Bombay">Supporting Students with IT Hardware for Online Learning at IIT Bombay</a></p>, <p class="copyright">© Copyright IIT Bombay 2014</p>]


In [69]:
import pandas as pd
import os, sys

alumni_cols = ['id','name',
               'education_start','education_end','education_inst','education_degree','education_branch','education_result',
               'position_start','position_end','position_employer','position_title','position_pay', 'position_workhours',
               'urls_list', 'linkedIn_url'
               ]

BASE_DIR = os.curdir
FILE_DIR = "textfiles"
os.makedirs(os.path.join(BASE_DIR,FILE_DIR), exist_ok=True)

class alumni_extractor:

  def __init__(self, base_url, alumni_url):
    self.base_url = base_url
    self.alumni_url = alumni_url
    self.alumni_history = pd.DataFrame(columns = alumni_cols)
    
  def build_soup(self, url):
    source = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(source,'lxml')
    return soup

  # This function actually parses the website, and stores information. 
  # Does a bare minimum population of the alumni_history dataframe.
  # Will be different for different institutes
  def build_data(self):
    pass

  # This function is for NLP functionality
  def enrich_data(self):    
    pass

class iitb_extractor(alumni_extractor):
  def build_data(self):
    next_page_url = self.alumni_url
    max_new_pages = 3
    num_new_pages = 0
    hash_list = []
    name_list = []
    file_list = []
    urls_list = []
    edst_list = []
    ednd_list = []
    eddg_list = []
    edin_list = []
    edbr_list = []
    while(not next_page_url is None):
      soup = self.build_soup(next_page_url)
      for bsname in soup.find_all('td', class_='views-field views-field-title'):
        name = bsname.text.strip()
        id = hash(name) % ((sys.maxsize + 1) * 2)
        url = self.base_url + bsname.a.get('href')
        hash_list.append(id)
        name_list.append(name)
        urls_list.append([url])
        file_list.append(self.build_file(str(id), url))
      for bsqual in soup.find_all('td', class_='views-field views-field-field-iitb-passingout'):
        qual = bsqual.text.strip()
        degree, yr_of_completion, branch = qual.split(', ')
        ednd_list.append([yr_of_completion])
        eddg_list.append([degree])
        edin_list.append(['IIT Bombay'])
        edbr_list.append([branch])
      next_page_url = soup.find('a', title='Go to next page')
      num_new_pages += 1
      if(num_new_pages < max_new_pages and not next_page_url is None):
        next_page_url = self.base_url + next_page_url.get('href')
      else:
        next_page_url = None
    self.alumni_history['id'] = hash_list
    self.alumni_history['name'] = name_list
    self.alumni_history['file_list'] = file_list
    self.alumni_history['urls_list'] = urls_list
    self.alumni_history['education_end'] = ednd_list
    self.alumni_history['education_degree'] = eddg_list
    self.alumni_history['education_inst'] = edin_list
    self.alumni_history['education_branch'] = edbr_list

  def build_file(self, name, url):
    soup = self.build_soup(url)
    fname = os.path.join(BASE_DIR, FILE_DIR, name+'.txt')
    textlines = []
    for section in soup.find_all('div', class_='field-items'):
      textlines.append(section.text.strip() + "\n")
    with open(fname, mode='w') as f:
      f.writelines(textlines)
    return(fname)


In [70]:
iitb = iitb_extractor('https://www.alumni.iitb.ac.in/','https://www.alumni.iitb.ac.in/en/award-list/distinguished-alumnus')
iitb.build_data()


In [71]:
iitb.alumni_history

Unnamed: 0,id,name,education_start,education_end,education_inst,education_degree,education_branch,education_result,position_start,position_end,position_employer,position_title,position_pay,position_workhours,urls_list,linkedIn_url,file_list
0,1047561575613365510,Dr. Mayuresh V. Kothare,,[1991],[IIT Bombay],[B.Tech.],[Chemical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/1047561575613365510.txt
1,10947462743544533938,Mr. Abidali Z. Neemuchwala,,[1992],[IIT Bombay],[M.Tech.],[Mechanical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/10947462743544533938.txt
2,14523065976710863416,Mr. Surendra Murlidhar Vaidya,,[1983],[IIT Bombay],[B.Tech.],[Metallurgical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/14523065976710863416.txt
3,12972712972911913078,Prof. Gaurav S. Sukhatme,,[1991],[IIT Bombay],[B.Tech.],[Computer Science and Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/12972712972911913078.txt
4,11948075183958400738,Prof. Janat Shah,,[1980],[IIT Bombay],[B.Tech.],[Mechanical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/11948075183958400738.txt
5,15267884567679417594,Prof. Kavita Ramanan,,[1992],[IIT Bombay],[B.Tech.],[Chemical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/15267884567679417594.txt
6,5597221031489067749,Prof. Varadarajan V. Chari,,[1974],[IIT Bombay],[B.Tech.],[Chemical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/5597221031489067749.txt
7,10408473595204815277,Dr. Ajei Gopal,,[1982],[IIT Bombay],[B.Tech.],[Mechanical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2019...,,./textfiles/10408473595204815277.txt
8,16350226210890559013,Dr. Lalitesh Katragadda,,[1990],[IIT Bombay],[B.Tech.],[Aerospace Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2019...,,./textfiles/16350226210890559013.txt
9,17860445187742847796,Dr. Shashidhar Thakur,,[1990],[IIT Bombay],[B.Tech.],[Computer Science and Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2019...,,./textfiles/17860445187742847796.txt


# New Section

In [None]:
soup = iitb.build_soup(iitb.alumni_history.urls_list[0][0])

In [None]:
for section in soup.find_all('div', class_='field-items'):
  print(section.text)

Distinguished Alumnus
2020

B.Tech., 1991, Chemical Engineering
Dr. Mayuresh Kothare is Chairman, and R. L. McCann Professor of Chemical Engineering and Bioengineering at Lehigh University, United States.
 
Dr. Kothare earned his B.Tech degree in Chemical Engineering from IIT Bombay in 1991 with a silver medal, M.S. and Ph.D. degrees in chemical engineering from the California Institute of Technology, Pasadena, CA, in 1995 and 1997, respectively.
 
He was a Research Assistant with the Swiss Federal Institute of Technology (ETH), Zurich, Switzerland during 1994-96; a Visiting Scholar in electrical engineering at Purdue University (1995) and chemical engineering at the City College of New York, New York (1997); and a Postdoctoral Researcher with Mobil Oil Corporation (1997-98). His primary professional experience began as Assistant Professor of Chemical Engineering at Lehigh University (1998-2003), Associate Professor (2003-08), Professor (2008-12) and Chairman of the Department of Chemi

In [None]:
iitb.alumni_history

Unnamed: 0,id,name,education_start,education_end,education_inst,education_degree,education_branch,education_result,position_start,position_end,position_employer,position_title,position_pay,position_workhours,urls_list,linkedIn_url,file_list
0,1047561575613365510,Dr. Mayuresh V. Kothare,,[1991],[IITB],[B.Tech.],[Chemical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/1047561575613365510.txt
1,10947462743544533938,Mr. Abidali Z. Neemuchwala,,[1992],[IITB],[M.Tech.],[Mechanical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/10947462743544533938.txt
2,14523065976710863416,Mr. Surendra Murlidhar Vaidya,,[1983],[IITB],[B.Tech.],[Metallurgical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/14523065976710863416.txt
3,12972712972911913078,Prof. Gaurav S. Sukhatme,,[1991],[IITB],[B.Tech.],[Computer Science and Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/12972712972911913078.txt
4,11948075183958400738,Prof. Janat Shah,,[1980],[IITB],[B.Tech.],[Mechanical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/11948075183958400738.txt
5,15267884567679417594,Prof. Kavita Ramanan,,[1992],[IITB],[B.Tech.],[Chemical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/15267884567679417594.txt
6,5597221031489067749,Prof. Varadarajan V. Chari,,[1974],[IITB],[B.Tech.],[Chemical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2020...,,./textfiles/5597221031489067749.txt
7,10408473595204815277,Dr. Ajei Gopal,,[1982],[IITB],[B.Tech.],[Mechanical Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2019...,,./textfiles/10408473595204815277.txt
8,16350226210890559013,Dr. Lalitesh Katragadda,,[1990],[IITB],[B.Tech.],[Aerospace Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2019...,,./textfiles/16350226210890559013.txt
9,17860445187742847796,Dr. Shashidhar Thakur,,[1990],[IITB],[B.Tech.],[Computer Science and Engineering],,,,,,,,[https://www.alumni.iitb.ac.in//en/awards/2019...,,./textfiles/17860445187742847796.txt
