In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
url = "https://www.lse.ac.uk/study-at-lse/undergraduate/degree-programmes-2024/ba-history#:~:text=BA%20History%20at%20LSE%20is,world%20we%20live%20in%20today"
page = requests.get(url)
soup = BeautifulSoup(page.text, "html")

####  Cleaning the text

In [3]:
clean_text = soup.get_text(" ", strip=True)
clean_text = clean_text.replace("Browser does not support script","")

#### Importing required Packages

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")    # loading a pre_trained_model
doc = nlp(clean_text)

In [5]:
for sentence in doc.sents:
    print(sentence)

BA History .
Skip to content London School of Economics and Political Science Search Menu Study at LSE Toggle navigation Undergraduate Graduate Executive education Study Abroad Summer schools Online certificate courses International students Meet, visit and discover LSE Trending at LSE Virtual undergraduate open day Campus tours Students at LSE Blog Careers service Behavioural Lab: get involved as a research participant or assistant
Apply Got an admissions question?
Check our Admissions Knowledge Base Study at LSE Undergraduate Graduate Executive Education Online courses Summer School Study abroad Student life Accommodation Research Discover our research LSE Blogs Research for the World online magazine LSE Press Find LSE academics Research centres Understanding Global Politics Understanding the UK Economy Our REF 2021 results News & events Latest news Upcoming events Films & podcasts Media queries About LSE An introduction Our Campaign Our strategy LSE leadership Departments & institut

#### From these sentences we can lookup for degree_type

In [6]:
for sentence in doc.sents:
    if "undergraduate" in sentence.text.lower():
        degree_type = "undergraduate"
    elif "postgraduate" in sentence.text.lower():
        degree_type = "postgraduate"
        
degree_type

'undergraduate'

#### Now we can lookup for application_deadline

In [7]:
for sentence in doc.sents:
    if "application deadline" in sentence.text.lower():
        dates = []
        for ent in sentence.ents:
            if ent.label_ == "DATE":
                dates.append(ent)
                application_deadline = str(dates[-1])

application_deadline

'31 January 2024'

#### Next we need to lookup for duration_of_the_degree

In [10]:
from nltk.tokenize import word_tokenize

for sentence in doc.sents:
    if "years" in sentence.text.lower():
        tokens = word_tokenize(sentence.text)
        duration = tokens[:3]
        duration = ' '.join(duration)
        break
        
duration

'Three years full-time'

#### Now we need to find the fees for the overseas students

In [11]:
from nltk.tokenize import word_tokenize

for sentence in doc.sents:
    if "overseas students:" in sentence.text.lower():
            tokens = word_tokenize(sentence.text)
            fees = tokens[-2]
        
fees     

'£26,184'

#### Now we can lookup for the admission_eligibility/entri requirements

In [12]:
entri_requirements = " "
for sentence in doc.sents:
    if "gcse" in sentence.text.lower():
            entri_requirements +=sentence.text
            
entri_requirements

' Entry requirements Below we list our entry requirements in terms of GCSEs, A-Levels (the entry requirements should be read alongside our A-level subject combinations information) and the International Baccalaureate (IB) Diploma.GCSEs A strong set of GCSE grades including the majority at A (or 7) and A* (or 8-9) GCSE (or equivalent) English Language and Mathematics grades should be no lower than B (or 6)We also consider your overall GCSE subject profile A-levels*'

#### And finally we need to extract the degree_description

In [13]:
program_structure = " "
for sentence in doc.sents:
        if "lse100" in sentence.text.lower():
                program_structure += sentence.text
        if "first year" in sentence.text.lower():
                program_structure += sentence.text
        if "second year" in sentence.text.lower():
                program_structure += sentence.text
        if "third year" in sentence.text.lower():
                program_structure += sentence.text

In [14]:
degree_description = program_structure.replace("In addition, you will\xa0\u200balso\xa0take LSE100","")

In [15]:
degree_description

' BA History offers you a range of intellectual challenges: from broad survey courses, that look at particular themes or regions over a long period of time; to specialist courses that will develop your expertise in a topic, to the long essay or dissertation which allows you to pursue independent research in your third year.Programme structure and courses The degree involves studying courses to the value of 12 units over three years, plus LSE100.First year In the first year you will take two compulsory courses and two course options, chosen from three history courses, one further IR course, language courses and outside options. .Second year In the second year you will take a range of history courses from approved lists.Third year In your third year you will take three history options or two history options plus an approved outside option.In the QS World University History Subject Table for 2021, History at LSE ranked 5th overall in the world for the second year running, ahead of Columbi

#### Now we can store all the extracted data into a dictionary 

In [16]:
data = {
    "Degree Program" :["Degree type", "Application deadline","Duration of the degree","Admission eligibility/entry requirements",
                       "Overseas student fees","Brief degree description"],
    "Details" : [degree_type, application_deadline, duration, entri_requirements, fees, degree_description]
}

#### Now we can make a dataframe 

In [20]:
import pandas as pd

pd.set_option("display.max_colwidth", None)
df = pd.DataFrame(data)
df

Unnamed: 0,Degree Program,Details
0,Degree type,undergraduate
1,Application deadline,31 January 2024
2,Duration of the degree,Three years full-time
3,Admission eligibility/entry requirements,"Entry requirements Below we list our entry requirements in terms of GCSEs, A-Levels (the entry requirements should be read alongside our A-level subject combinations information) and the International Baccalaureate (IB) Diploma.GCSEs A strong set of GCSE grades including the majority at A (or 7) and A* (or 8-9) GCSE (or equivalent) English Language and Mathematics grades should be no lower than B (or 6)We also consider your overall GCSE subject profile A-levels*"
4,Overseas student fees,"£26,184"
5,Brief degree description,"BA History offers you a range of intellectual challenges: from broad survey courses, that look at particular themes or regions over a long period of time; to specialist courses that will develop your expertise in a topic, to the long essay or dissertation which allows you to pursue independent research in your third year.Programme structure and courses The degree involves studying courses to the value of 12 units over three years, plus LSE100.First year In the first year you will take two compulsory courses and two course options, chosen from three history courses, one further IR course, language courses and outside options. .Second year In the second year you will take a range of history courses from approved lists.Third year In your third year you will take three history options or two history options plus an approved outside option.In the QS World University History Subject Table for 2021, History at LSE ranked 5th overall in the world for the second year running, ahead of Columbia, Stanford, Berkeley and Princeton."


#### Now we can save this as a csv file

In [24]:
df.to_excel("BA History program Details.xlsx", index = False)