In [29]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

# Base URL for LSE website
base_url = "https://www.lse.ac.uk"

# URL for the course list page
url = "https://www.lse.ac.uk/resources/calendar2016-2017/programmeRegulations/undergraduate/2016_outsideOptions.htm"

# Send a GET request to the URL
page = requests.get(url)
soup = BeautifulSoup(page.content, "lxml")

# Initialize a list to store course data
course_data = []

# Loop through all anchor tags with href attributes
for a in soup.find_all("a", href=True):
    link = a["href"]
    
    # Skip empty or malformed links
    if not link or link.startswith("#"):
        continue
    
    # Handle relative links using urljoin
    full_url = urljoin(base_url, link)
    
    # Look for course codes (the course code pattern is letters followed by numbers)
    name = a.get_text(strip=True)
    if len(name) >= 5 and name[:2].isalpha() and name[2:].isdigit():
        # Add course code and URL to the data list
        course_data.append({'Course Code': name, 'URL': full_url})

# Create a DataFrame from the course data
df_courses = pd.DataFrame(course_data)

# Remove duplicates, in case any course code is listed more than once
df_courses = df_courses.drop_duplicates(subset='Course Code')

# Set 'Course Code' as the index column
df_courses.set_index('Course Code', inplace=True)

# Display the DataFrame with course codes and URLs
print("Course DataFrame:")
print(df_courses)

# Function to extract prerequisites from a course page
def get_prerequisites(course_url):
    try:
        # Send a GET request to the course page
        page = requests.get(course_url)
        
        # If the URL returns a 404, return 0 as there are no prerequisites
        if page.status_code == 404:
            print(f"Page not found: {course_url}")
            return [0]
        
        soup = BeautifulSoup(page.content, "lxml")

        # Look for a possible prerequisite heading (commonly 'Prerequisite')
        prereq_section = soup.find(string=lambda text: 'prerequisite' in text.lower())
        
        if prereq_section:
            # Find the parent tag of the prereq section and try to extract course codes
            prereqs = []
            for link in prereq_section.find_parent().find_all('a', href=True):
                course_code = link.get_text(strip=True)
                # Check if the text is a valid course code (starts with letters and followed by numbers)
                if len(course_code) >= 5 and course_code[:2].isalpha() and course_code[2:].isdigit():
                    prereqs.append(course_code)
            
            # Return prerequisites if found, otherwise return [0]
            return prereqs if prereqs else [0]
        return [0]  # No prerequisites found
        
    except Exception as e:
        # Handle errors gracefully and return 0 in case of failure
        print(f"Error fetching prerequisites for {course_url}: {e}")
        return [0]


# Now, let's extract the prerequisites for each course from the dataframe
course_data_with_prereqs = []

# Loop through each course in the DataFrame
for index, row in df_courses.iterrows():
    course_code = index
    course_url = row['URL']
    
    # Get prerequisites for the course
    prerequisites = get_prerequisites(course_url)
    
    # Append the course data with prerequisites to the list
    for prereq in prerequisites:
        course_data_with_prereqs.append({'Course Code': course_code, 'Prerequisites': prereq})

# Create a new DataFrame with the course codes and their prerequisites
df_prereqs = pd.DataFrame(course_data_with_prereqs)

# Set 'Course Code' as the index column
df_prereqs.set_index('Course Code', inplace=True)

# Display the DataFrame with prerequisites
print("Course Prerequisites DataFrame:")
print(df_prereqs)

Course DataFrame:
                                                           URL
Course Code                                                   
AC100        https://www.lse.ac.uk/courseGuides/AC/2016_AC1...
AN100        https://www.lse.ac.uk/courseGuides/AN/2016_AN1...
AN101        https://www.lse.ac.uk/courseGuides/AN/2016_AN1...
AN102        https://www.lse.ac.uk/courseGuides/AN/2016_AN1...
EC100        https://www.lse.ac.uk/courseGuides/EC/2016_EC1...
...                                                        ...
ST308        https://www.lse.ac.uk/courseGuides/ST/2016_ST3...
ST327        https://www.lse.ac.uk/courseGuides/ST/2016_ST3...
AC104        https://www.lse.ac.uk/courseGuides/AC/2016_AC1...
GY240        https://www.lse.ac.uk/courseGuides/GY/2016_GY2...
MG303        https://www.lse.ac.uk/courseGuides/MG/2016_MG3...

[331 rows x 1 columns]
Page not found: https://www.lse.ac.uk/courseGuides/AC/2016_AC100.htm
Page not found: https://www.lse.ac.uk/courseGuides/AN/2016_AN100.htm
P

Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC303.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC307.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC309.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC310.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC311.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC313.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC315.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC317.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC319.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC321.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC325.htm
Page not found: https://www.lse.ac.uk/courseGuides/EC/2016_EC333.htm
Page not found: https://www.lse.ac.uk/courseGuides/EH/2016_EH204.htm
Page not found: https://www.lse.ac.uk/courseGuides/EH/2016_EH207.htm
Page not found: https://www.lse.ac

Page not found: https://www.lse.ac.uk/courseGuides/LL/2016_LL278.htm
Page not found: https://www.lse.ac.uk/courseGuides/LL/2016_LL284.htm
Page not found: https://www.lse.ac.uk/courseGuides/LL/2016_LL293.htm
Page not found: https://www.lse.ac.uk/courseGuides/LL/2016_LL295.htm
Page not found: https://www.lse.ac.uk/courseGuides/LL/2016_LL300.htm
Page not found: https://www.lse.ac.uk/courseGuides/LL/2016_LL301.htm
Page not found: https://www.lse.ac.uk/courseGuides/LL/2016_LL305.htm
Page not found: https://www.lse.ac.uk/courseGuides/MA/2016_MA203.htm
Page not found: https://www.lse.ac.uk/courseGuides/MA/2016_MA207.htm
Page not found: https://www.lse.ac.uk/courseGuides/MA/2016_MA208.htm
Page not found: https://www.lse.ac.uk/courseGuides/MA/2016_MA209.htm
Page not found: https://www.lse.ac.uk/courseGuides/MA/2016_MA210.htm
Page not found: https://www.lse.ac.uk/courseGuides/MA/2016_MA211.htm
Page not found: https://www.lse.ac.uk/courseGuides/MA/2016_MA212.htm
Page not found: https://www.lse.ac

In [26]:
# Function to extract prerequisites from a course page
def get_prerequisites(course_url):
    try:
        # Send a GET request to the course page
        page = requests.get(course_url)
        soup = BeautifulSoup(page.content, "lxml")

        # Look for a possible prerequisite heading (commonly 'Prerequisite')
        prereq_section = soup.find(string=lambda text: 'prerequisite' in text.lower())
        
        if prereq_section:
            # Find the parent tag of the prereq section and try to extract course codes
            prereqs = []
            for link in prereq_section.find_parent().find_all('a', href=True):
                course_code = link.get_text(strip=True)
                # Check if the text is a valid course code (starts with letters and followed by numbers)
                if len(course_code) >= 5 and course_code[:2].isalpha() and course_code[2:].isdigit():
                    prereqs.append(course_code)
            
            # Return prerequisites if found, otherwise return [0]
            return prereqs if prereqs else [0]
        return [0]  # No prerequisites found
        
    except Exception as e:
        # Handle errors gracefully and return 0 in case of failure
        print(f"Error fetching prerequisites for {course_url}: {e}")
        return [0]


# Now, let's extract the prerequisites for each course from the dataframe
course_data_with_prereqs = []

# Loop through each course in the DataFrame
for index, row in df_courses.iterrows():
    course_code = index
    course_url = row['URL']
    
    # Get prerequisites for the course
    prerequisites = get_prerequisites(course_url)
    
    # Append the course data with prerequisites to the list
    for prereq in prerequisites:
        course_data_with_prereqs.append({'Course Code': course_code, 'Prerequisites': prereq})

# Create a new DataFrame with the course codes and their prerequisites
df_prereqs = pd.DataFrame(course_data_with_prereqs)

# Set 'Course Code' as the index column
df_prereqs.set_index('Course Code', inplace=True)

# Display the DataFrame with prerequisites
print(df_prereqs)

             Prerequisites
Course Code               
AC100                    0
AN100                    0
AN101                    0
AN102                    0
EC100                    0
...                    ...
ST308                    0
ST327                    0
AC104                    0
GY240                    0
MG303                    0

[331 rows x 1 columns]


In [32]:
import pandas as pd

# Replace 'your_file.xlsx' with the actual path to your Excel file
df = pd.read_excel("financem.xlsx")

print(df)

   Module  Year  Optional/Compulsory Prerequisites  Units
0   MA108     1                    0             0    0.5
1   FM100     1                    0             0    0.5
2   FM102     1                    0             0    0.5
3   EC1A3     1                    0             0    0.5
4   EC1B3     1                    0             0    0.5
5   ST102     1                    0             0    1.0
6   AC102     1                    1             0    0.5
7   ST101     1                    1             0    0.5
8   FM201     2                    0             0    0.5
9   FM200     2                    0             0    0.5
10  EC2C3     2                    0             0    0.5
11  EC2C4     2                    0             0    0.5
12  EC2A3     2                    0             0    0.5
13  FM215     2                    0             0    0.5
14  FM214     2                    0             0    0.5
15  ST115     2                    1             0    0.5
16  FM301     

In [35]:
import pandas as pd
import itertools

# Define your data (corrected lengths)
data = {
    'Module': ['MA108', 'FM100', 'FM102', 'EC1A3', 'EC1B3', 'ST102', 'AC102', 'ST101',
               'FM201', 'FM200', 'EC2C3', 'EC2C4', 'EC2A3', 'FM215', 'FM214', 'ST115',
               'FM301', 'FM302', 'FM321', 'FM322', 'FM304', 'FM305', 'ST310', 'ST311',
               'AC205', 'AC206'],
    'Year': [1]*8 + [2]*8 + [3]*8 + [2, 2],
    'Optional/Compulsory': [0]*6 + [1, 1] + [0]*14 + [1, 1],
    'Prerequisites': [0]*22 + ['AC102', 'AC102'],
    'Units': [0.5]*26
}

df = pd.DataFrame(data)

# Separate compulsory and optional modules
compulsory = df[df['Optional/Compulsory'] == 0]
optional = df[df['Optional/Compulsory'] == 1]

# Get all compulsory modules
compulsory_modules = compulsory['Module'].tolist()
compulsory_units = compulsory['Units'].sum()

# Prerequisite map for optional modules
prereq_map = dict(zip(optional['Module'], optional['Prerequisites']))

# Store valid combinations
valid_combinations = []

# Generate all combinations of optional modules
for r in range(0, len(optional)+1):
    for combo in itertools.combinations(optional.itertuples(index=False), r):
        modules = [mod.Module for mod in combo]
        units = sum(mod.Units for mod in combo)

        # Check prerequisites
        valid = True
        for mod in combo:
            prereq = prereq_map[mod.Module]
            if prereq != 0 and prereq not in modules and prereq not in compulsory_modules:
                valid = False
                break

        if valid and compulsory_units + units == 12:
            full_combo = compulsory_modules + modules
            valid_combinations.append(full_combo)

# Convert to DataFrame
result_df = pd.DataFrame(valid_combinations)
print(result_df)

ValueError: All arrays must be of the same length