In [10]:
import re
import pandas as pd

# Regex describing one university record: fifteen consecutive "Label: value"
# lines, from "Country:" through "Majors:". Each value is exposed as a named
# group. The pattern is meant to be used with re.VERBOSE | re.MULTILINE.
#
# NOTE: under re.VERBOSE, unescaped literal whitespace inside the pattern is
# ignored, so every space that has to be matched must be written as \s. This
# includes the spaces inside the "Not globally ranked" alternative; written
# with plain spaces it would only ever match "Notgloballyranked".
pattern = r"""
Country:\s(?P<Country>.*?)\n
University:\s(?P<University>.*?)\n
CollegeRank:\s*(?P<CollegeRank>Not\sglobally\sranked|\d+\+?|\d+-\d+)\s*(?:\([^)]*\))?\s*\n
Tuition\s\(EUR/year\):\s(?P<Tuition>[\d,]+)(?:\s*\(.*?\))?\n
Percentage\sof\sInternational\sStudents:\s(?P<PercOfIntStud>\d+%?)(?:\s*\(.*?\))?\n
Acceptance\sRate:\s(?P<AcceptanceRate>\d+%)(?:\s*\(.*?\))?\n
Average\sSafety\sIndex\s\(0-100\):\s(?P<AvgSafetyIndex>\d+)(?:\s*\(.*?\))?\n
Cost\sof\sLiving\s\(EUR/month\):\s(?P<CostOfLiving>[\d,]+)(?:\s*\(.*?\))?\n
Rent\s\(EUR/month\):\s(?P<Rent>[\d,]+)\n
Groceries\s\(EUR/month\):\s(?P<Groceries>[\d,]+)\n
Recreation\sCost\s\(EUR/month\):\s(?P<RecreationCost>[\d,]+)\n
Healthcare\sPrice\s\(EUR/year\):\s(?P<HealthcarePrice>[\d,]+)\n
Average\sMonthly\sTransportation\sCost\s\(EUR/month\):\s(?P<AvgMntTransportCost>[\d,]+)\n
Link:\s(?P<Link>.*?)\n
Majors:\s(?P<Majors>.*?)$
"""

# Normalise the amount and percentage values captured for one record
def preprocess_numeric_fields(row):
    """Clean amount and percentage fields of a record, modifying it in place.

    Amount fields (tuition, living costs, ...) have their thousands
    separators (",") removed but remain strings. Percentage fields have the
    "%" sign removed and are converted to ``int``. A field that is missing,
    or whose value is not a string, is left untouched.

    Args:
        row: Mapping of field name to captured value for one record.

    Returns:
        The same ``row`` object that was passed in, after modification.
    """
    for name in (
        'Tuition', 'CostOfLiving', 'Rent',
        'Groceries', 'RecreationCost',
        'HealthcarePrice', 'AvgMntTransportCost',
    ):
        value = row.get(name)
        if isinstance(value, str):
            row[name] = value.replace(",", "")

    for name in ('PercOfIntStud', 'AcceptanceRate'):
        value = row.get(name)
        if isinstance(value, str):
            row[name] = int(value.replace('%', ''))

    return row

# Extract data from text
def _parse_majors(majors_field):
    """Convert ``"Name (1), Other (0)"`` into ``{"Name": 1, "Other": 0}``.

    Each comma-separated entry is split at its LAST " (" so that a major
    whose own name contains parentheses (e.g. "Engineering (Civil) (1)")
    keeps its full name. Blank entries are skipped.

    Raises:
        ValueError: If a non-blank entry has no " (flag)" suffix, or the
            flag is not an integer.
    """
    flags = {}
    for entry in majors_field.split(", "):
        entry = entry.strip()
        if not entry:
            # An empty "Majors:" line yields one blank entry; nothing to add.
            continue
        name, separator, flag = entry.rpartition(" (")
        if not separator:
            raise ValueError(
                f"Malformed major entry (expected 'Name (0 or 1)'): {entry!r}"
            )
        flags[name.strip()] = int(flag.strip(")"))
    return flags


def extract_data(text):
    """Extract every university record found in ``text``.

    Each match of the module-level ``pattern`` (applied with
    ``re.VERBOSE | re.MULTILINE``) becomes one dict: the named groups are
    passed through ``preprocess_numeric_fields``, and the "Majors" field is
    replaced by one key per major holding its integer flag.

    Args:
        text: The text to scan for records.

    Returns:
        A list of dicts, one per matched record, in order of appearance.

    Raises:
        ValueError: If a record's majors list contains a malformed entry.
    """
    data = []
    for match in re.finditer(pattern, text, re.VERBOSE | re.MULTILINE):
        row = preprocess_numeric_fields(match.groupdict())
        # Parse Majors into separate columns with 1/0 values
        row.update(_parse_majors(row.pop("Majors")))
        data.append(row)
    return data

# Tidy the raw text: trim every line and drop the blank ones
def preprocess_text(text):
    """Return ``text`` with each line stripped and empty lines removed.

    Lines are trimmed of leading and trailing whitespace; lines that are
    empty after trimming are discarded. The remaining lines are joined with
    a single newline character (no trailing newline).
    """
    kept = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept.append(trimmed)
    return "\n".join(kept)

# Input text file containing university data
# (a relative path, resolved against the current working directory).
input_file = 'universities.txt'

# Read and preprocess the text file
# NOTE(review): no encoding is passed to open(), so the platform's default
# text encoding is used; confirm it matches how universities.txt was saved.
with open(input_file, 'r') as file:
    raw_text = file.read()

# Clean the text by removing empty lines
# (preprocess_text also strips leading/trailing whitespace from each line).
cleaned_text = preprocess_text(raw_text)

# Extract data into structured format
# (extract_data returns a list with one dict per matched record).
data = extract_data(cleaned_text)

# Create a DataFrame
df = pd.DataFrame(data)

# Save to Excel
# NOTE(review): the original comment here was cut off mid-sentence ("Please do
# not use Excel if you have ..."); restore or remove the intended caveat.
output_file = 'Universities_DataFull_Gemini.xlsx'
# index=False keeps the DataFrame's row index out of the spreadsheet.
df.to_excel(output_file, index=False)

print(f"Data successfully written to {output_file}")

Data successfully written to Universities_DataFull_Gemini.xlsx
