In [5]:
import re
import pandas as pd

# Preprocess text to remove empty lines
def preprocess_text(text):
    """Return *text* with each line stripped and blank lines removed.

    The input is split with ``str.splitlines``; every line has its leading
    and trailing whitespace removed, lines that are empty afterwards are
    dropped, and the remaining lines are joined with a single newline.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    # filter(None, ...) keeps only the non-empty (truthy) strings.
    return "\n".join(filter(None, stripped_lines))

# Define a function to extract university data from text
def _parse_majors(majors_field):
    """Turn a ``"Name (flag), Name (flag)"`` string into a ``{name: int}`` dict."""
    majors = {}
    for entry in majors_field.split(", "):
        entry = entry.strip()
        if not entry:
            # Empty list or stray separator: nothing to record.
            continue
        # Split on the LAST " (" so that names which themselves contain
        # parentheses, e.g. "Engineering (Civil) (1)", keep their full name.
        name, sep, flag = entry.rpartition(" (")
        if not sep:
            raise ValueError(
                f"Malformed major entry {entry!r}; expected 'Name (<int>)'"
            )
        majors[name.strip()] = int(flag.strip(")"))
    return majors


def extract_data(text):
    """Extract university records from *text* as a list of dicts.

    A record is fifteen consecutive ``Label: value`` lines in the fixed order
    given by the pattern below (Country ... Majors). Text that does not match
    the pattern is skipped.

    Args:
        text: The input text. The pattern expects the lines of a record to
            follow each other directly, with no blank lines in between.

    Returns:
        One dict per matched record. Keys are the named groups of the pattern
        except ``Majors``, plus one key per major listed on the ``Majors``
        line (value: the integer given in parentheses after the major name).
        ``PercOfIntStud`` and ``AcceptanceRate`` are converted to ``int``;
        all other captured fields are left as the strings that were matched.

    Raises:
        ValueError: If a major entry has no ``" (<flag>)"`` suffix or its
            flag is not an integer.
    """
    pattern = r"""
    Country:\s(?P<Country>.*?)\n
    University:\s(?P<University>.*?)\n
    CollegeRank:\s(?P<CollegeRank>.*?)\n
    Tuition\s\(EUR/year\):\s(?P<Tuition>\d+)\n
    PercOfIntStud:\s(?P<PercOfIntStud>\d+%)\n
    AcceptanceRate\s\(%\):\s(?P<AcceptanceRate>\d+%)\n
    AvgSafetyIndex\s\(0-100\):\s(?P<AvgSafetyIndex>\d+)\n
    CostOfLiving\s\(EUR/month\):\s(?P<CostOfLiving>\d+)\n
    Rent\s\(EUR/month\):\s(?P<Rent>\d+)\n
    Groceries\s\(EUR/month\):\s(?P<Groceries>\d+)\n
    RecreationCost\s\(EUR/month\):\s(?P<RecreationCost>\d+)\n
    HealthcarePrice\s\(EUR/year\):\s(?P<HealthcarePrice>\d+)\n
    AvgMntTransportCost\s\(EUR/month\):\s(?P<AvgMntTransportCost>\d+)\n
    Link:\s(?P<Link>.*?)\n
    Majors:\s(?P<Majors>.*?)$
    """
    data = []
    for match in re.finditer(pattern, text, re.VERBOSE | re.MULTILINE):
        row = match.groupdict()

        # The pattern guarantees these two fields look like "<digits>%".
        for pct_field in ("PercOfIntStud", "AcceptanceRate"):
            row[pct_field] = int(row[pct_field].replace('%', ''))

        # Replace the raw Majors string with one column per major.
        row.update(_parse_majors(row.pop("Majors")))
        data.append(row)
    return data

# Input text file containing university data
input_file = 'universities.txt'
coord_file = 'coord.xlsx'

# Read and preprocess the text file
# NOTE(review): no encoding is passed, so the platform default is used;
# confirm it matches how the input file was saved.
with open(input_file, 'r') as file:
    raw_text = file.read()

# Clean the text by removing empty lines
cleaned_text = preprocess_text(raw_text)

# Extract data into structured format
data = extract_data(cleaned_text)
if not data:
    # Without records the DataFrame has no 'University' column and the merge
    # below would fail with an opaque KeyError.
    raise ValueError(f"No university records could be parsed from {input_file}")

# Create a DataFrame
df = pd.DataFrame(data)
geo_df = pd.read_excel(coord_file)

# Left join: every parsed university is kept, even without a row in coord_file
merged_df = pd.merge(df, geo_df, on='University', how='left')

# Use 'Website' from coord_file as the link, but fall back to the link parsed
# from the text where 'Website' is missing (e.g. universities with no match in
# the left join); otherwise their parsed link would be overwritten with NaN.
link_values = merged_df['Website']
if 'Link' in merged_df.columns:
    # 'Link' is only absent if coord_file has its own 'Link' column, in which
    # case pandas suffixes both and there is no single column to fall back on.
    link_values = link_values.fillna(merged_df['Link'])
merged_df['Link'] = link_values
merged_df = merged_df.drop(columns=['Website'])

# Save to Excel
output_file = 'Universities_DataCoord.xlsx'
merged_df.to_excel(output_file, index=False)

print(f"Data successfully written to {output_file}")

Data successfully written to Universities_DataCoord.xlsx
