In [None]:
import pandas as pd
import re

# Pattern to detect the start of a profile; group 1 is the person's full name.
profile_pattern = re.compile(r"Go to (.+?)’s profile")

# Pattern for company-size labels such as "10,000+" or "201–500".
# There is deliberately no trailing \b after "\+": "+" is a non-word character,
# so "\+\b" only matches when a letter or digit directly follows the plus sign
# and would miss "10,000+" at the end of a line or before a space.
company_size_pattern = re.compile(
    r"\b(\d{1,3}(,\d{3})?\+|\d{1,3}(,\d{3})?–\d{1,3}(,\d{3})?\b)"
)
# Pattern for the known country names (case-insensitive, whole words only).
country_pattern = re.compile(r"\b(Czechia|Slovakia|Poland|Germany|Austria|Hungary|France|Netherlands|Belgium|Sweden|Switzerland|UK|United Kingdom|Ireland)\b", re.IGNORECASE)

# Job title and company share one line, separated by a run of 2+ whitespace.
_column_gap_pattern = re.compile(r"\s{2,}")

# How many lines (counted from the header line) each field search may cover.
_TITLE_WINDOW = 10
_COUNTRY_WINDOW = 15
_SIZE_WINDOW = 20


def _split_name(full_name):
    """Split a full name into ``(first_name, remaining_names)``.

    The remaining names are re-joined with single spaces. An empty or
    whitespace-only input yields ``("", "")`` instead of raising.
    """
    first, *rest = full_name.split() or [""]
    return first, " ".join(rest)


def _section_end(lines, header_idx, max_span):
    """Return the exclusive end index of the profile section at ``header_idx``.

    The section stops at the next profile header, or ``max_span`` lines after
    the header, or the end of ``lines`` -- whichever comes first. Stopping at
    the next header keeps one profile from picking up another profile's data.
    """
    limit = min(header_idx + max_span, len(lines))
    for idx in range(header_idx + 1, limit):
        if profile_pattern.match(lines[idx].strip()):
            return idx
    return limit


def _first_matching_line(lines, start, stop, pattern):
    """Return the first stripped line in ``lines[start:stop]`` that ``pattern``
    matches anywhere, or ``""`` when no line matches."""
    candidates = (lines[idx].strip() for idx in range(start, stop))
    return next((text for text in candidates if pattern.search(text)), "")


def _find_title_and_company(lines, start, stop):
    """Return ``(job_title, company)`` from the first line in
    ``lines[start:stop]`` that has two or more columns separated by 2+
    whitespace characters, or ``("", "")`` when there is none."""
    for idx in range(start, stop):
        columns = _column_gap_pattern.split(lines[idx].strip())
        if len(columns) >= 2:
            return columns[0].strip(), columns[1].strip()
    return "", ""


def extract_profiles(lines):
    """Extract one record per "Go to <name>’s profile" header in ``lines``.

    Args:
        lines: The lines of the exported text, in order.

    Returns:
        A list of dicts with the keys "Name", "Surname", "Email Address"
        (always empty), "Job Title", "Company Name", "Company Size" and
        "Country". Fields that cannot be found are empty strings; "Company
        Size" and "Country" hold the whole stripped line that matched.
    """
    records = []
    for i, raw_line in enumerate(lines):
        header = profile_pattern.match(raw_line.strip())
        if not header:
            continue

        name, surname = _split_name(header.group(1).strip())

        # All searches start after the header line (so a name such as
        # "Ireland Smith" is not mistaken for a country) and never run past
        # the start of the next profile.
        first = i + 1
        end = _section_end(lines, i, _SIZE_WINDOW)

        job_title, company = _find_title_and_company(
            lines, first, min(i + _TITLE_WINDOW, end)
        )
        company_size = _first_matching_line(lines, first, end, company_size_pattern)
        country = _first_matching_line(
            lines, first, min(i + _COUNTRY_WINDOW, end), country_pattern
        )

        records.append({
            "Name": name,
            "Surname": surname,
            "Email Address": "",
            "Job Title": job_title,
            "Company Name": company,
            "Company Size": company_size,
            "Country": country
        })
    return records


# Run the file I/O only when executed as a script or notebook cell, so the
# parsing helpers above can be imported without touching the filesystem.
# The names below are still module-level, exactly as before.
if __name__ == "__main__":
    # Read the text file
    with open("all.txt", "r", encoding="utf-8") as file:
        text = file.read()

    lines = text.splitlines()

    results = extract_profiles(lines)

    # Save to Excel
    df = pd.DataFrame(results)
    df.to_excel("linkedin_extracted.xlsx", index=False)

    print("✅ Done! Saved as linkedin_extracted.xlsx")
