In [1]:
import requests
import pandas as pd
import time
import json
import importlib, wikidata_queries
importlib.reload(wikidata_queries)
from wikidata_queries import WIKIDATA_QUERIES  # erneut ausführen
import requests
import pandas as pd
from io import StringIO

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [62]:
class WikiData:
    def __init__(self, tickers):
        self.tickers = tickers
        self.url = "https://query.wikidata.org/sparql"
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "PortfolioBot/1.0 (contact: your-email@example.com)",
            "Accept": "text/csv",
        })

    def batching(self, batch_size: int = 20):
        self.tickers.sort_values("Exchange_QID", inplace=True)
        batches = []
        for qid, df in self.tickers.groupby("Exchange_QID", sort=False):
            isin_series = df["ISIN"].astype("string").str.strip()
            has_isin = isin_series.notna() & (isin_series != "")

            isin_df = df.loc[has_isin]
            tkr_df  = df.loc[~has_isin]

            for i in range(0, len(isin_df), batch_size):
                batches.append(("ISIN", qid, isin_df.iloc[i:i + batch_size]))

            for i in range(0, len(tkr_df), batch_size):
                batches.append(("Ticker", qid, tkr_df.iloc[i:i + batch_size]))
        print("Batching completed")
        return batches
    '''
    def InvokeWikiData(self, query):
        url = "https://query.wikidata.org/sparql"
        headers = {
            "User-Agent": "PortfolioBot/1.0 (contact: your-email@example.com)",
            "Accept": "text/csv"
        }
        try:
            response = requests.get(
                url,
                params={"query": query, "format": "json"},
                headers=headers,
                timeout=30
            )
            response.raise_for_status()
            return pd.read_csv(StringIO(response.text))
        except Exception as e:
            print(f"Fehler bei der Anfrage: {e}")
            return None
    '''
    def InvokeWikiData(self, query, timeout=20, max_retries=3):
        url = "https://query.wikidata.org/sparql"
        headers = {
            "User-Agent": "PortfolioBot/1.0 (contact: your-email@example.com)",
            "Accept": "text/csv",
        }

        for attempt in range(max_retries):
            try:
                time.slep(1)
                response = self.session.get(self.url, params={"query": query}, timeout=timeout)
                '''
                response = requests.get(
                    url,
                    params={"query": query},
                    headers=headers,
                    timeout=timeout
                )
                '''
                response.raise_for_status()

                ctype = response.headers.get("Content-Type", "")
                text = response.text

                # Wenn kein CSV zurückkommt, sondern HTML/Fehlertext:
                if "text/csv" not in ctype and not text.lstrip().startswith(("?", '"')):
                    # Debug-Ausgabe
                    print("Non-CSV response:", ctype)
                    print(text[:300])
                    return None

                # CSV robust lesen
                return pd.read_csv(
                    StringIO(text),
                    sep=",",            # Wikidata CSV ist i.d.R. comma
                    engine="python",    # python-engine ist toleranter als C-engine
                )
            

            except requests.exceptions.Timeout:
                print(f"Timeout (attempt {attempt+1}/{max_retries})")
                time.sleep(2 * (attempt + 1))
            except pd.errors.ParserError as e:
                print("CSV ParserError:", e)
                print("Response head:\n", response.text[:500])
                return None
            except Exception as e:
                print("Request error:", e)
                return None

        return None
            
    def WikiDataQID(self, batches):
        batch_list = []
        count = 1
        for batch in batches:
            print("Batch = ", count)
            count += 1
            if batch[0] == "ISIN":
                values_formatted = " ".join(f'"{i}"' for i in batch[2]["ISIN"])
                query = f"""
                SELECT ?reqId ?company WHERE {{
                VALUES ?reqId {{ {values_formatted} }}
                OPTIONAL {{ ?company wdt:P946 ?reqId . }}
                }}
                """
                df = self.InvokeWikiData(query)
                
                if df is None or df.empty:
                    continue

                # company URI -> QID
                if "company" in df.columns:
                    df["Company_QID"] = df["company"].astype(str).str.rsplit("/", n=1).str[-1]
                else:
                    df["Company_QID"] = None

                df.drop(columns=['company'], inplace=True, errors='ignore')
                batch_list.append(df)

            elif batch[0] == "Ticker":
                tickers_formatted = " ".join(f'"{i}"' for i in batch[2]["OriginalTicker"])
                query = f"""
                SELECT ?reqId ?company WHERE {{
                VALUES ?reqId {{ {tickers_formatted} }}
                OPTIONAL {{ ?company p:P414 [ ps:P414 wd:{batch[1]} ; pq:P249 ?reqId ] . }}
                }}
                """
                df = self.InvokeWikiData(query)
                if df is None or df.empty:
                    continue
                if "company" in df.columns:
                    df["Company_QID"] = df["company"].astype(str).str.rsplit("/", n=1).str[-1]
                else:
                    df["Company_QID"] = None
                
                df.drop(columns=['company'], inplace=True, errors='ignore')
                batch_list.append(df)
        print("Fetching Company_QID completed")
        return batch_list
        
    def FetchInfo(self, batch: pd.DataFrame):
        results_per_query = []
        batch = batch[batch["Company_QID"] != 'nan']
        qids = " ".join(f"wd:{q}" for q in batch["Company_QID"])
        for query_name, template in WIKIDATA_QUERIES.items():
            print("QUERYNAME = ", query_name)
            query = template.replace("__VALUES__", qids)  # <-- wichtig
            df = self.InvokeWikiData(query)
            if df is None or df.empty:
                    continue
            if "company" in df.columns:
                df["Company_QID"] = df["company"].astype(str).str.rsplit("/", n=1).str[-1]
            else:
                df["Company_QID"] = None
            df.drop(columns=['company'], inplace=True, errors='ignore')
            results_per_query.append(df)
        return results_per_query
    
    def merge_querys(self, list_of_query: pd.DataFrame):
        list_to_concat = []
        for batch in list_of_query:
            for query in batch:
                oldname = query.columns[0]
                query = query.rename(columns={oldname: "Value"})
                query['Item_Description'] = oldname
                query = query[['Company_QID', 'Item_Description', 'Value']]
                list_to_concat.append(query)
        return pd.concat(list_to_concat, ignore_index=True)  
        
    def run(self):
        list_of_results = []
        batches = self.batching()
        batches = batches
        company_qid = self.WikiDataQID(batches)
        for batch in company_qid:
            result = self.FetchInfo(batch)
            list_of_results.append(result)
        return self.merge_querys(list_of_results)
            
        
test = pd.read_csv("C:\\Diversification\\data\\tickers.csv")
WikiData(test).run()

Batching completed
Batch =  1
Batch =  2
Batch =  3
Batch =  4
Batch =  5
Batch =  6
Batch =  7
Batch =  8
Batch =  9
Batch =  10
Batch =  11
Batch =  12
Batch =  13
Batch =  14
Batch =  15
Batch =  16
Batch =  17
Batch =  18
Batch =  19
Batch =  20
Batch =  21
Batch =  22
Batch =  23
Batch =  24
Batch =  25
Batch =  26
Batch =  27
Batch =  28
Batch =  29
Batch =  30
Batch =  31
Batch =  32
Batch =  33
Batch =  34
Batch =  35
Timeout (attempt 1/3)
Batch =  36
Batch =  37
Batch =  38
Batch =  39
Batch =  40
Batch =  41
Batch =  42
Batch =  43
Batch =  44
Batch =  45
Batch =  46
Batch =  47
Timeout (attempt 1/3)
Timeout (attempt 2/3)
Batch =  48
Batch =  49
Batch =  50
Batch =  51
Batch =  52
Batch =  53
Batch =  54
Batch =  55
Batch =  56
Batch =  57
Batch =  58
Batch =  59
Batch =  60
Batch =  61
Batch =  62
Batch =  63
Batch =  64
Batch =  65
Batch =  66
Batch =  67
Batch =  68
Batch =  69
Batch =  70
Batch =  71
Batch =  72
Batch =  73
Batch =  74
Batch =  75
Batch =  76
Batch =  77


Unnamed: 0,Company_QID,Item_Description,Value
0,Q26898059,isin,FR0000066441
1,Q26898059,founding_year,1900-01-01T00:00:00Z
2,Q3483136,isin,FR0010202606
3,Q130387411,isin,FR0012819381
4,Q131651749,isin,FR001400SVN0
...,...,...,...
5029,Q115167588,isin,IT0005395071
5030,Q115167588,isin,IT0005395089
5031,Q779182,founding_year,1961-01-01T00:00:00Z
5032,Q110269968,founding_year,2021-06-16T00:00:00Z


In [None]:
class WikiDataClient:
    def batching(self, tickers, batch_size: int = 100):
        tickers.sort_values("Exchange_QID", inplace=True)
        batches = []
        for qid, df in tickers.groupby("Exchange_QID", sort=False):
            isin_series = df["ISIN"].astype("string").str.strip()
            has_isin = isin_series.notna() & (isin_series != "")

            isin_df = df.loc[has_isin]
            tkr_df  = df.loc[~has_isin]

            for i in range(0, len(isin_df), batch_size):
                batches.append(("ISIN", qid, isin_df.iloc[i:i + batch_size]))

            for i in range(0, len(tkr_df), batch_size):
                batches.append(("Ticker", qid, tkr_df.iloc[i:i + batch_size]))

        return batches 
    
    def QIDBatching(self, qids, batch_size: int = 100):
        QID_batches = []
        for i in range(0, len(qids), batch_size):
            QID_batches.append(qids[i:i + batch_size])
        return QID_batches

    def InvokeWikiData(self, query):
        url = "https://query.wikidata.org/sparql"
        headers = {
            "User-Agent": "PortfolioBot/1.0 (contact: your-email@example.com)",
            "Accept": "application/sparql-results+json"
        }

        try:
            response = requests.get(
                url,
                params={"query": query, "format": "json"},
                headers=headers,
                timeout=30
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Fehler bei der Anfrage: {e}")
            return None

    def WikiDataQID(self, batches, url='https://query.wikidata.org/sparql'):
        list_of_dicts = []
        for batch in batches:
            if batch[0] == "ISIN":
                values_formatted = ' '.join([f'"{i}"' for i in batch[2]['ISIN']])
                query = f"""SELECT ?reqId ?company WHERE {{ VALUES ?reqId {{ {values_formatted} }} OPTIONAL {{ ?company wdt:P946 ?reqId . }} }}"""
                data = self.InvokeWikiData(query)         
                results = data['results']['bindings']
                for result in results:
                    reqId = result['reqId']['value']
                    company = result.get('company', {}).get('value', None)
                    if company is not None and company != "":
                        company_qid = company.split('/')[-1]
                    list_of_dicts.append({"RequestID": reqId, "Company_QID": company_qid if company is not None else None})

                
            elif batch[0] == 'Ticker':
                tickers_formatted = ' '.join([f'"{i}"' for i in batch[2]['OriginalTicker']])
                query = f"""
                SELECT ?reqId ?company ?companyLabel
                WHERE {{
                VALUES ?reqId {{ {tickers_formatted} }}

                OPTIONAL {{
                    ?company p:P414 [
                    ps:P414 wd:{batch[1]} ;
                    pq:P249 ?reqId
                    ] .
                }}

                SERVICE wikibase:label {{
                    bd:serviceParam wikibase:language "en".
                }}
                }}
                """                    

                data = self.InvokeWikiData(query)         
                results = data['results']['bindings']
                for result in results:
                    reqId = result['reqId']['value']
                    company = result.get('company', {}).get('value', None)
                    if company is not None and company != "":
                        company_qid = company.split('/')[-1]
                    list_of_dicts.append({"RequestID": reqId, "Company_QID": company_qid if company is not None else None})
        
        return list_of_dicts
    
    def WikiDataSubsidiaries(self, company_qids_batches):
        list_of_dicts = []
        for batch in company_qids_batches:
            qids_formatted = ' '.join([f'wd:{qid}' for qid in batch])
            query = f"""
            SELECT ?parentCompany ?subsidiaryCompany
            WHERE {{
                VALUES ?parentCompany {{ {qids_formatted} }}
                ?subsidiaryCompany wdt:P355 ?parentCompany .
            }}
            """
            data = self.InvokeWikiData(query)         
            results = data['results']['bindings']
            for result in results:
                parent_company = result['parentCompany']['value']
                subsidiary_company = result['subsidiaryCompany']['value']
                parent_qid = parent_company.split('/')[-1]
                subsidiary_qid = subsidiary_company.split('/')[-1]
                list_of_dicts.append({"Parent_Company_QID": parent_qid, "Subsidiary_Company_QID": subsidiary_qid})
        return list_of_dicts


    


In [61]:
test = pd.read_csv("C:\\Diversification\\data\\tickers.csv")
batches = WikiDataClient().batching(test)
LoD = pd.DataFrame(WikiDataClient().WikiDataQID(batches))
subsidiaries = WikiDataClient().WikiDataSubsidiaries(WikiDataClient().QIDBatching(LoD['Company_QID']))
subsidiaries

[{'Parent_Company_QID': 'Q105508686', 'Subsidiary_Company_QID': 'Q3117424'},
 {'Parent_Company_QID': 'Q309245', 'Subsidiary_Company_QID': 'Q152051'},
 {'Parent_Company_QID': 'Q309245', 'Subsidiary_Company_QID': 'Q156152'},
 {'Parent_Company_QID': 'Q917050', 'Subsidiary_Company_QID': 'Q1431486'},
 {'Parent_Company_QID': 'Q1142890', 'Subsidiary_Company_QID': 'Q107203004'},
 {'Parent_Company_QID': 'Q11960931', 'Subsidiary_Company_QID': 'Q7703031'},
 {'Parent_Company_QID': 'Q24038997', 'Subsidiary_Company_QID': 'Q1431486'},
 {'Parent_Company_QID': 'Q26311', 'Subsidiary_Company_QID': 'Q118368420'},
 {'Parent_Company_QID': 'Q818846', 'Subsidiary_Company_QID': 'Q17513197'},
 {'Parent_Company_QID': 'Q822614', 'Subsidiary_Company_QID': 'Q4033665'},
 {'Parent_Company_QID': 'Q902769', 'Subsidiary_Company_QID': 'Q1512800'},
 {'Parent_Company_QID': 'Q972966', 'Subsidiary_Company_QID': 'Q683064'},
 {'Parent_Company_QID': 'Q972966', 'Subsidiary_Company_QID': 'Q1471359'},
 {'Parent_Company_QID': 'Q132

In [48]:
test = pd.read_csv("C:\\Diversification\\data\\tickers.csv")
batches = WikiDataClient().batching(test)
print(f"Anzahl der Batches: {len(batches)}")
list_of_dfs = []
for batch in batches:
    print(f"Verarbeite Batch mit Kriterium {batch[0]} und QID {batch[1]} (Anzahl Einträge: {len(batch[2])})")
    data = WikiDataClient().WikiDataQID(batch)
    df = pd.DataFrame(data)
    list_of_dfs.append(df)
final_df = pd.concat(list_of_dfs, ignore_index=True).drop_duplicates(subset="RequestID")
NoneCount = final_df['Company_QID'].isna().sum()
QIDCount = final_df['Company_QID'].notna().sum()
print(f"Anzahl der Einträge mit Company_QID: {QIDCount} Anzahl der Einträge ohne Company_QID: {NoneCount} Anteil der Einträge mit Company_QID: {QIDCount / len(final_df) * 100:.2f}%")

Anzahl der Batches: 75
Verarbeite Batch mit Kriterium ISIN und QID Q107188657 (Anzahl Einträge: 100)


KeyError: 0

In [14]:
def fetch_wikidata_refined(identifiers, exchange_qid, mode="isin"):
    url = 'https://query.wikidata.org/sparql'
    
    if mode == "isin":
        values_formatted = ' '.join([f'"{i}"' for i in identifiers])
        lookup_logic = "?company wdt:P946 ?reqId . BIND(?reqId AS ?isin)"
    else:
        values_formatted = ' '.join([f'"{t}"' for t in identifiers])
        lookup_logic = f"?company p:P414 [ ps:P414 wd:{exchange_qid} ; pq:P249 ?reqId ] . OPTIONAL {{ ?company wdt:P946 ?isin . }}"

    query = f"""
    SELECT ?reqId ?company ?companyLabel ?isin ?foundingYear
      (GROUP_CONCAT(DISTINCT ?subLabel; separator="|") AS ?subsidiaries)
      (GROUP_CONCAT(DISTINCT ?subID; separator="|") AS ?subsidiary_IDs)
      (GROUP_CONCAT(DISTINCT ?indLabel; separator="|") AS ?industries)
      (GROUP_CONCAT(DISTINCT ?indID; separator="|") AS ?industry_IDs)
      (GROUP_CONCAT(DISTINCT ?compLabel; separator="|") AS ?competitors)
      (GROUP_CONCAT(DISTINCT ?compID; separator="|") AS ?competitor_IDs)
      (GROUP_CONCAT(DISTINCT ?invLabel; separator="|") AS ?investments)
      (GROUP_CONCAT(DISTINCT ?invID; separator="|") AS ?investment_IDs)
      (GROUP_CONCAT(DISTINCT ?prodLabel; separator="|") AS ?products)
      (GROUP_CONCAT(DISTINCT ?prodID; separator="|") AS ?product_IDs)
      (GROUP_CONCAT(DISTINCT ?areaLabel; separator="|") AS ?operating_areas)
      (GROUP_CONCAT(DISTINCT ?areaID; separator="|") AS ?operating_area_IDs)
      (GROUP_CONCAT(DISTINCT ?locLabel; separator="|") AS ?locations)
      (GROUP_CONCAT(DISTINCT ?locID; separator="|") AS ?location_IDs)
      (GROUP_CONCAT(DISTINCT ?instLabel; separator="|") AS ?instances)
      (GROUP_CONCAT(DISTINCT ?instID; separator="|") AS ?instance_IDs)
      (GROUP_CONCAT(DISTINCT ?partLabel; separator="|") AS ?parts_of)
      (GROUP_CONCAT(DISTINCT ?partID; separator="|") AS ?part_of_IDs)
      (GROUP_CONCAT(DISTINCT ?ownerLabel; separator="|") AS ?owners)
      (GROUP_CONCAT(DISTINCT ?ownerID; separator="|") AS ?owned_by_IDs)
      
    WHERE {{
      VALUES ?reqId {{ {values_formatted} }}
      {lookup_logic}
      
      OPTIONAL {{ ?company wdt:P571 ?fDate . BIND(YEAR(?fDate) AS ?foundingYear) }}
      
      # Tochtergesellschaften (P355) & Unternehmensbeteiligungen (P1830)
      OPTIONAL {{ 
        ?company (wdt:P355) ?sub . ?sub rdfs:label ?subLabel . FILTER(LANG(?subLabel) = "en") 
        BIND(REPLACE(STR(?sub), ".*Q", "Q") AS ?subID)
      }}
      # Beteiligungen (Investments)
      OPTIONAL {{ 
        ?company (wdt:P1830) ?inv . ?inv rdfs:label ?invLabel . FILTER(LANG(?invLabel) = "en") 
        BIND(REPLACE(STR(?inv), ".*Q", "Q") AS ?invID)
      }}
      # Industrien (P452)
      OPTIONAL {{ 
        ?company wdt:P452 ?ind . ?ind rdfs:label ?indLabel . FILTER(LANG(?indLabel) = "en") 
        BIND(REPLACE(STR(?ind), ".*Q", "Q") AS ?indID)
      }}
      # Produkte / Material Produced (P1056)
      OPTIONAL {{ 
        ?company wdt:P1056 ?prod . ?prod rdfs:label ?prodLabel . FILTER(LANG(?prodLabel) = "en") 
        BIND(REPLACE(STR(?prod), ".*Q", "Q") AS ?prodID)
      }}
      # Operating Area (P2541)
      OPTIONAL {{ 
        ?company wdt:P2541 ?area . ?area rdfs:label ?areaLabel . FILTER(LANG(?areaLabel) = "en") 
        BIND(REPLACE(STR(?area), ".*Q", "Q") AS ?areaID)
      }}
      # Location (P276)
      OPTIONAL {{ 
        ?company wdt:P276 ?loc . ?loc rdfs:label ?locLabel . FILTER(LANG(?locLabel) = "en") 
        BIND(REPLACE(STR(?loc), ".*Q", "Q") AS ?locID)
      }}
      # Instance of (P31)
      OPTIONAL {{ 
        ?company wdt:P31 ?inst . ?inst rdfs:label ?instLabel . FILTER(LANG(?instLabel) = "en") 
        BIND(REPLACE(STR(?inst), ".*Q", "Q") AS ?instID) 
      }}
      # Part of (P361)
      OPTIONAL {{ 
        ?company wdt:P361 ?part . ?part rdfs:label ?partLabel . FILTER(LANG(?partLabel) = "en") 
        BIND(REPLACE(STR(?part), ".*Q", "Q") AS ?partID)
      }}
      # Owned by (P127)
      OPTIONAL {{ 
        ?company wdt:P127 ?owner . ?owner rdfs:label ?ownerLabel . FILTER(LANG(?ownerLabel) = "en") 
        BIND(REPLACE(STR(?owner), ".*Q", "Q") AS ?ownerID)
      }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    GROUP BY ?reqId ?company ?companyLabel ?isin ?foundingYear
    """
    
    headers = {'User-Agent': 'PortfolioBot/1.0', 'Accept': 'application/sparql-results+json'}
    try:
        response = requests.post(url, data={'query': query, 'format': 'json'}, headers=headers)
        response.raise_for_status()
        data = response.json()
        
        parsed_results = []
        for item in data['results']['bindings']:
            parsed_results.append({
                'requested_id': item.get('reqId', {}).get('value'),
                'company_qid': item.get('company', {}).get('value').split('/')[-1],
                'name': item.get('companyLabel', {}).get('value'),
                'isin': item.get('isin', {}).get('value'),
                'founding_year': item.get('foundingYear', {}).get('value'),
                'industries': item.get('industries', {}).get('value'),
                'industry_qids': item.get('industry_IDs', {}).get('value'),
                'subsidiaries': item.get('subsidiaries', {}).get('value'),
                'subsidiary_qids': item.get('subsidiary_IDs', {}).get('value'),
                'investments': item.get('investments', {}).get('value'),
                'investment_qids': item.get('investment_IDs', {}).get('value'),
                'products': item.get('products', {}).get('value'),
                'product_qids': item.get('product_IDs', {}).get('value'),
                'operating_areas': item.get('operating_areas', {}).get('value'),
                'operating_area_qids': item.get('operating_area_IDs', {}).get('value'),
                'locations': item.get('locations', {}).get('value'),
                'location_qids': item.get('location_IDs', {}).get('value'),
                'instances': item.get('instances', {}).get('value'),
                'instance_qids': item.get('instance_IDs', {}).get('value'),
                'parts_of': item.get('parts_of', {}).get('value'),
                'part_of_qids': item.get('part_of_IDs', {}).get('value'),
                'owners': item.get('owners', {}).get('value'),
                'owner_qids': item.get('owned_by_IDs', {}).get('value')
            })
        return parsed_results
    except Exception as e:
        print(f"Fehler im Modus {mode}: {e}")
        return []

all_data = []
BATCH_SIZE = 5 
input_df = pd.read_csv("C:\\Diversification\\data\\tickers.csv")
failed_isins = []
failed_ticker = []
for qid, group in input_df.groupby('Exchange_QID'):
    # Gruppe A: Alles mit ISIN
    with_isin = group[group['ISIN'].notna() & (group['ISIN'] != "")]
    # Gruppe B: Alles ohne ISIN (nur Ticker)
    only_ticker = group[group['ISIN'].isna() | (group['ISIN'] == "")]

    print(f"\nBörse {qid}: {len(with_isin)} mit ISIN, {len(only_ticker)} nur mit Ticker")

    # 1. Abfrage über ISIN (Sehr stabil)
    isins = with_isin['ISIN'].tolist()
    for i in range(0, len(isins), BATCH_SIZE):
        batch = isins[i:i + BATCH_SIZE]
        new_data = fetch_wikidata_refined(batch, qid, mode="isin")
        if len(new_data) != len(batch):
            print(f"  -> Warnung: {len(batch)} ISINs angefragt, aber nur {len(new_data)} Ergebnisse erhalten!") 
            failed_isins.extend(batch)  # Diese ISINs könnten problematisch sein
        all_data.extend(new_data)
        time.sleep(1)

    # 2. Abfrage über Ticker (Fallstrick für US-Werte)
    tickers = only_ticker['OriginalTicker'].tolist()
    for i in range(0, len(tickers), BATCH_SIZE):
        batch = tickers[i:i + BATCH_SIZE]
        # Hier bei US-Börsen eventuell Batchsize noch kleiner (20)
        new_data = fetch_wikidata_refined(batch, qid, mode="ticker")
        if len(new_data) != len(batch):
            print(f"  -> Warnung: {len(batch)} Ticker angefragt, aber nur {len(new_data)} Ergebnisse erhalten!")
            failed_ticker.extend(batch)  # Diese Ticker könnten problem

        all_data.extend(new_data)
        time.sleep(1)

df_final = pd.DataFrame(all_data)



Börse Q107188657: 251 mit ISIN, 0 nur mit Ticker
  -> Warnung: 5 ISINs angefragt, aber nur 1 Ergebnisse erhalten!
  -> Warnung: 5 ISINs angefragt, aber nur 0 Ergebnisse erhalten!
  -> Warnung: 5 ISINs angefragt, aber nur 0 Ergebnisse erhalten!
  -> Warnung: 5 ISINs angefragt, aber nur 1 Ergebnisse erhalten!
  -> Warnung: 5 ISINs angefragt, aber nur 0 Ergebnisse erhalten!
  -> Warnung: 5 ISINs angefragt, aber nur 0 Ergebnisse erhalten!


KeyboardInterrupt: 

In [5]:
df_final

Unnamed: 0,requested_id,company_qid,name,isin,founding_year,industries,industry_qids,subsidiaries,subsidiary_qids,investments,...,operating_areas,operating_area_qids,locations,location_qids,instances,instance_qids,parts_of,part_of_qids,owners,owner_qids
0,FR0013341781,Q79189910,2CRSI Group,FR0013341781,2005,business and other management consulting,Q112166041,,,,...,,,,,organization,Q43229,,,,
1,FR0010478248,Q753684,Atari SA,FR0010478248,1983,publishing of application software,Q112165934,"Atari, Inc.|Atari Interactive",Q527336|Q15018011,"Atari, Inc.",...,,,,,video game developer|organization|video game p...,Q210167|Q43229|Q1137109,,,,
2,FR0000054421,Q98931288,Bourrelier Group,FR0000054421,1975,,,Mavic,Q573356,,...,,,,,organization,Q43229,,,,
3,FR0011648716,Q65158292,Carbios,FR0011648716,2011,research and development in biotechnology,Q112166054,,,,...,,,,,organization,Q43229,,,,
4,FR0000053506,Q1052675,Cegedim,FR0000053506,1969,"data processing, hosting and related activities",Q112165979,,,,...,,,,,organization,Q43229,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2493,IT0003865570,Q21693482,Webuild,IT0003865570,1959,construction industry,Q13405640,Clough Limited|Lane Industries,Q5135801|Q124833043,,...,,,,,public company|business,Q891723|Q4830453,,,Intesa Sanpaolo|CDP Equity|Salini,Q1343118|Q3747273|Q124654854
2494,IT0004810054,Q1503818,Unipol Gruppo,IT0004810054,1961,financial services,Q837171,UNA Hotels & Resorts|Unipol Assicurazioni|Unip...,Q7882190|Q15734697|Q115742974,,...,,,,,business|enterprise,Q4830453|Q6881511,,,,
2495,IT0005573065,Q132830159,Unidata,IT0005573065,1985,telecommunications,Q2401742,,,,...,,,,,telecommunication company|business|enterprise,Q2401749|Q4830453|Q6881511,,,,
2496,IT0005013013,Q93097534,Q93097534,IT0005013013,2013,,,,,,...,,,,,enterprise|business,Q6881511|Q4830453,,,,


In [None]:
df_final.isna().sum()

In [32]:
import re

def analyze_reach(df_results, input_df):
    # 1. Funktion zur Identifizierung des Typs
    # Eine ISIN hat fast immer 12 Zeichen und beginnt mit zwei Buchstaben
    def detect_type(val):
        if pd.isna(val) or val == "": return "Unbekannt"
        return "ISIN" if re.match(r"^[A-Z]{2}[A-Z0-9]{10}$", str(val)) else "Ticker"

    # 2. Den Such-Typ in den Ergebnissen bestimmen
    # Wir schauen, was in der Spalte 'requested_id' (aus der fetch_refined Funktion) steht
    df_results = df_results.copy()
    df_results['Found_Via'] = df_results['requested_id'].apply(detect_type)

    # 3. Zählung der Treffer
    stats = df_results['Found_Via'].value_counts()
    
    total_isin_input = len(input_df[input_df['ISIN'].notna() & (input_df['ISIN'] != "")])
    total_ticker_input = len(input_df) - total_isin_input

    # 4. Übersicht ausgeben
    print("--- WIKIDATA TREFFER-ANALYSE ---")
    print(f"Treffer über ISIN:   {stats.get('ISIN', 0)} / {total_isin_input} " 
          f"({(stats.get('ISIN', 0)/total_isin_input)*100:.1f}% Abdeckung)")
    print(f"Treffer über Ticker: {stats.get('Ticker', 0)} / {total_ticker_input} "
          f"({(stats.get('Ticker', 0)/total_ticker_input)*100:.1f}% Abdeckung)")
    print(f"Gesamt-Treffer:      {len(df_results)} / {len(input_df)}")
    
    return df_results

# Anwendung
df_final_analyzed = analyze_reach(df_final, input_df)
df_final_analyzed

--- WIKIDATA TREFFER-ANALYSE ---
Treffer über ISIN:   592 / 1724 (34.3% Abdeckung)
Treffer über Ticker: 1975 / 4133 (47.8% Abdeckung)
Gesamt-Treffer:      2567 / 5857


Unnamed: 0,requested_id,company_id,name,isin,industries,Found_Via
0,FR0010425595,Q2943995,Cellectis,FR0010425595,research and development in biotechnology,ISIN
1,FR0011648716,Q65158292,Carbios,FR0011648716,research and development in biotechnology,ISIN
2,FR0010478248,Q753684,Atari SA,FR0010478248,publishing of application software,ISIN
3,FR0000053506,Q1052675,Cegedim,FR0000053506,"data processing, hosting and related activities",ISIN
4,FR0010490920,Q1375196,EuropaCorp,FR0010490920,film production for the cinema|film industry,ISIN
5,FR0013341781,Q79189910,2CRSI Group,FR0013341781,business and other management consulting,ISIN
6,FR0000060840,Q98778444,Devernois,FR0000060840,manufacture of outerwear|textile industry,ISIN
7,FR0000060840,Q98778444,Devernois,FR0000060840,manufacture of outerwear|textile industry,ISIN
8,FR0012819381,Q130387411,Guillin Group,FR0012819381,activities of head offices,ISIN
9,FR001400SVN0,Q131651749,Drone Volt,FR001400SVN0,aircraft and space construction,ISIN
