In [28]:
import pandas as pd
from bs4 import BeautifulSoup  
import numpy as np
from datetime import datetime, timedelta
import requests
import glob
import json 

In [29]:
def parsetable(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'wilko'})
    rows = table.find_all('tr')
    columnnames=[]
    array=[]
    for row in rows:
        columns = row.find_all('th')
        if columns:
            columnnames=[column.text.strip() for column in columns]
        
        cells = row.find_all('td')
        if cells:
            array.append([cell.text.strip() for cell in cells])
            
    df_raw=pd.DataFrame(columns=columnnames,data=array)
    return df_raw

In [30]:
# have to define the types for the variables/columns
def table_cleaner(df_raw):
    #we throw away entries with unclear source
    df_raw=df_raw.dropna(subset=["Institut"])
    #and we have to split the column "Sonstige" because sometimes it contains
    #entries for multiple parties
    splitted_sonstige=df_raw["Sonstige"].astype(str).str.split("%")
    
    #and we want to sort it into a dict later so we will retrieve    
    #the keys for these multiple entries first
    array_of_keys=["Sonstige_no_differentiation"]
    for row in range(len(splitted_sonstige)):
        for char in splitted_sonstige.iloc[row]:
            party_sonstige_list_results=char.split()
            if (len(party_sonstige_list_results)>1):
                array_of_keys.append(char.split()[0])
           
    #now we create a default dictionary that will be used in each row
    default_dict=dict((k, np.nan) for k in array_of_keys)
    #and will be pushed into an array unless
    array_sonstige=[]

    for row in range(len(splitted_sonstige)):
        row_dict=default_dict.copy()
        for char in splitted_sonstige.iloc[row]:
            party_sonstige_list_results=char.split()
            
            #we have entries in a cell that differentiate the sonstige 
            if (len(party_sonstige_list_results)>1):
                row_dict[party_sonstige_list_results[0]]=party_sonstige_list_results[1]
            #or we have at least one number in the cell, this is then added as "Sonstige_no_differentiation"
            if (len(party_sonstige_list_results)==1):
                row_dict["Sonstige_no_differentiation"]=party_sonstige_list_results[0]
        #we keep on appending these dictionaries into the array        
        array_sonstige.append(row_dict)
        
    #and then create a dataframe
    sonstige_df=pd.DataFrame(array_sonstige)

    #that is then added to the original dataframe
    df_raw_sonstige=pd.concat([df_raw,sonstige_df],axis=1)

    
    df_polls=df_raw_sonstige[~df_raw_sonstige.Institut.astype(str).str.contains("Landtag")]
    df_polls[df_polls.columns[4:]]=df_polls[df_polls.columns[4:]].replace({'%': ''}, regex=True)
    df_polls[df_polls.columns[4:]]=df_polls[df_polls.columns[4:]].replace({',': '.'}, regex=True)
    
    df_polls=df_polls.drop(["Sonstige"],axis=1)
    SonstigeColumns=df_polls.columns[df_polls.columns.astype(str).str.contains("Son")]
    df_polls[SonstigeColumns]=df_polls[SonstigeColumns].apply(pd.to_numeric, errors='coerce')
    sonstige_summed=df_polls[SonstigeColumns].sum(axis=1)
    df_polls_cleaned=df_polls.drop(SonstigeColumns,axis=1)
    df_polls_cleaned["Sonstige"]=sonstige_summed

    df_polls_cleaned=df_polls_cleaned.dropna(subset=["Befragte"]).reset_index(drop=True)
    befragte_liste=[]
    for row in df_polls_cleaned["Befragte"].astype(str).str.split("•"):
        befragte=(row[1].strip()[:5])
        if "." in befragte:
            befragte=befragte.replace(".","")
        else:
            befragte=befragte[:4]
        befragte_number=int(befragte)
        befragte_liste.append(befragte_number)
    befragte_serie=pd.Series(befragte_liste)
    df_polls_cleaned["Befragte_neu"]=befragte_serie
    

    return df_polls_cleaned

In [31]:
def dirichet_transform(df_polls_cleaned):
    df_polls_cleaned[df_polls_cleaned.columns[4:]] = df_polls_cleaned[df_polls_cleaned.columns[4:]].apply(pd.to_numeric, errors='coerce')
    df_polls_cleaned[df_polls_cleaned.columns[4:]]=df_polls_cleaned[df_polls_cleaned.columns[4:]].astype(float)
    df_polls_cleaned["Datum"]=pd.to_datetime(df_polls_cleaned["Datum"], format="%d.%m.%Y")
    timeperiod_of_interest=last_60_days=df_polls_cleaned.iloc[0]["Datum"]- timedelta(days=60)
    results_of_interest=df_polls_cleaned[df_polls_cleaned["Datum"]>timeperiod_of_interest]
    results_of_interest=results_of_interest.dropna(axis=1, how='all')
    
    results_of_interest_no_zeros=results_of_interest.loc[:, ~results_of_interest.isin([0,np.nan]).all(0)]
    only_party_results=(results_of_interest_no_zeros[results_of_interest_no_zeros.columns[4:-1]])/100
    only_party_results_respondents=(only_party_results.T*(results_of_interest_no_zeros["Befragte_neu"].values)).T
    mean_results=only_party_results_respondents.mean()*len(only_party_results_respondents)
    column_names=mean_results.index.tolist()
    values_array=mean_results.values.tolist()
    s = np.random.dirichlet((values_array), size = 100) 
    results_dirichlet=pd.DataFrame(s*100,columns=column_names)
    results_dirichlet=results_dirichlet.round(1)
    results_dirichlet["Sonstige"]=results_dirichlet["Sonstige"]+(100-results_dirichlet.sum(axis=1))
    return results_dirichlet

In [32]:
def scenario_calc(result):
    scenario_json={}
    max_column=result.idxmax(axis=1)
    winner_series=pd.Series(max_column)
    AfDWinner=winner_series.value_counts()["AfD"]-1
    scenario_json["AfDGewinnt"]=AfDWinner
    fuenfprozent_huerde=pd.DataFrame(np.where(result<5,0,result),columns=result.columns)
    fuenfprozent_huerde=fuenfprozent_huerde.div(fuenfprozent_huerde.sum(axis=1), axis=0)*100
    keineMehrheitKoalition=len(fuenfprozent_huerde[fuenfprozent_huerde[current_coalitions[bundesland]].sum(axis=1)<50])
    if (keineMehrheitKoalition==100):
        keineMehrheitKoalition=keineMehrheitKoalition-1
    if (keineMehrheitKoalition==0):
        keineMehrheitKoalition=keineMehrheitKoalition+1
    scenario_json["KoalitionEndet"]=keineMehrheitKoalition
    keineKlassischeMehrheit=len(fuenfprozent_huerde[(fuenfprozent_huerde["AfD"]+fuenfprozent_huerde["BSW"])>50])
    if (keineKlassischeMehrheit==100):
        keineKlassischeMehrheit=keineKlassischeMehrheit-1
    if (keineKlassischeMehrheit==0):
        keineKlassischeMehrheit=keineKlassischeMehrheit+1
    scenario_json["KeineKlassischeMehrheit"]=keineKlassischeMehrheit
    return scenario_json

In [33]:
current_coalitions={"sachsen":["CDU", "GRÜNE", "SPD"], "thueringen":["LINKE", "GRÜNE", "SPD"],"brandenburg":["CDU", "GRÜNE", "SPD"]}
scenarios_bundeslaender={}
results_bundeslaender={}
files_json=glob.glob("*.json")

for bundesland in current_coalitions.keys():
    bl_table=parsetable("https://www.wahlrecht.de/umfragen/landtage/"+bundesland+".htm")
    df_polls_cleaned=table_cleaner(bl_table)
    institute_name=df_polls_cleaned["Institut"].iloc[0]
    latest_date=df_polls_cleaned["Datum"].astype(str).str[:10].iloc[0]
    filename=bundesland+"_"+institute_name+"-"+latest_date
    if (filename+".json") in files_json:
        result=pd.read_json(filename+".json")
    else:
        result=dirichet_transform(df_polls_cleaned)
        result.to_json(filename+".json")
        result.to_csv(filename+".csv",index=False)
    results_bundeslaender[bundesland]=result
    scenario=scenario_calc(result)
    scenarios_bundeslaender[bundesland]=scenario
with open("outcome.json", "w") as outfile: 
    json.dump(scenarios_bundeslaender, outfile,default=str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_polls[df_polls.columns[4:]]=df_polls[df_polls.columns[4:]].replace({'%': ''}, regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_polls[df_polls.columns[4:]]=df_polls[df_polls.columns[4:]].replace({',': '.'}, regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_polls[df_polls.

In [34]:
scenarios_bundeslaender

{'sachsen': {'AfDGewinnt': np.int64(50),
  'KoalitionEndet': 99,
  'KeineKlassischeMehrheit': 1},
 'thueringen': {'AfDGewinnt': np.int64(99),
  'KoalitionEndet': 99,
  'KeineKlassischeMehrheit': 99},
 'brandenburg': {'AfDGewinnt': np.int64(99),
  'KoalitionEndet': 22,
  'KeineKlassischeMehrheit': 1}}

In [35]:
for scenario in scenarios_bundeslaender["sachsen"]:
    scenario_dict={}
    print(scenario)

    for bundesland in scenarios_bundeslaender.keys():
        prob_scenario=(scenarios_bundeslaender[bundesland][scenario])
        bundesland_name=bundesland.capitalize().replace("ue","ü")
        scenario_dict[bundesland_name]=[prob_scenario]
    scenario_df=pd.DataFrame(scenario_dict).T
    print(scenario_df)
    print("")

    scenario_df.to_csv(scenario+".csv")
    

AfDGewinnt
              0
Sachsen      50
Thüringen    99
Brandenburg  99

KoalitionEndet
              0
Sachsen      99
Thüringen    99
Brandenburg  22

KeineKlassischeMehrheit
              0
Sachsen       1
Thüringen    99
Brandenburg   1



In [36]:
result

Unnamed: 0,CDU,SPD,GRÜNE,FDP,LINKE,AfD,BVB/FW,BSW,Sonstige
0,18.7,18.9,5.8,2.7,4.7,23.3,4.3,17.0,4.6
1,19.0,20.3,6.4,2.8,4.6,23.8,3.8,15.9,3.4
2,18.2,19.8,6.5,2.4,4.9,23.1,4.0,16.6,4.5
3,18.4,19.5,6.3,2.6,5.1,23.4,3.6,16.6,4.5
4,18.9,19.3,5.5,2.4,5.0,23.7,3.6,17.2,4.4
...,...,...,...,...,...,...,...,...,...
95,18.7,18.5,6.2,3.1,4.7,23.8,3.4,16.6,5.0
96,18.3,18.4,6.9,3.3,4.6,23.3,3.2,17.6,4.4
97,18.2,20.4,6.9,2.7,4.7,23.1,3.6,16.7,3.7
98,18.8,17.9,6.6,2.7,5.0,24.4,3.6,17.0,4.0


In [37]:
results_bundeslaender.keys()

dict_keys(['sachsen', 'thueringen', 'brandenburg'])

In [38]:
for bundesland in results_bundeslaender.keys():
    df=pd.DataFrame({"max":results_bundeslaender[bundesland].max(),"median":results_bundeslaender[bundesland].median(),"min":results_bundeslaender[bundesland].min()})
    missing=100-df["median"].sum()
    df["median"].loc["Sonstige"]=df["median"].loc["Sonstige"]+missing
    df.to_csv(bundesland+"_max_min.csv")

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["median"].loc["Sonstige"]=df["median"].loc["Sonstige"]+missing
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update