# Ladda ner, läs in data, tvätta och spara ner
Detta skript laddar ner rätt filer. Läser in och tvättar dem. Sparar i lämpligt format. Med lite tur behöver man bara köra skriptet en gång ;)

In [1]:
import pandas as pd
from pathlib import Path
import re
import pandas as pd
from tqdm import tqdm
import requests, zipfile, io
from tqdm.notebook import tqdm
tqdm.pandas()

# Folder layout used throughout the notebook (relative to the working directory)
PATH_TO_RAW = Path('raw')
PATH_TO_OUTDATA = Path('data')
PATH_TO_META = Path('meta')
PATH_TO_MODELS = Path('models')

# Downloaded documents live in one sub-folder per document type
PATH_TO_PROP = PATH_TO_RAW / 'propositioner'
PATH_TO_SOU = PATH_TO_RAW / 'sou'

# Regular expression matched (case-insensitively) against document titles;
# matching documents are filtered out.
FILTER_TITLE = 'vårproposition|budgetproposition|ändringsbudget'  # leave empty '' if no filtering


## Define functions we will be using

Most of the functions take no arguments; the paths and strings they use are set in the first cell.

In [2]:

def downoad_download_meta(PATH_TO_FILES=PATH_TO_META):
    """Download the metadata csv-files for prop and SOU and store them as csv.

    Stand-alone step that is only needed when working with the metadata
    alone, without the document texts.

    PATH_TO_FILES: folder the csv-files are written to.
    """
    # One zipped csv per period, first the government bills and then the SOU reports
    prop_urls = [
        f"https://data.riksdagen.se/dataset/dokument/prop-{year}-{year+3}.csv.zip"
        for year in range(1998, 2011, 4)
    ]
    sou_urls = [
        f'https://data.riksdagen.se/dataset/dokument/sou-{period}.csv.zip'
        for period in ('2020-', '2015-', '2010-2014', '2005-2009', '2000-2004')
    ]

    for source in prop_urls + sou_urls:
        target_name = Path(source).stem  # drops the trailing ".zip", leaving "<name>.csv"
        print('Downloading', target_name)
        table = pd.read_csv(source, header=None)
        table.to_csv(PATH_TO_FILES / target_name)
        print('Done!')


def read_meta_to_df(PATH_TO_META=PATH_TO_META,FILTER_TITLE=''):
    """Concatenate all csv-files in PATH_TO_META into one DataFrame.

    PATH_TO_META: folder that is searched for '*.csv' files.
    FILTER_TITLE: optional regular expression; rows whose 'titel' matches it
        (case-insensitively) are dropped. An empty string disables filtering.

    returns: df indexed by 'hangar_id'
    """
    colnames=['hangar_id', 'dok_id', 'rm', 'beteckning', 'doktyp', 'typ', 'subtyp', 'tempbeteckning', 'organ', 'mottagare', 'nummer', 'datum', 'systemdatum', 'titel', 'subtitel', 'status', 'relaterat_id']

    # Sort the files so the row order does not depend on the file system.
    # Column 0 is skipped (presumably the index column written by DataFrame.to_csv).
    csv_files = sorted(PATH_TO_META.glob('*.csv'))
    df = pd.concat([pd.read_csv(csv_file, usecols=range(1,18)) for csv_file in csv_files])
    df.columns = colnames
    df = df.set_index('hangar_id')
    print('Raw input shape', df.shape)

    # Filter out
    if FILTER_TITLE!='':
        # na=False: a missing title counts as "no match" (the row is kept);
        # without it the mask contains NaN and cannot be negated with ~.
        title_matches = df['titel'].str.contains(FILTER_TITLE, flags=re.I, na=False)
        df = df.loc[~title_matches, :]
        print('Output shape:',df.shape)
    else:
        print('No filters where used.')

    print('Column names:', colnames)
    return df

def _fetch_and_extract_zip(url, target_dir):
    """Download the zip archive at url and unpack all of its members into target_dir."""
    response = requests.get(url)
    # Fail with a clear HTTP error instead of handing an error page to zipfile
    response.raise_for_status()
    print('Extracting txt-files...')
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(target_dir)


def downoad_starter_pack():
    """Downloads the document types prop (government bills) and SOU (Government Official Reports)
    to PATH_TO_PROP and PATH_TO_SOU.

    For every period the metadata csv is stored under its own name (e.g.
    'prop-2014-2017.csv') and the zipped text files are extracted into the
    same folder.

    Raises requests.HTTPError if a text archive cannot be downloaded.
    """
    url_template = "https://data.riksdagen.se/dataset/dokument/{doc_type}-{period}.{kind}.zip"

    # (document type, periods, destination folder)
    sources = [
        ('prop', [f"{year}-{year+3}" for year in range(2014, 2022, 4)], PATH_TO_PROP),
        ('sou', ['2020-', '2015-'], PATH_TO_SOU),
    ]

    # Metadata: pandas reads the zipped csv straight from the url
    for doc_type, periods, folder in sources:
        for period in periods:
            url = url_template.format(doc_type=doc_type, period=period, kind='csv')
            metadata = pd.read_csv(url, header=None)
            metadata.to_csv(folder / Path(url).stem)  # stem drops ".zip" -> "<name>.csv"

    # Text documents
    for doc_type, periods, folder in sources:
        for period in periods:
            url = url_template.format(doc_type=doc_type, period=period, kind='text')
            _fetch_and_extract_zip(url, folder)



def assert_folders():
    """Make sure every project folder exists, creating the ones that are missing."""
    # Top-level folders
    for folder in (PATH_TO_RAW, PATH_TO_OUTDATA, PATH_TO_META, PATH_TO_MODELS):
        Path(folder).mkdir(exist_ok=True)

    # Nested folders: missing parents are created as well
    for folder in (PATH_TO_PROP, PATH_TO_SOU):
        Path(folder).mkdir(parents=True, exist_ok=True)

def _read_metadata(folder, filenames):
    """Read and stack the metadata csv-files in folder, indexed by lower-cased document id."""
    df = pd.concat([pd.read_csv(folder / name) for name in filenames])
    # Column '1' holds the document id; lower-cased it is used as index
    # (the text files are later looked up as '<index>.txt').
    df.index = pd.Index(df['1'].str.lower(), name='filename')
    return df.rename(columns={'2':'rm','4':'doktyp','8':'Departement','9':'Utskott','13':'Titel'})


def read_to_df(FILTER_TITLE=''):
    """Creates the joint df from the prop and SOU metadata csv-files.

    FILTER_TITLE: optional regular expression; propositions whose title matches
        it (case-insensitively) are dropped. The filter is applied to the
        propositions only, not to the SOU documents. An empty string disables
        filtering.

    columns that are kept: 'Titel','rm','doktyp','Departement','Utskott'
    returns: df indexed by 'filename' (the lower-cased document id)
    """

    KEEP_COL=['Titel','rm','doktyp','Departement','Utskott']

    ############### Propositioner #########################
    df_prop = _read_metadata(PATH_TO_PROP, ['prop-2018-2021.csv', 'prop-2014-2017.csv'])
    print('Raw input shape prop', df_prop.shape)

    # Filter out. An empty pattern matches every title, so it has to be
    # skipped explicitly -- otherwise all propositions would be removed.
    if FILTER_TITLE != '':
        # na=False: a missing title counts as "no match" (the row is kept)
        title_matches = df_prop['Titel'].str.contains(FILTER_TITLE, flags=re.I, na=False)
        df_prop = df_prop.loc[~title_matches]
    df_prop = df_prop[KEEP_COL]
    print('Output shape prop:',df_prop.shape)

    ######################### SOU #########################
    df_sou = _read_metadata(PATH_TO_SOU, ['sou-2015-.csv', 'sou-2020-.csv'])
    print('Raw input shape SOU', df_sou.shape)

    # Concatenate prop and sou
    df = pd.concat([df_prop, df_sou[KEEP_COL]])

    # Assert rm is string
    df['rm'] = df['rm'].astype(str)

    print('Concatenated shape', df.shape)

    return df

def add_space(text):
    """Insert an empty line before a (suspected) chapter heading.

    A heading is taken to be a short line (a first word plus at most seven
    more) that follows a period and is itself followed by a new line of text.
    Whitespace between the period and the heading is collapsed into one empty
    line.

    The period and the first character of the following text are kept: the
    period is written back by the replacement and the following character is
    only looked at (lookahead), never consumed.

    NOTE(review): the class [A-Ö] is a code-point range that also covers the
    lower-case letters a-z, so headings are not required to start with a
    capital letter -- confirm whether that is intended.

    Returns the text with the extra line breaks added.
    """
    heading_after_period = r'\.\s*(\n[A-Ö\d][\.]?\w*:?( [\w\-–:]*){0,7}\s*?\n+)(?=[A-Ö])'
    return re.sub(heading_after_period, r'.\n\1', text)

def remove_chapnumber(text):
    """Remove chapter numbers that stand alone on a row, e.g. "12" or "2.1".

    Only the number and the line break in front of it are removed. The line
    break after the number is kept, so the surrounding lines stay separated
    instead of being glued together, and several number-only rows in
    succession are all removed.

    Returns cleaned text"""
    return re.sub(r'\n\d+(?:\.\d+)?(?=\n)', '', text)

# Digits that directly follow a hyphenated prefix (a page number caught inside a hyphenation)
_DIGITS_AFTER_HYPHEN = re.compile(r'([a-ö]{2,}-\s*)(\d+)')

# A possibly hyphenated word: <prefix>-<whitespace/page number><rest>
_HYPHENATED_WORD = re.compile(
    r"""  (?!it)          # A match may not start at the letters it
          (               # Group 1 wraps groups 2 to 4:
              ([a-ö]{2,})     # Group 2: first part of the possibly hyphenated word (at least 2 letters)
              (-\s*\d*\s*)    # Group 3: the part to drop (hyphen, whitespace and any embedded page number)
              ([a-ö]*)        # Group 4: last part of the possibly hyphenated word (may be empty)
          )""",
    re.X | re.U,
)

# Manually approved exceptions where the hyphen is kept
_PREFIXES_KEEPING_HYPHEN = ('bnp', 'tfp')    # prefixes in front of the hyphen
_WORDS_KEEPING_HYPHEN = ('och', 'eller')     # words following the hyphen, e.g. "arbets- och"


def _join_hyphenated(matchobj):
    """Return the two word parts joined, or the match unchanged when the hyphen should stay."""
    first_part, last_part = matchobj.group(2), matchobj.group(4)
    keep_hyphen = (
        not first_part.islower()
        or first_part in _PREFIXES_KEEPING_HYPHEN
        or last_part in _WORDS_KEEPING_HYPHEN
    )
    if keep_hyphen:
        return matchobj.group(0)
    return first_part + last_part


def avstava(text):
    """Handle hyphenation (when appropriate).

    Two passes over the text:
    1. digits that directly follow "<letters>-" (optionally separated by
       whitespace) are removed, which drops page numbers that ended up inside
       a hyphenated word;
    2. "<letters>-<whitespace><letters>" is joined into one word, unless the
       first part is not all lower case, the first part is 'bnp' or 'tfp', or
       the part after the hyphen is 'och' or 'eller'.

    NOTE(review): pass 1 also hits ordinary compounds with a number, e.g.
    "covid-19" becomes "covid" -- confirm that this is acceptable.
    NOTE(review): [a-ö] is a code-point range that covers more than the
    Swedish lower-case letters (for instance the capitals Å, Ä and Ö).

    Returns the text with the hyphenations resolved.
    """
    without_page_numbers = _DIGITS_AFTER_HYPHEN.sub(r'\1', text)
    return _HYPHENATED_WORD.sub(_join_hyphenated, without_page_numbers)

def cleaner(text):
    """Run all cleaning steps on a document text and return the result.

    Order: resolve hyphenation, remove stand-alone chapter numbers, then add
    space before suspected chapter headings.
    """
    dehyphenated = avstava(text)
    without_chapter_numbers = remove_chapnumber(dehyphenated)
    return add_space(without_chapter_numbers)

def import_and_clean_txt(df,clean=True):
    """Add the document texts to df as the column 'text' and drop documents without text.

    For every row the file '<index>.txt' is read from PATH_TO_SOU when
    'doktyp' is 'sou' and from PATH_TO_PROP otherwise. A missing file counts
    as an empty document.

    df: metadata table indexed by file name (without extension); needs the
        columns 'doktyp' and 'Titel'. The 'text' column is added in place.
    clean: if True the text is passed through cleaner() before it is stored.

    returns: the rows of df whose text is not empty
    """
    texts = []
    for ids, row in tqdm(df.iterrows(), total=df.shape[0]):
        folder = PATH_TO_SOU if row['doktyp']=='sou' else PATH_TO_PROP
        try:
            raw_text = (folder / (ids + '.txt')).read_text(encoding='utf8')
        except FileNotFoundError:
            texts.append('') # File not found
            continue
        texts.append(cleaner(raw_text) if clean else raw_text)

    # Assigning the whole column at once also works for an empty df, where a
    # row-by-row assignment would never create the column.
    df['text'] = texts

    is_empty = df['text'] == ''
    print('Empty documents:')
    print(df.loc[is_empty,'Titel'])
    print('Removing empty documents...')

    return df.loc[~is_empty]

def get_chunks(text_input,step=512,overlap=64):
    """Split a text into overlapping windows of whitespace-separated words.

    text_input: the document text.
    step: maximum number of words per chunk.
    overlap: number of words shared by two consecutive chunks. Must be smaller
        than step; otherwise a message is printed and the defaults are used.

    returns: list of chunks (strings with the words joined by single spaces).
        A text of at most step words gives a single chunk.
    """
    if overlap>=step:
        print("Error! overlap should be smaller than step! Otherwise you'll get into an infinite loop. default values are used")
        step, overlap = 512, 64

    words = text_input.split()
    if len(words) <= step:
        return [" ".join(words)]

    # Every window starts (step - overlap) words after the previous one. The
    # last window is the first one that reaches the end of the text, so window
    # starts stop before len(words) - overlap.
    stride = step - overlap
    window_starts = range(0, len(words) - overlap, stride)
    return [" ".join(words[begin:begin + step]) for begin in window_starts]

def get_deldokument(df):
    """Split every document into chunks and return one row per chunk.

    df: table indexed by document id (strings) with the columns 'Titel',
        'rm', 'doktyp', 'Departement', 'Utskott' and 'text'. It is not modified.

    returns: df with the metadata columns and the column 'dokdelar' (one chunk
        per row, the full text is left out). The index is
        '<document id>_<chunk number>', which makes it unique.
    """
    original_df_length = df.shape[0]

    # Chunk all documents. The chunks are kept in a separate Series so that
    # the caller's frame is left untouched.
    chunks = df['text'].progress_apply(get_chunks)
    parts = df[['Titel','rm','doktyp','Departement','Utskott']].assign(dokdelar=chunks)
    parts = parts.explode('dokdelar')

    # Construct unique index: number the chunks within each document
    chunk_number = parts.groupby(parts.index).cumcount().astype(str)
    parts = parts.set_index(parts.index + '_' + chunk_number)

    dokdelar_df_length = parts.shape[0]
    # Guard against division by zero when there are no documents
    average_parts = dokdelar_df_length/original_df_length if original_df_length else 0.0
    print('The original list of docuents was:', original_df_length)
    print('The chunked list of documents are:', dokdelar_df_length)
    print(f'Each document was split into {average_parts:.1f} subparts on average.')
    return parts


## 1. Kör detta om det är första gången du startar projektet
__assert_folders()__ sätter upp rätt mappstruktur

__downoad_starter_pack()__ laddar ner zip-filer med csv- och textfiler och lägger dem i rätt mappar

In [4]:
# First-time setup: create the folder structure, then fetch the raw data into it
assert_folders() # Creates folders if needed
downoad_starter_pack() # Downloads the metadata csv-files and extracts the text archives

Extracting txt-files...
Extracting txt-files...
Extracting txt-files...
Extracting txt-files...


## 2. Även denna cell behöver du bara köra en gång
__read_to_df()__ läser in metadata från csv-filerna

__import_and_clean_txt()__ lägger till alla textdokument i datatabellen *df* och gör vissa rensningar av texterna

Sista cellen sparar filen som en pickle.

In [5]:
# Read the metadata; propositions whose title matches FILTER_TITLE are filtered out
df=read_to_df(FILTER_TITLE=FILTER_TITLE)
# Add the cleaned document texts; documents without any text are removed
df=import_and_clean_txt(df)
print('df.shape:',df.shape)

Raw input shape prop (1875, 18)
Output shape prop: (1688, 5)
Raw input shape SOU (784, 18)
Concatenated shape (2472, 5)


  0%|          | 0/2472 [00:00<?, ?it/s]

Empty documents:
filename
h70370    Integritetsskydd vid signalspaning i försvarsu...
Name: Titel, dtype: object
Removing empty documents...
df.shape: (2471, 6)


In [6]:
# Save the dataframe with the full texts before the documents are chunked up (just in case...)
df.to_pickle(PATH_TO_OUTDATA / 'data.pkl')

In [7]:
# Split the documents into overlapping chunks of words (rolling window)
# Uncomment the next line to start from the saved pickle instead of the df in memory:
#df=pd.read_pickle(PATH_TO_OUTDATA / 'data.pkl')
df=get_deldokument(df)

  0%|          | 0/2471 [00:00<?, ?it/s]

Number of documents to split: 25943
Number of documents to split: 154784
Number of documents to split: 30267
Number of documents to split: 77744
Number of documents to split: 61828
Number of documents to split: 29548
Number of documents to split: 64548
Number of documents to split: 53183
Number of documents to split: 42360
Number of documents to split: 26639
Number of documents to split: 25258
Number of documents to split: 23699
Number of documents to split: 18497
Number of documents to split: 13484
Number of documents to split: 2902
Number of documents to split: 29283
Number of documents to split: 70202
Number of documents to split: 27096
Number of documents to split: 26759
Number of documents to split: 22219
Number of documents to split: 18657
Number of documents to split: 10328
Number of documents to split: 27063
Number of documents to split: 26826
Number of documents to split: 17698
Number of documents to split: 23276
Number of documents to split: 35609
Number of documents to split

In [8]:
# Save the chunked documents as both pickle and parquet (a lightweight format)
df.to_pickle(PATH_TO_OUTDATA / 'data_dokdelar.pkl')
df.to_parquet(PATH_TO_OUTDATA / 'data_dokdelar.pqt')

# EXTRA
## Downloads a large set of meta data for SOU and prop
This is in case you ONLY want to analyze meta data WITHOUT the actual document texts.
This is currently not used...

In [48]:
# Uncomment to download the metadata csv-files into PATH_TO_META first:
#downoad_download_meta()

# Combine all metadata files (no title filter), save the table and count the documents per subtype
df_meta=read_meta_to_df()
df_meta.to_pickle(PATH_TO_OUTDATA / 'meta_prop_sou.pkl')
df_meta['subtyp'].value_counts(dropna=False)

Raw input shape (6940, 16)
No filters where used.
Column names: ['hangar_id', 'dok_id', 'rm', 'beteckning', 'doktyp', 'typ', 'subtyp', 'tempbeteckning', 'organ', 'mottagare', 'nummer', 'datum', 'systemdatum', 'titel', 'subtitel', 'status', 'relaterat_id']


sou     3257
prop    3019
skr      624
NaN       40
Name: subtyp, dtype: int64

In [49]:
# Preview the first rows of the metadata table
df_meta.head()

Unnamed: 0_level_0,dok_id,rm,beteckning,doktyp,typ,subtyp,tempbeteckning,organ,mottagare,nummer,datum,systemdatum,titel,subtitel,status,relaterat_id
hangar_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2222687,GM031,1998/99,1,prop,prop,prop,,Finansdepartementet,,1,1999-01-01 00:00:00,2006-10-23 14:13:40,Budgetpropositionen för 1999,"Förslag till statsbudget, finansplan m.m. (1. ...",,
2291126,GM0310,1998/99,10,prop,prop,prop,,Justitiedepartementet,,10,1999-01-01 00:00:00,2006-10-23 14:13:40,Ändringar i rättshjälpslagen,,,
2294100,GM03100,1998/99,100,prop,prop,prop,,Finansdepartementet,,100,1999-01-01 00:00:00,2006-10-23 14:13:40,1999 års ekonomiska vårproposition,,,
2457451,GM03100D1,1998/99,100D1,prop,prop,prop,100.1,Finansdepartementet,,100,1999-01-01 00:00:00,2006-10-23 14:13:40,1999 års ekonomiska vårproposition,Svensk ekonomi,,
2457452,GM03100D2,1998/99,100D2,prop,prop,prop,100.2,Finansdepartementet,,100,1999-01-01 00:00:00,2006-10-23 14:13:40,1999 års ekonomiska vårproposition,Avstämning av målet om en halverad öppen arbet...,,
