In [None]:
import re

import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup

In [652]:
def remove_labels(doc, labels:list):
    """Rebuild the text of ``doc`` without the tokens tagged with ``labels``.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``text``, ``ent_type_`` and ``whitespace_``.
    labels : list
        Entity labels whose tokens are left out of the result.

    Returns
    -------
    str
        The kept token texts, each followed by one space when the token
        carries trailing whitespace.
    """
    kept_pieces = (
        tok.text + (' ' if tok.whitespace_ else '')
        for tok in doc
        if tok.ent_type_ not in labels
    )
    return ''.join(kept_pieces)

def retrieve_tokens(row, pos_to_remove, labels_to_remove):
    """Collect the lower-cased lemmas of the informative tokens in ``row``.

    A token is skipped when its part-of-speech tag is in ``pos_to_remove``,
    when it is a stop word, when it is not purely alphabetic, or when its
    entity label is in ``labels_to_remove``.

    Parameters
    ----------
    row : iterable of tokens
        Each token must expose ``lemma_``, ``pos_``, ``is_stop``,
        ``is_alpha`` and ``ent_type_``.
    pos_to_remove : collection of str
        Part-of-speech tags to discard.
    labels_to_remove : collection of str
        Entity labels to discard.

    Returns
    -------
    list of str
        Lower-cased lemmas in their original order (duplicates kept).
    """
    lemmas = []
    for tok in row:
        if tok.pos_ in pos_to_remove or tok.is_stop:
            continue
        if not tok.is_alpha or tok.ent_type_ in labels_to_remove:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas

In [661]:
def regex_input(row):
    """Strip markup from ``row`` and keep only its plain word tokens.

    The visible text is first extracted from any HTML with BeautifulSoup
    (``lxml`` parser). From that text only whole runs of word characters are
    kept, and only when they are not directly preceded by a backslash or a
    dot and not directly followed by ``.<word>``. In effect punctuation is
    dropped, as are dotted tokens such as domain names ("www.example.com")
    and decimal numbers ("3.5"), and words glued to a literal backslash.

    Requires the ``re`` module to be imported at the top of the notebook.

    Parameters
    ----------
    row : str
        Raw cell content, possibly containing HTML.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    # \b ... \b      : whole words only
    # (?<![\\\.])    : not right after a backslash or a dot
    # (?!\.\w+)      : not right before ".<word>"
    word_pattern = r'\b(?<![\\\.])\w+(?!\.\w+)\b'

    plain_text = BeautifulSoup(row, 'lxml').get_text()
    words = re.findall(word_pattern, plain_text)

    return " ".join(words)

def process_input(df):
    """Run the Swedish spaCy pipeline over every article and attach the results.

    For each row the ``Title`` and ``Text`` cells are parsed with
    ``sv_core_news_sm``. Entities, their labels, the filtered lemmas and the
    set of distinct lemmas are collected per row, and ``Text`` is replaced by
    a copy with the tokens of the removed entity labels stripped out.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``Title`` and ``Text``. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Text`` rewritten and the columns
        ``Entities``, ``Labels``, ``Tokens`` (lists) and ``Unique_Tokens``
        (sets) added.
    """
    nlp = spacy.load("sv_core_news_sm")

    # Alternative considered: ['TME', 'MSR']
    labels_to_remove = ['TME']
    pos_to_remove = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']

    token_list = []
    unique_token_list = []
    entity_list = []
    label_list = []
    text_list = []

    for _, row in df.iterrows():
        doc_text = nlp(row['Text'])
        doc_title = nlp(row['Title'])

        doc_ents = doc_text.ents + doc_title.ents

        title_tokens = retrieve_tokens(doc_title, pos_to_remove, labels_to_remove)
        text_tokens = retrieve_tokens(doc_text, pos_to_remove, labels_to_remove)
        doc_tokens = title_tokens + text_tokens

        # De-duplicate entities by their printed form; the dict keeps the
        # position of the first occurrence.
        unique_ents = list({repr(ent): ent for ent in doc_ents}.values())

        # Drop entities whose label is in labels_to_remove; the same labels
        # are stripped from the stored text below.
        unique_ents = [ent for ent in unique_ents if ent.label_ not in labels_to_remove]

        entity_list.append([str(ent) for ent in unique_ents])
        label_list.append([str(ent.label_) for ent in unique_ents])
        token_list.append(doc_tokens)
        unique_token_list.append(set(doc_tokens))

        # Collected here and written back once after the loop, so the frame
        # is not mutated while it is being iterated.
        text_list.append(remove_labels(doc_text, labels=labels_to_remove))

    # Build every column on df.index: the lists are in row order, so this
    # stays correct even when the frame does not carry a default RangeIndex.
    df['Text'] = pd.Series(text_list, index=df.index, dtype=object)
    df['Entities'] = pd.Series(entity_list, index=df.index, dtype=object)
    df['Labels'] = pd.Series(label_list, index=df.index, dtype=object)
    df['Tokens'] = pd.Series(token_list, index=df.index, dtype=object)
    df['Unique_Tokens'] = pd.Series(unique_token_list, index=df.index, dtype=object)

    return df

In [662]:
from os.path import exists

# Preprocessing is slow, so its result is cached on disk: build it from the
# raw articles on the first run, reload it on every later run.
if not exists('preprocessed_articles.csv'):
    df_articles = pd.read_csv('sävsjö_articles.csv')
    df_articles = df_articles.dropna().reset_index(drop=True)
    df_processed = df_articles.copy()

    df_processed['Title'] = df_processed['Title'].apply(regex_input)
    df_processed['Text'] = df_processed['Text'].apply(regex_input)

    df_processed = process_input(df_processed)

    df_processed.to_csv('preprocessed_articles.csv')
else:
    # The cache is written with the row index as its first column; read it
    # back as the index so no extra "Unnamed: 0" column appears and both
    # branches yield the same columns.
    # NOTE: columns holding lists/sets come back from CSV as plain strings.
    df_processed = pd.read_csv('preprocessed_articles.csv', index_col=0)



In [666]:
# Preview the first rows of the processed frame.
df_processed.head()

Unnamed: 0,ID,Title,Text,Entities,Labels,Tokens,Unique_Tokens
0,2,Aktiespararna säger ja till Gunvorbudet,Aktiespararna rekommenderar sina medlemmar att...,"[Gunvor Group, Rörvik]","[PRS, ORG]","[aktiespararna, gunvorbudet, aktiespararna, re...","{group, medlem, gunvor, aktiespararna, anta, r..."
1,7,Man fast i en timme i grop,En man i 60 årsåldern föll ner i en grop som v...,[Sävsjö],[LOC],"[timme, grop, årsålder, föll, grop, grävd, ege...","{sävsjö, hål, djup, timme, grop, egen, föll, l..."
2,8,Mannen fastnade i en grop i flera timmar,En man i 60 årsåldern trillade ner i ett grävt...,[David],[PRS],"[mannen, fastna, grop, timme, årsålder, trilla...","{hål, grävt, mannen, timme, grop, trilla, fast..."
3,56,En motorcykel,Här är den längre versionen,[],[],"[motorcykel, vara, version]","{vara, motorcykel, version}"
4,66,Stulna braskaminer för 50 000 ska värma,Det börjar bli vinter och kallt Då är det skön...,"[Smålandsvillan, Vrigstad]","[ORG, LOC]","[stulna, braskamin, värma, börja, vinter, kall...","{kall, helg, braskamin, vara, försvann, vinter..."


In [664]:
# Show the cleaned text of the row labelled 5 (single .loc lookup instead of chained indexing).
df_processed.loc[5, 'Text']

'Sävsjö kommun ska få en ny skolchef Han heter Stefan Claesson och jobbar just nu i Jönköping När han ska börja basa över läre och elever i Sävsjö är inte klart det är en förhandlingsfråga mellan nye skolchefen och hans hittillsvarande arbetsgivare'

# remove empty rows and convert to lists.

### Not used so far

In [None]:
""" Check for a value in a dataframe column"""
def check_value(df, column, value):
    """Return the rows of ``df`` whose ``column`` cell contains ``value``.

    Cells are converted to their string form before searching, so columns
    that hold Python lists or sets are searchable as well as columns that
    hold plain strings. Missing cells never match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    column : str
        Name of the column to look in.
    value : str
        Pattern to look for; interpreted as a regular expression, which is
        the default of ``Series.str.contains``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``.
    """
    cells = df[column]
    # Stringifying can turn a missing cell into text such as "nan", so
    # missing cells are masked out explicitly as well.
    matches = cells.astype(str).str.contains(value, na=False)
    return df[matches & cells.notna()]
# Example lookup: rows whose 'Entities' column matches 'Smålandsvillan'.
check_value(df_processed, 'Entities', 'Smålandsvillan')

In [None]:
"""
df_test = df_articles[["Labels","Entities"]]

for ind, row in df_test.iterrows():
    if row["Labels"] == "[]":
        df_test = df_test.drop(index=ind)

df_test[["Labels","Entities"]].apply(lambda x : str(x).split(','))
"""