In [35]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import spacy
import re

In [36]:
# Disabled helper: the definition below lives inside a bare string literal, so
# `remove_labels` is never defined or executed. As written, it would rebuild a
# text from the tokens of `doc` whose `ent_type_` is not in `labels_to_remove`,
# appending a single space after every kept token that has trailing whitespace.
""" NOT USED AT THIS TIME
def remove_labels(doc, labels_to_remove:list):
        text_no_labels = ''
        for token in doc:
            if token.ent_type_ not in labels_to_remove:
                text_no_labels += token.text
                if token.whitespace_:
                    text_no_labels += ' '
        return text_no_labels
"""
def retrieve_tokens(row, pos_to_remove, labels_to_remove):
    """Return the lower-cased lemmas of the tokens in ``row`` that pass every filter.

    A token is kept only when all of the following hold:
      * its ``pos_`` is not listed in ``pos_to_remove``
      * it is not a stop word (``is_stop`` is falsy)
      * it is alphabetic (``is_alpha`` is truthy)
      * its ``ent_type_`` is not listed in ``labels_to_remove``

    Args:
        row: Iterable of token-like objects exposing ``lemma_``, ``pos_``,
            ``is_stop``, ``is_alpha`` and ``ent_type_``.
        pos_to_remove: Container of part-of-speech tags to discard.
        labels_to_remove: Container of entity labels to discard.

    Returns:
        list[str]: Lemmas of the surviving tokens, in their original order
        (duplicates are preserved).
    """
    lemmas = []
    for tok in row:
        # Guard clauses are checked in the same order as the filter conditions.
        if tok.pos_ in pos_to_remove:
            continue
        if tok.is_stop:
            continue
        if not tok.is_alpha:
            continue
        if tok.ent_type_ in labels_to_remove:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas

In [37]:
def regex_input(row):
    """Strip HTML markup from ``row`` and keep only the words accepted by the filter pattern.

    The markup is parsed with BeautifulSoup (``lxml`` parser) and reduced to its
    text content. From that text, every run of word characters is kept unless it
    is directly preceded by a backslash or a dot, or directly followed by
    ``.<word>``. This drops escaped sequences and dotted names such as web
    addresses, and it discards all punctuation as a side effect.

    Args:
        row: Raw (possibly HTML) text of one cell.

    Returns:
        str: The accepted words joined by single spaces.
    """
    # \b ... \b       : whole words only
    # (?<![\\\.])     : not right after a backslash or a dot
    # (?!\.\w+)       : not right before ".<word>"
    word_pattern = r'\b(?<![\\\.])\w+(?!\.\w+)\b'

    plain_text = BeautifulSoup(row, 'lxml').get_text()
    return " ".join(re.findall(word_pattern, plain_text))

def process_input(df):
    """Annotate every article in ``df`` with its entities, entity labels and tokens.

    Each row's ``Text`` and ``Title`` are run through the ``sv_core_news_sm``
    spaCy pipeline. Four columns are then added to ``df`` (the frame is
    modified in place and also returned):

      * ``Entities``      - list of entity texts found in text + title,
                            de-duplicated, without entities labelled ``TME``
      * ``Labels``        - list of the labels of those entities (same order)
      * ``Tokens``        - list of filtered lemmas, title first, then text
                            (see ``retrieve_tokens``)
      * ``Unique_Tokens`` - ``set`` of the lemmas in ``Tokens``

    Args:
        df: DataFrame with string columns ``Text`` and ``Title``. Any index is
            accepted; the new columns are aligned to ``df.index``.

    Returns:
        pandas.DataFrame: The same ``df`` object with the four columns added.
    """
    nlp = spacy.load("sv_core_news_sm")

    #labels_to_remove = ['TME', 'MSR']
    labels_to_remove = ['TME']
    pos_to_remove = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']

    token_list = []
    unique_token_list = []
    entity_list = []
    label_list = []

    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for text, title in zip(df['Text'], df['Title']):
        doc_text = nlp(text)
        doc_title = nlp(title)

        doc_ents = doc_text.ents + doc_title.ents

        title_tokens = retrieve_tokens(doc_title, pos_to_remove, labels_to_remove)
        text_tokens = retrieve_tokens(doc_text, pos_to_remove, labels_to_remove)

        doc_tokens = title_tokens + text_tokens
        unique_doc_tokens = set(doc_tokens)

        # Remove duplicate entities: keyed by repr (presumably the entity text),
        # keeping first-seen order and the last-seen span for each key.
        unique_ents = list({repr(ent): ent for ent in doc_ents}.values())

        # Remove entities that have label 'TME' because they are mostly redundant (e.g., 'In the morning')
        # And the text will be changed to not contain words with these labels
        unique_ents = [ent for ent in unique_ents if ent.label_ not in labels_to_remove]

        entity_list.append([str(ent) for ent in unique_ents])
        label_list.append([str(ent.label_) for ent in unique_ents])
        token_list.append(doc_tokens)
        unique_token_list.append(unique_doc_tokens)

    # Build each column on df's own index. A bare pd.Series(list) carries a
    # fresh 0..n-1 index, and column assignment aligns on index labels, so a
    # frame that was filtered/sorted without reset_index would silently get
    # NaN or misplaced rows.
    df['Entities'] = pd.Series(entity_list, index=df.index, dtype=object)
    df['Labels'] = pd.Series(label_list, index=df.index, dtype=object)
    df['Tokens'] = pd.Series(token_list, index=df.index, dtype=object)
    df['Unique_Tokens'] = pd.Series(unique_token_list, index=df.index, dtype=object)

    return df

In [38]:
from os.path import exists

# On-disk cache of the preprocessed articles; delete the file to force a rebuild.
PREPROCESSED_CSV = 'preprocessed_articles.csv'

if not exists(PREPROCESSED_CSV):
    # Build the cache: load the raw articles, drop incomplete rows, clean the
    # two text columns and run the spaCy-based annotation.
    df_articles = pd.read_csv('sävsjö_articles.csv')
    df_articles = df_articles.dropna().reset_index(drop=True)
    df_processed = df_articles.copy()

    df_processed['Title'] = df_processed['Title'].apply(regex_input)
    df_processed['Text'] = df_processed['Text'].apply(regex_input)

    df_processed = process_input(df_processed)

    df_processed.to_csv(PREPROCESSED_CSV)

# Always load from the cache, even right after building it. A CSV round trip
# turns the list/set columns into their string form and adds the saved index
# as an "Unnamed: 0" column; reading the file back in both cases guarantees
# that df_processed has the same columns and dtypes on the first run as on
# every later run.
df_processed = pd.read_csv(PREPROCESSED_CSV)



In [72]:
# Preview the first 20 rows of the preprocessed articles.
df_processed.head(20)

Unnamed: 0,ID,Title,Text,Entities,Labels,Tokens,Unique_Tokens
0,2,Aktiespararna säger ja till Gunvorbudet,Aktiespararna rekommenderar sina medlemmar att...,"[Gunvor Group, Rörvik]","[PRS, ORG]","[aktiespararna, gunvorbudet, aktiespararna, re...","{timber, medlem, anta, lägga, rekommendera, an..."
1,7,Man fast i en timme i grop,En man i 60 årsåldern föll ner i en grop som v...,[Sävsjö],[LOC],"[timme, grop, årsålder, föll, grop, grävd, ege...","{timme, meter, hål, djup, ligga, grop, tomt, å..."
2,8,Mannen fastnade i en grop i flera timmar,En man i 60 årsåldern trillade ner i ett grävt...,[David],[PRS],"[mannen, fastna, grop, timme, årsålder, trilla...","{mannen, fastna, timme, hitta, stund, grävt, h..."
3,56,En motorcykel,Här är den längre versionen,[],[],"[motorcykel, vara, version]","{motorcykel, version, vara}"
4,66,Stulna braskaminer för 50 000 ska värma,Det börjar bli vinter och kallt Då är det skön...,"[Smålandsvillan, Vrigstad]","[ORG, LOC]","[stulna, braskamin, värma, börja, vinter, kall...","{vrigstad, helg, vara, skön, uppenbarligen, kr..."
5,67,Sävsjö får ny skolchef,Sävsjö kommun ska få en ny skolchef Han heter ...,"[Stefan Claesson, Jönköping, Sävsjö]","[PRS, LOC, LOC]","[sävsjö, ny, skolchef, sävsjö, kommun, ny, sko...","{basa, skolchef, klar, claesson, hittillsvaran..."
6,70,Krösatågen kan bli försenade,Ett signalfel på sträckan Södertälje Stockholm...,"[Södertälje Stockholm, Länstrafiken]","[LOC, LOC]","[krösatåg, försena, signalfel, sträcka, södert...","{beröra, tågtrafik, trafik, krösatåg, ta, vara..."
7,72,Sävsjöpensionärer gör uppror,Stora grupper av pensionärer i Sävsjö är på kr...,"[Sävsjö, Vrigstad]","[LOC, LOC]","[sävsjöpensionär, uppra, grupp, pensionär, säv...","{vrigstad, kommunikation, dramatisk, orion, gr..."
8,78,Linus testar,Ett test jdklsa jfkldsa sjfkldsaö jfklsdaöjfls...,[Linus],[PRS],"[linus, testa, test, jdklsa, jfkldsa, sjfkldsa...","{testa, test, jfklsdaöjflskad, sjfkldsaö, jdkl..."
9,79,Linus testar 2,Testar 2,[],[],"[linus, testa, testar]","{testar, testa, linus}"


# Remove rows with empty labels and convert the stringified columns to lists

In [41]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# not executed (the cell merely evaluates to that string). As written, it would
# drop every row whose "Labels" value equals the string "[]" and then apply a
# comma split to the stringified "Labels"/"Entities" columns.
# NOTE(review): it reads from `df_articles`, which is only assigned when the
# cache file is missing, while "Labels"/"Entities" are added by process_input
# to `df_processed` -- confirm which frame was intended before re-enabling.
"""
df_test = df_articles[["Labels","Entities"]]

for ind, row in df_test.iterrows():
    if row["Labels"] == "[]":
        df_test = df_test.drop(index=ind)

df_test[["Labels","Entities"]].apply(lambda x : str(x).split(','))
"""

'\ndf_test = df_articles[["Labels","Entities"]]\n\nfor ind, row in df_test.iterrows():\n    if row["Labels"] == "[]":\n        df_test = df_test.drop(index=ind)\n\ndf_test[["Labels","Entities"]].apply(lambda x : str(x).split(\',\'))\n'