In [225]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [226]:
import re

def preprocess_input(input):
    #regexp = r'[\\\/\.,;:()]+\w*'
    #regexp2 = r'\b(?<![\\\/\.\,\\:\;\(\)\+\-])\w+\b'
    #regexp3 = r'\b(?<![\\\/])\w+\.*\w*\b'

    # Remove unicode and webpages
    regexp4 = r'\b(?<![\\\.])\w+(?!\.\w+)\b'

    # Regex for removing words that are preceded by '\'
    unicode_regexp = r'\b(?<![\\])\w+\b'

    # Regex for removing webpages
    webpage_regexp = r'\b(?<![\.])\w+(?!\.\w+)\b'

    # Remove html tags, 'lxml' more robust than 'html.parser'
    result = BeautifulSoup(input, 'lxml').get_text()
    


    # Save words that are not preceded by a '\' and webpages
    result = re.findall(regexp4, result)
    
            # TRYING TO COMBINE MULTIPLE REGEX, WORK IN PROGRESS
            #patterns = [unicode_regexp, webpage_regexp]
            #pattern = "|".join(patterns)  # pattern1|pattern2|pattern3|...
            #re.findall(pattern, result)

    # Remove excessive whitespaces
    result = " ".join(result)


    #result = " ".join(result.split())
   
    return result

In [251]:
from os.path import exists

if(not exists('preprocessed_articles.csv')):  
    df_articles = pd.read_csv('sävsjö_articles.csv')
    df_articles = df_articles.dropna()
    # columns: ['ID', 'Title', 'Text']
    # iterate over 'Title' and 'Text' columns and preprocess the texts
    for col in df_articles.columns[1:]:
        df_articles[col] = df_articles[col].astype(str)
        df_articles[col] = df_articles[col].apply(lambda x: preprocess_input(x))
    
else:
    df_processed = pd.read_csv('preprocessed_articles.csv')



In [228]:
df_articles.head()

Unnamed: 0,ID,Title,Text
0,2,Aktiespararna säger ja till Gunvorbudet,Aktiespararna rekommenderar sina medlemmar att...
2,7,Man fast i en timme i grop,En man i 60 årsåldern föll ner i en grop som v...
3,8,Mannen fastnade i en grop i flera timmar,En man i 60 årsåldern trillade ner i ett grävt...
4,56,En motorcykel,Här är den längre versionen
5,66,Stulna braskaminer för 50 000 ska värma,Det börjar bli vinter och kallt Då är det skön...


In [229]:
def remove_labels(doc, labels:list):
        text_no_locations = ''
        for token in doc:
            if token.ent_type_ not in labels:
                text_no_locations += token.text
                if token.whitespace_:
                    text_no_locations += ' '
        return text_no_locations


In [312]:
#from spacy.pipeline.ner import EntityRecognizer
from numpy import dtype
import spacy
from os.path import exists
np.warnings.filterwarnings('error', category=np.VisibleDeprecationWarning) 

if(not exists('preprocessed_articles.csv')):
    df_processed = df_articles.drop(columns=['Title']).copy()
    nlp = spacy.load("sv_core_news_sm")
    labels_to_remove = ['LOC', 'TME']

    # Does not seem to work?
    #df_processed['Entities'] = np.empty((len(df_processed), 0)).tolist()

    entity_list = []
    label_list = []
    
    for index, row in df_articles.iterrows():
        doc_text = nlp(row['Text'])
        #doc_text = nlp(remove_labels(doc_text, labels=labels_to_remove))

        doc_title = nlp(row['Title'])
        #doc_title = nlp(remove_labels(doc_title, labels=labels_to_remove))

        doc_ents = doc_text.ents + doc_title.ents

        # Remove duplicate entities from the list
        unique_ents = list({keyword.__repr__(): keyword for keyword in doc_ents}.values())
        
        #entity_list.add(str(unique_ents))
        # Remove entities that have label 'TME' or 'LOC' because they are mostly redundant
        # And the text will be changed to not contain words with these labels
        unique_ents = [ent for ent in unique_ents if not ent.label_ in labels_to_remove]    
           
        entities = [str(x) for x in unique_ents]
        labels = [str(labels.label_) for labels in unique_ents]
        
        # Remove words that contain the specified labels
        label_free_text = remove_labels(doc_text, labels=labels_to_remove)
     
        entity_list.append(entities)
        label_list.append(labels)
        
        df_processed.at[index,['Text']] = label_free_text
        
    df_processed['Entities'] = pd.Series(entity_list)
    df_processed['Labels'] = pd.Series(label_list)

    df_processed.to_csv('preprocessed_articles.csv')

df_processed.head(10)


Unnamed: 0,ID,Text,Entities,Labels
0,2,Aktiespararna rekommenderar sina medlemmar att...,"[Gunvor Group, Rörvik]","[PRS, ORG]"
2,7,En man i 60 årsåldern föll ner i en grop som v...,[David],[PRS]
3,8,En man i 60 årsåldern trillade ner i ett grävt...,[],[]
4,56,Här är den längre versionen,[Smålandsvillan],[ORG]
5,66,Det börjar bli vinter och kallt Då är det skön...,[Stefan Claesson],[PRS]
6,67,Sävsjö kommun ska få en ny skolchef Han heter ...,[],[]
7,70,Ett signalfel på sträckan på fredagsmorgonen s...,[],[]
8,72,Stora grupper av pensionärer i är på krigsstig...,[Linus],[PRS]
9,78,Ett test jdklsa jfkldsa sjfkldsaö jfklsdaöjfls...,[],[]
10,79,Testar 2,[],[]


# remove empty rows and convert to lists.

### Not used so far

In [207]:
"""
df_test = df_articles[["Labels","Entities"]]

for ind, row in df_test.iterrows():
    if row["Labels"] == "[]":
        df_test = df_test.drop(index=ind)

df_test[["Labels","Entities"]].apply(lambda x : str(x).split(','))
"""

Unnamed: 0,Labels,Entities
0,0 ['PRS',0 ['Gunvor Group'
1,'ORG']\n2 ['LOC']\n3 ...,'Rörvik']\n2 ...
2,'LOC']\n6 ['PRS','Vrigstad']\n6 ['Stefan Claesson'
3,'LOC','Jönköping'
4,'LOC']\n ... \n8600 ...,'Sävsjö']\n ... ...
5,Length: 8600,Length: 8600
6,dtype: object,dtype: object
