Imports:

In [16]:
import pandas as pd
import spacy
import re
from spellchecker import SpellChecker
import unicodedata
nlp = spacy.load("en_core_web_sm")

In [17]:
dataOrigin = pd.read_csv("20 newsgroups/20newsgroups.csv", on_bad_lines='skip', sep=';')
dataOrigin = dataOrigin.dropna()
dataOrigin = dataOrigin.drop_duplicates()
dataOrigin.head()

Unnamed: 0,ID,text,group
0,0,"'Hi, I\'ve noticed that if you only save a mo...",1
1,1,"' Seems to be, barring evidence to the contra...",3
2,2,' >In article <1993Apr19.020359.26996@sq.sq.c...,2
3,3,'I have a request for those who would like to ...,0
4,4,'AW&ST had a brief blurb on a Manned Lunar Ex...,2


Preprocessing Methods:

Text "lematisieren" (worked -> work)

In [18]:
def lemma(text):
    doc = nlp(text)
    text = " ".join(token.lemma_ for token in doc)
    return text

Stop Words entfernen

In [19]:
def stop_words(text):
    doc = nlp(text)
    text = " ".join([token.text for token in doc if not token.is_stop])
    return text

Punkte entfernen

In [20]:
def punct(text):
    doc = nlp(text)
    text = ' '.join(token.text for token in doc if not token.is_punct)
    return text

Mehrer aufeinander folgende Leerzeichen, Tabs, Zeilenumbrüche entfernen:

In [21]:
def space(text):
    text = re.sub(r'\s+', ' ', text).strip()
    return text

Bestimmte Sonderzeichen entfernen (@,#,$, ...) -> alle außer denen in der [] Klammer:

In [22]:
def sonderzeichen(text):
    text = re.sub(r'[^a-zA-Z0-9\s.,!?;:()""\'-]', '', text)
    return text

Nur Kleinschreibung:

In [23]:
def lower(text):
    doc = nlp(text)
    text = ' '.join(token.lower_ for token in doc)
    return text

Nur Nomen:

In [24]:
def nouns(text):
    doc = nlp(text)
    text = ' '.join([token.text for token in doc if token.pos_ == "NOUN"])
    return text

Nur Verben:

In [25]:
def verbs(text):
    doc = nlp(text)
    text = ' '.join([token.text for token in doc if token.pos_ == "VERB"])
    return text

Rechtschreibfehler entfernen

In [26]:
spell = SpellChecker()
def korrigiere_text(text):
    doc = nlp(text)
    worte = text.split()
    korrigierte_worte = [spell.correction(wort) if spell.correction(wort) is not None else wort for wort in worte]
    text = ' '.join(korrigierte_worte)
    return text

Text Normalisieren

In [27]:
def normalisieren(text):
    return unicodedata.normalize('NFC', text)

Kurze Wörter entfernen

In [28]:
def entferne_kurze_woerter(text, min_laenge=4):
    woerter = text.split()
    gefilterte_woerter = [wort for wort in woerter if len(wort) >= min_laenge]
    text =  ' '.join(gefilterte_woerter)
    return text

URLs und Mails entfernen

In [None]:
def ohneUrlMail(text):

    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    
    text = re.sub(url_pattern, '', text)
    text = re.sub(email_pattern, '', text)
    
    return text

Verschiedene Methoden kombinieren

In [39]:
def kombi (text, min_laenge=3):
        text = ohneUrlMail(text)
        text = stop_words(text)
        text = entferne_kurze_woerter(text,min_laenge)
        text = sonderzeichen(text)
        text = punct(text)
        text = space(text)
        text = lower(text)
        text = normalisieren(text)
        text = lemma(text)

        return text

Usage:

In [None]:
dataLemma = dataOrigin
dataLemma['text'] = dataLemma['text'].astype(str).apply(lemma)
dataLemma.to_csv("preprocessed/dataLemma.csv", index=False, sep=";")
dataLemmaTest = pd.read_csv("preprocessed/dataLemma.csv", on_bad_lines='skip', sep=';')
dataLemmaTest.head()

In [None]:
dataSpace = dataOrigin
dataSpace['text'] = dataSpace['text'].astype(str).apply(space)
dataSpace.to_csv("preprocessed/dataSpace.csv", index=False, sep=";")
dataSpaceTest = pd.read_csv("preprocessed/dataSpace.csv", on_bad_lines='skip', sep=';')
dataSpaceTest.head()

Unnamed: 0,ID,text,group
0,0,"' hi , i\'ve notice that if you only save a mo...",1
1,1,"' seem to be , bar evidence to the contrary , ...",3
2,2,' > in article < 1993apr19.020359.26996@sq.sq....,2
3,3,' I have a request for those who would like to...,0
4,4,' AW&ST have a brief blurb on a Manned Lunar E...,2


In [None]:
dataSonderzeichen = dataOrigin
dataSonderzeichen['text'] = dataSonderzeichen['text'].astype(str).apply(sonderzeichen)
dataSonderzeichen.to_csv("preprocessed/dataSonderzeichen.csv", index=False, sep=";")
dataSonderzeichenTest = pd.read_csv("preprocessed/dataSonderzeichen.csv", on_bad_lines='skip', sep=';')
dataSonderzeichenTest.head()

Unnamed: 0,ID,text,group
0,0,"' hi , i've notice that if you only save a mod...",1
1,1,"' seem to be , bar evidence to the contrary , ...",3
2,2,' in article 1993apr19.020359.26996sq.sq.com...,2
3,3,' I have a request for those who would like to...,0
4,4,' AWST have a brief blurb on a Manned Lunar Ex...,2


In [None]:
dataPunct = dataOrigin
dataPunct['text'] = dataPunct['text'].astype(str).apply(punct)
dataPunct.to_csv("preprocessed/dataPunct.csv", index=False, sep=";")
dataPunctTest = pd.read_csv("preprocessed/dataPunct.csv", on_bad_lines='skip', sep=';')
dataPunctTest.head()

Unnamed: 0,ID,text,group
0,0,hi i 've notice that if you only save a model ...,1
1,1,seem to be bar evidence to the contrary that K...,3
2,2,in article 1993apr19.020359.26996sq.sq.com...,2
3,3,I have a request for those who would like to s...,0
4,4,AWST have a brief blurb on a Manned Lunar Expl...,2


In [None]:
dataStop = dataOrigin
dataStop['text'] = dataStop['text'].astype(str).apply(stop_words)
dataStop.to_csv("preprocessed/dataStop.csv", index=False, sep=";")
dataStopTest = pd.read_csv("preprocessed/dataStop.csv", on_bad_lines='skip', sep=';')
dataStopTest.head()

Unnamed: 0,ID,text,group
0,0,hi ' ve notice save model mapping plane positi...,1
1,1,bar evidence contrary Koresh simply derange fa...,3
2,2,article 1993apr19.020359.26996sq.sq.com ...,2
3,3,request like Charley Wingate respond Charley C...,0
4,4,AWST brief blurb Manned Lunar Exploration conf...,2


In [None]:
dataLower = dataOrigin
dataLower['text'] = dataLower['text'].astype(str).apply(lower)
dataLower.to_csv("preprocessed/dataLower.csv", index=False, sep=";")
dataLowerTest = pd.read_csv("preprocessed/dataLower.csv", on_bad_lines='skip', sep=';')
dataLowerTest.head()

Unnamed: 0,ID,text,group
0,0,hi ' ve notice save model mapping plane positi...,1
1,1,bar evidence contrary koresh simply derange fa...,3
2,2,article 1993apr19.020359.26996sq.sq.co...,2
3,3,request like charley wingate respond charley c...,0
4,4,awst brief blurb manned lunar exploration conf...,2


In [None]:
dataNoun = dataOrigin
dataNoun['text'] = dataNoun['text'].astype(str).apply(nouns)
dataNoun.to_csv("preprocessed/dataNoun.csv", index=False, sep=";")
dataNounTest = pd.read_csv("preprocessed/dataNoun.csv", on_bad_lines='skip', sep=';')
dataNounTest.head()

Unnamed: 0,ID,text,group
0,0,notice model mapping plane position file reloa...,1
1,1,bar evidence koresh folk child satisfy fruitca...,3
2,2,article figure perijoves talk language periaps...,2
3,3,request mail article ingore challenges dozen a...,0
4,4,blurb exploration confernce 7th know,2


In [None]:
dataVerb = dataOrigin
dataVerb['text'] = dataVerb['text'].astype(str).apply(verbs)
dataVerb.to_csv("preprocessed/dataVerb.csv", index=False, sep=";")
dataVerbTest = pd.read_csv("preprocessed/dataVerb.csv", on_bad_lines='skip', sep=';')
dataVerbTest.head()

Unnamed: 0,ID,text,group
0,0,noticed save positioned reload restarting give...,1
1,1,Seems barring deranged thought take satisfy de...,3
2,2,MB seems used talking say,2
3,3,have like see judging appear intends continue ...,0
4,4,had know attend want go,2


Nicht neu ausführen, dauer ewig

In [None]:
# dauert zu lange, evtl. anders implementieren
dataCorrected = dataOrigin
dataCorrected['text'] = dataCorrected['text'].astype(str).apply(korrigiere_text)
dataCorrected.to_csv("preprocessed/dataRechtschreibung.csv", index=False, sep=";")
dataCorrectedTest = pd.read_csv("preprocessed/dataRechtschreibung.csv", on_bad_lines='skip', sep=';')
dataCorrectedTest.head()

In [None]:
dataNormal = dataOrigin
dataNormal['text'] = dataNormal['text'].astype(str).apply(normalisieren)
dataNormal.to_csv("preprocessed/dataNormal.csv", index=False, sep=";")
dataNormalTest = pd.read_csv("preprocessed/dataNormal.csv", on_bad_lines='skip', sep=';')
dataNormalTest.head()

Unnamed: 0,ID,text,group
0,0,"'Hi, I\'ve noticed that if you only save a mo...",1
1,1,"' Seems to be, barring evidence to the contra...",3
2,2,' >In article <1993Apr19.020359.26996@sq.sq.c...,2
3,3,'I have a request for those who would like to ...,0
4,4,'AW&ST had a brief blurb on a Manned Lunar Ex...,2


In [None]:
dataKurze = dataOrigin
dataKurze['text'] = dataKurze['text'].astype(str).apply(entferne_kurze_woerter)
dataKurze.to_csv("preprocessed/dataKurze.csv", index=False, sep=";")
dataKurzeTest = pd.read_csv("preprocessed/dataKurze.csv", on_bad_lines='skip', sep=';')
dataKurzeTest.head()

Unnamed: 0,ID,text,group
0,0,"'Hi, I\'ve noticed that only save model (with ...",1
1,1,"Seems barring evidence contrary, that Koresh s...",3
2,2,"article <1993Apr19.020359.26996@sq.sq.com>, ms...",2
3,3,have request those would like Charley Wingate ...,0
4,4,'AW&ST brief blurb Manned Lunar Exploration co...,2


In [None]:
dataUrlMail = dataOrigin
dataUrlMail['text'] = dataUrlMail['text'].astype(str).apply(ohneUrlMail)
dataUrlMail.to_csv("preprocessed/dataUrlMail.csv", index=False, sep=";")
dataUrlMailTest = pd.read_csv("preprocessed/dataUrlMail.csv", on_bad_lines='skip', sep=';')
dataUrlMailTest.head()

Unnamed: 0,ID,text,group
0,0,"'Hi, I\'ve noticed that only save model (with ...",1
1,1,"Seems barring evidence contrary, that Koresh s...",3
2,2,"article <>, (Mark Brader) 1970 figure seems u...",2
3,3,have request those would like Charley Wingate ...,0
4,4,'AW&ST brief blurb Manned Lunar Exploration co...,2


In [None]:
dataKombi = dataOrigin
dataKombi['text'] = dataKombi['text'].astype(str).apply(kombi)
dataKombi.to_csv("preprocessed/dataKombi.csv", index=False, sep=";")
dataKombiTest = pd.read_csv("preprocessed/dataKombi.csv", on_bad_lines='skip', sep=';')
dataKombiTest.head()