In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk

In [3]:
tweets = pd.read_csv("../datasets/tweets.csv", sep=",")

In [4]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  10000 non-null  object
 1   label   10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [5]:
tweets.label.value_counts()

label
1    5000
0    5000
Name: count, dtype: int64

In [6]:
tweets.loc[0, "tweets"]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [7]:
tweets.head()

Unnamed: 0,tweets,label
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1
1,@Lamb2ja Hey James! How odd :/ Please call our...,1
2,@DespiteOfficial we had a listen last night :)...,1
3,@97sides CONGRATS :),1
4,yeaaaah yippppy!!! my accnt verified rqst has...,1


In [8]:
testsätze = [
    "The sun is shining",
    "The weather is sweet",
    "The sun is shining and the weather is sweet sun"
]

In [9]:
count = CountVectorizer()
docs = np.array(testsätze)
bag = count.fit_transform(docs)

### Vocabulary

In [10]:
print(count.vocabulary_) ## Eigentlich müssten gewisse Preprocessing-Schritte vorgelagert sein --> Stop Words entfernen, UserNames etc.

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


### Merkmalsvektoren

One-Hot Codierte Arrays, welche anzeigen wie häufig bestimmte Wörter in einem Satz/Post/Dokument vorkommen

#### Monogramme

In [11]:
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [12]:
print(bag.toarray()) ##Merkmalsvektoren für jeden Satz

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 2 1 2 1]]


##### Bigramme

Durch N-Gramme oder diesem Fall Bigramme ist es möglich dem Text deutlich mehr Informationen zu entnehmen, da Wort- und Satzbau mit in Bezug genommen wird. Das Modell wird dadurch natürlich komplexer.

In [13]:
count = CountVectorizer(ngram_range=(2,2))
docs = np.array(testsätze)
bag = count.fit_transform(docs)

In [14]:
print(count.vocabulary_)

{'the sun': 6, 'sun is': 4, 'is shining': 1, 'the weather': 7, 'weather is': 8, 'is sweet': 2, 'shining and': 3, 'and the': 0, 'sweet sun': 5}


In [15]:
print(bag.toarray())

[[0 1 0 0 1 0 1 0 0]
 [0 0 1 0 0 0 0 1 1]
 [1 1 1 1 1 1 1 1 1]]


### Beurteilung von Wortrelevanz

#### Tf-idf-Maß

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

In [17]:
tfidf = TfidfTransformer()
np.set_printoptions(precision=2) # 2 Nachkommastellen
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.58 0.   0.   0.58 0.   0.58 0.   0.  ]
 [0.   0.   0.58 0.   0.   0.   0.   0.58 0.58]
 [0.39 0.3  0.3  0.39 0.3  0.39 0.3  0.3  0.3 ]]


### Tokenizer

Überführung von Wörtern in einzelne Wortabschnitte. Der Tokenizer von Porter überführt Wörter in ihre ursprüngliche Form um Vokabular zu verkleinern. Entweder entfernen von pre-/suffixen oder verwenden des Worstamms so wie es im Lexikon steht (Lemmatisierung).

In [18]:
satz = "I want to go to the mall !"

satz.split(" ")

['I', 'want', 'to', 'go', 'to', 'the', 'mall', '!']

##### Stemming/Lemmatisierung

In [19]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run') ##Manchmal kommen nicht vorhandene Wörter als Ergebnis heraus --> Beispiel thus

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

Leider gibt es bei der nltk library keine deutsche Implementierung, das heißt es muss auf andere Tools/Libraries zurückgegriffen werden. Ein Stemming-Ansatz für deutsches Vokabular wird in folgendem beschrieben: https://textmining.wp.hs-hannover.de/Preprocessing.html

#### Stopwords

In [20]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /Users/mako6/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

new_list = []
words = tokenizer_porter('a runner likes running and runs a lot')
for w in words:
    if w not in stop:
        new_list.append(w)
new_list

##oder:
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

### Machine Learning Pipeline

manueller Train-/Test Split

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train,y_test = train_test_split(tweets.tweets, tweets.label, test_size=0.2, shuffle=True)

In [24]:
X_train

2756    Jah Wobble and the band tour dates :D https://...
7099                                    @tbhnswm wayhh :(
3592                             @JannikMarioFan Pech! :D
6438    Just another rumor :(\n\n#ZaynIsComingBackOnJu...
1562                        KEEP CALM AND STAY KEPO :) :D
                              ...                        
7851    @Natvolpato1 FOLLOWED ME THANKS, AND\n@justinb...
2975                        @Shana_Banana_44 Cheers :) xx
7501                          2nd thoughts on college :((
6038    @zoellaftmendes i'm gonna miss your tweet in m...
3048    @fairmnt this :) is :) so :) relevant :)))))))...
Name: tweets, Length: 8000, dtype: object

In [29]:
pd.DataFrame.sparse.from_spmatrix(tfidf.fit_transform(count.fit_transform(X_train)))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48467,48468,48469,48470,48471,48472,48473,48474,48475,48476
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

params= {
    "logisticregression__penalty": ['l1', 'l2'], 
    "logisticregression__C": [1.0,10.0,100.0] 
}

pipeline = make_pipeline(
    TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None,ngram_range=(1,1),stop_words=stop,tokenizer=tokenizer_porter),
    LogisticRegression(random_state=0)
)

In [50]:
search = GridSearchCV(pipeline,params,cv=5)

In [51]:
search.fit(X_train,y_train)

15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mako6/Library/Caches/pypoetry/virtualenvs/ki-grundlagen-hdp-2023-_qWOVShY-py3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mako6/Library/Caches/pypoetry/virtualenvs/ki-grundlagen-hdp-2023-_qWOVShY-py3.11/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mako6/Library/Caches/pypoetry/virtualenvs/ki-grundlagen-hdp-20

In [52]:
search.score(X_test,y_test)

0.986

In [53]:
search.predict(["This Machine Learning course was very good"])

array([1])

#### Fazit

Das Modell ist ganz gut geeignet für dieses spezielle Datenset, jedoch fehlen dem Modell das Grundverständnis für beispielsweise Verneinungen, Sarkasmus, Satzbau etc. Dies könnte mit erweiterten Methoden angepasst werden. 