In [8]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

import spacy
from typing import List #Weil daraum 
import unidecode #PreProcessing
import re #Regular Expressions PreProcessing
import string
from collections import Counter 

from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer #TFIDF
from sklearn.feature_extraction.text import CountVectorizer #Document Word Matrix
from sklearn.model_selection import train_test_split, GridSearchCV  #Split in Trainings und Test Datensatz 
from sklearn.naive_bayes import MultinomialNB #MULTINOMIAL NAIVE BAYES CLASSIFIER
from sklearn.naive_bayes import ComplementNB #COMPLETE NAIVE BAYES 
from sklearn.svm import SVC #SUPPORT VECTOR MACHINES
from sklearn.linear_model import LogisticRegression 

from sklearn import metrics 
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score #BEWERTUNG


In [9]:
#setup 

from datasets import load_dataset

nlp = spacy.load('en_core_web_sm')

try:
    from nltk.corpus import stopwords 
    
except: 
    import nltk 
    nltk.downlaod("stopwords")

finally:
    from nltk.corpus import stopwords 

In [10]:
dataset = load_dataset("sst2")
train_data, test_data = train_test_split(dataset['train'], test_size=0.2, random_state=42)


Downloading builder script:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading and preparing dataset sst2/default to /home/michael/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset sst2 downloaded and prepared to /home/michael/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
def tokenizer(text):  
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_space]
    return tokens

In [12]:
#CLEANING

def convert_to_lowercase(text: str) -> str:
    return text.lower()

def regex(text: str) -> str:
   
    text = re.sub(r'\S*@\S*\s?', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\'", '', text)
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b\w\b', '', text)
    hashtags = re.findall(r'#\w+', text)
    text += ' '.join(hashtags)   
    return text

def lemmatize(text: str) -> str:
    # Lemmatisierung mit SpaCy
    doc = nlp(text)
    lemmatized_words = []
    for token in doc:
        lemmatized_words.append(token.lemma_)
    
    return " ".join(lemmatized_words)


def remove_stopwords(text: str, sw: List[str] = stopwords.words("english")) -> str: 
    #Eventuell überarbeiten
    additional_sw = ["ubers","uber","drive","gt","get","got","go","ride","make","would","say","driver", "nt", "ca"]
    sw = sw + additional_sw
    text_list = text.split()
    text_list = [word for word in text_list if word.lower() not in sw]
    return " ".join(text_list)

def remove_punctuation(text: str, punct: str = string.punctuation) -> str:
   
    cleaned_text = "".join([char for char in text if char not in punct])
    return cleaned_text

def unicode(text: str) -> str:
    return unidecode.unidecode(text)

def clean(text: str) -> str:
    text = unicode(text)
    text = remove_punctuation(text)
    text = convert_to_lowercase(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    text = regex(text)
    return text

In [13]:
#Feature Repräsentation BoW

def bow(train_data, test_data, ngram_range=(1, 1)):
    vectorizer = CountVectorizer(stop_words='english',ngram_range=ngram_range, tokenizer=tokenizer, preprocessor=clean)

    X_train = vectorizer.fit_transform(train_data['sentence'])
    Y_train = train_data['label']

    X_test = vectorizer.transform(test_data['sentence'])
    Y_test = test_data['label']

    return X_train, Y_train, X_test, Y_test

In [93]:
def tfidf(train_data, test_data, ngram_range=(1,1)):
    tfidf = TfidfVectorizer(ngram_range=ngram_range,preprocessor=clean)

    X_train = tfidf.fit_transform(train_data['sentence'])
    Y_train = train_data['label']

    X_test = tfidf.transform(test_data['sentence'])
    Y_test = test_data['label']

    return X_train, Y_train, X_test, Y_test

In [94]:
def mnb_tuning(X_train, Y_train, X_test, Y_test):
    # Multinomialer Naive Bayes Classifier
    MNB = MultinomialNB()

    # Hyperparameter für Grid Search
    parameters = {'alpha': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]}

    # Grid Search mit 5-facher Kreuzvalidierung
    grid_search = GridSearchCV(MNB, parameters, cv=5, scoring="accuracy")
    grid_search.fit(X_train, Y_train)

    best_params = grid_search.best_params_  # Beste Hyperparameter-Kombination
    best_score = grid_search.best_score_  # Beste Bewertungsmetrik (Accuracy)
    best_model = grid_search.best_estimator_  # Bestes Modell

    # Modell auf den Trainingsdaten trainieren
    best_model.fit(X_train, Y_train)

    # Vorhersagen auf den Testdaten
    Y_pred = best_model.predict(X_test)

    # Ausgabe der Ergebnisse
    print("Beste Hyperparameter: ", best_params)
    print("Beste Genauigkeit: {:.2f}%".format(best_score * 100))

    # Metriken berechnen
    accuracy = accuracy_score(Y_test, Y_pred)
    report = classification_report(Y_test, Y_pred)

    print("Logistische Regression - Klassifikationsbericht:")
    print(report)

    return accuracy, report

In [95]:
def cnb_tuning(X_train, Y_train, X_test, Y_test):
    # Complete Naive Bayes Classifier
    CNB = ComplementNB()

    # Hyperparameter für Grid Search
    parameters = {'alpha': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]}

    # Grid Search mit 5-facher Kreuzvalidierung
    grid_search = GridSearchCV(CNB, parameters, cv=5, scoring="accuracy")
    grid_search.fit(X_train, Y_train)

    best_params = grid_search.best_params_  # Beste Hyperparameter-Kombination
    best_score = grid_search.best_score_  # Beste Bewertungsmetrik (Accuracy)
    best_model = grid_search.best_estimator_  # Bestes Modell

    # Modell auf den Trainingsdaten trainieren
    best_model.fit(X_train, Y_train)

    # Vorhersagen auf den Testdaten
    Y_pred = best_model.predict(X_test)

    # Ausgabe der Ergebnisse
    print("Beste Hyperparameter: ", best_params)
    print("Beste Genauigkeit: {:.2f}%".format(best_score * 100))
    print(classification_report(Y_test, Y_pred))

    # Metriken berechnen
    accuracy = accuracy_score(Y_test, Y_pred)
    report = classification_report(Y_test, Y_pred)

    print("CNB - Klassifikationsbericht:")
    print(report)
    
    return accuracy, report


In [14]:
def svc_tuning(X_train, Y_train, X_test, Y_test, ngram_range=(1, 1)):
    parameters = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
    model = GridSearchCV(SVC(), parameters, cv=5, scoring="accuracy")
    model.fit(X_train, Y_train)

    best_params = model.best_params_
    best_score = model.best_score_

    best_model = model.best_estimator_
    Y_pred = best_model.predict(X_test)

    print("Beste Hyperparameter: ", best_params)
    print("Beste Genauigkeit: {:.2f}%".format(best_score * 100))
    print(classification_report(Y_test, Y_pred))

    accuracy = accuracy_score(Y_test, Y_pred)
    report = classification_report(Y_test, Y_pred)

    print("svc - Klassifikationsbericht:")
    print(report)

    return accuracy, report

In [97]:
def logreg(X_train, Y_train, X_test, Y_test,max_iter=1000):
    model = LogisticRegression()
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    report = classification_report(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    print("LogReg")
    print(report)
    return accuracy, report


In [98]:
def mnb(X_train, Y_train, X_test, Y_test):
    model = MultinomialNB()
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    report = classification_report(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    print("MNB")
    print(report)
    return accuracy, report

In [99]:
def cnb(X_train, Y_train, X_test, Y_test):
    model = ComplementNB()
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    report = classification_report(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    print("CNB")
    print(report)
    return accuracy, report

In [100]:
def svc(X_train, Y_train, X_test, Y_test):
    model = SVC()
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    report = classification_report(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    print("SVC")
    print(report)
    return accuracy, report


In [101]:
#UNIGRAM Word2Vec
X_train, Y_train, X_test, Y_test = word_to_vec(train_data, test_data, ngram_range=(1, 1))
logreg(X_train, Y_train, X_test, Y_test)
svc(X_train, Y_train, X_test, Y_test)
mnb(X_train, Y_train, X_test, Y_test)
cnb(X_train, Y_train, X_test, Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogReg
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      5909
           1       0.88      0.91      0.90      7561

    accuracy                           0.88     13470
   macro avg       0.88      0.88      0.88     13470
weighted avg       0.88      0.88      0.88     13470

SVC
              precision    recall  f1-score   support

           0       0.91      0.86      0.89      5909
           1       0.90      0.93      0.92      7561

    accuracy                           0.90     13470
   macro avg       0.90      0.90      0.90     13470
weighted avg       0.90      0.90      0.90     13470

MNB
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      5909
           1       0.87      0.89      0.88      7561

    accuracy                           0.86     13470
   macro avg       0.86      0.86      0.86     13470
weighted avg       0.86      0.86      0.86     13470

CNB


(0.8614699331848552,
 '              precision    recall  f1-score   support\n\n           0       0.83      0.87      0.85      5909\n           1       0.89      0.86      0.87      7561\n\n    accuracy                           0.86     13470\n   macro avg       0.86      0.86      0.86     13470\nweighted avg       0.86      0.86      0.86     13470\n')

In [102]:
#UNIGRAM / BIGRAM  Word2Vec
X_train, Y_train, X_test, Y_test = word_to_vec(train_data, test_data, ngram_range=(1, 2))
logreg(X_train, Y_train, X_test, Y_test)
svc(X_train, Y_train, X_test, Y_test)
mnb(X_train, Y_train, X_test, Y_test)
cnb(X_train, Y_train, X_test, Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogReg
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      5909
           1       0.91      0.93      0.92      7561

    accuracy                           0.91     13470
   macro avg       0.91      0.90      0.91     13470
weighted avg       0.91      0.91      0.91     13470

SVC
              precision    recall  f1-score   support

           0       0.92      0.86      0.89      5909
           1       0.90      0.94      0.92      7561

    accuracy                           0.91     13470
   macro avg       0.91      0.90      0.90     13470
weighted avg       0.91      0.91      0.91     13470

MNB
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      5909
           1       0.90      0.90      0.90      7561

    accuracy                           0.89     13470
   macro avg       0.88      0.88      0.88     13470
weighted avg       0.89      0.89      0.89     13470

CNB


(0.8844840386043059,
 '              precision    recall  f1-score   support\n\n           0       0.85      0.90      0.87      5909\n           1       0.92      0.88      0.89      7561\n\n    accuracy                           0.88     13470\n   macro avg       0.88      0.89      0.88     13470\nweighted avg       0.89      0.88      0.88     13470\n')

In [103]:
#BIGRAM Word2Vec
X_train, Y_train, X_test, Y_test = word_to_vec(train_data, test_data, ngram_range=(2, 2))
logreg(X_train, Y_train, X_test, Y_test)
svc(X_train, Y_train, X_test, Y_test)
mnb(X_train, Y_train, X_test, Y_test)
cnb(X_train, Y_train, X_test, Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogReg
              precision    recall  f1-score   support

           0       0.92      0.73      0.81      5909
           1       0.82      0.95      0.88      7561

    accuracy                           0.85     13470
   macro avg       0.87      0.84      0.85     13470
weighted avg       0.86      0.85      0.85     13470

SVC
              precision    recall  f1-score   support

           0       0.94      0.67      0.78      5909
           1       0.79      0.97      0.87      7561

    accuracy                           0.84     13470
   macro avg       0.86      0.82      0.83     13470
weighted avg       0.86      0.84      0.83     13470

MNB
              precision    recall  f1-score   support

           0       0.87      0.74      0.80      5909
           1       0.82      0.91      0.86      7561

    accuracy                           0.84     13470
   macro avg       0.84      0.83      0.83     13470
weighted avg       0.84      0.84      0.84     13470

CNB


(0.8198960653303637,
 '              precision    recall  f1-score   support\n\n           0       0.73      0.92      0.82      5909\n           1       0.92      0.74      0.82      7561\n\n    accuracy                           0.82     13470\n   macro avg       0.83      0.83      0.82     13470\nweighted avg       0.84      0.82      0.82     13470\n')

In [104]:
#UNIGRAM TFIDF
X_train, Y_train, X_test, Y_test = tfidf(train_data, test_data, ngram_range=(1, 1))
logreg(X_train, Y_train, X_test, Y_test)
svc(X_train, Y_train, X_test, Y_test)
mnb(X_train, Y_train, X_test, Y_test)
cnb(X_train, Y_train, X_test, Y_test)

LogReg
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      5909
           1       0.88      0.90      0.89      7561

    accuracy                           0.88     13470
   macro avg       0.88      0.87      0.88     13470
weighted avg       0.88      0.88      0.88     13470

SVC
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      5909
           1       0.92      0.93      0.92      7561

    accuracy                           0.91     13470
   macro avg       0.91      0.91      0.91     13470
weighted avg       0.91      0.91      0.91     13470

MNB
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      5909
           1       0.87      0.91      0.89      7561

    accuracy                           0.87     13470
   macro avg       0.87      0.87      0.87     13470
weighted avg       0.87      0.87      0.87     13470

CNB


(0.873645137342242,
 '              precision    recall  f1-score   support\n\n           0       0.84      0.87      0.86      5909\n           1       0.90      0.87      0.89      7561\n\n    accuracy                           0.87     13470\n   macro avg       0.87      0.87      0.87     13470\nweighted avg       0.87      0.87      0.87     13470\n')

In [105]:
#UNIGRAM / BIGRAM TFIDF
X_train, Y_train, X_test, Y_test = tfidf(train_data, test_data, ngram_range=(1, 2))
logreg(X_train, Y_train, X_test, Y_test)
svc(X_train, Y_train, X_test, Y_test)
mnb(X_train, Y_train, X_test, Y_test)
cnb(X_train, Y_train, X_test, Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogReg
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      5909
           1       0.90      0.93      0.91      7561

    accuracy                           0.90     13470
   macro avg       0.90      0.90      0.90     13470
weighted avg       0.90      0.90      0.90     13470

SVC
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      5909
           1       0.92      0.93      0.93      7561

    accuracy                           0.92     13470
   macro avg       0.91      0.91      0.91     13470
weighted avg       0.92      0.92      0.92     13470

MNB
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5909
           1       0.91      0.92      0.91      7561

    accuracy                           0.90     13470
   macro avg       0.90      0.90      0.90     13470
weighted avg       0.90      0.90      0.90     13470

CNB


(0.9030438010393467,
 '              precision    recall  f1-score   support\n\n           0       0.88      0.91      0.89      5909\n           1       0.93      0.90      0.91      7561\n\n    accuracy                           0.90     13470\n   macro avg       0.90      0.90      0.90     13470\nweighted avg       0.90      0.90      0.90     13470\n')

In [106]:
#BIGRAM TFIDF
X_train, Y_train, X_test, Y_test = tfidf(train_data, test_data, ngram_range=(2, 2))
logreg(X_train, Y_train, X_test, Y_test)
svc(X_train, Y_train, X_test, Y_test)
mnb(X_train, Y_train, X_test, Y_test)
cnb(X_train, Y_train, X_test, Y_test)

LogReg
              precision    recall  f1-score   support

           0       0.92      0.74      0.82      5909
           1       0.82      0.95      0.88      7561

    accuracy                           0.86     13470
   macro avg       0.87      0.84      0.85     13470
weighted avg       0.86      0.86      0.85     13470

SVC
              precision    recall  f1-score   support

           0       0.91      0.76      0.83      5909
           1       0.84      0.94      0.89      7561

    accuracy                           0.86     13470
   macro avg       0.87      0.85      0.86     13470
weighted avg       0.87      0.86      0.86     13470

MNB
              precision    recall  f1-score   support

           0       0.89      0.76      0.82      5909
           1       0.83      0.92      0.88      7561

    accuracy                           0.85     13470
   macro avg       0.86      0.84      0.85     13470
weighted avg       0.86      0.85      0.85     13470

CNB


(0.8415738678544915,
 '              precision    recall  f1-score   support\n\n           0       0.76      0.93      0.84      5909\n           1       0.93      0.77      0.85      7561\n\n    accuracy                           0.84     13470\n   macro avg       0.85      0.85      0.84     13470\nweighted avg       0.86      0.84      0.84     13470\n')

In [18]:
X_train, Y_train, X_test, Y_test = word_to_vec(train_data, test_data, ngram_range=(1, 2))
svc_tuning(X_train, Y_train, X_test, Y_test,)

AttributeError: 'csr_matrix' object has no attribute 'lower'