In [5]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
torch.cuda.is_available()

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from lazytext.supervised import LazyTextPredict
import re
import nltk

In [2]:
%%capture
!pip install lazytext
!pip install nltk

In [55]:
from functools import reduce

def kfold_evaluation(f1,f2,f3,f4,f5):
    file1 = f1
    file2 = f2
    file3 = f3
    file4 = f4
    file5 = f5
    
    names = file1["Model"].tolist()
    results_f1 = reduce(lambda a, b: a.add(b, fill_value=0), [file1["F1_Score"], file2["F1_Score"], file3["F1_Score"], file4["F1_Score"], file5["F1_Score"]]).tolist()
    results_pre = reduce(lambda a, b: a.add(b, fill_value=0), [file1["Precision"], file2["Precision"], file3["Precision"], file4["Precision"], file5["Precision"]]).tolist()
    results_rec = reduce(lambda a, b: a.add(b, fill_value=0), [file1["Recall"], file2["Recall"], file3["Recall"], file4["Recall"], file5["Recall"]]).tolist()

    results_divided_f1 = []
    for i in results_f1:
        
        res = i/5
        results_divided_f1.append(res)
        
    results_divided_pre = []
    for i in results_pre:
        
        res = i/5
        results_divided_pre.append(res)
        
    results_divided_rec = []
    for i in results_rec:
        
        res = i/5
        results_divided_rec.append(res)

    d = {'Model':names,'F1 Score':results_divided_f1 ,'Precision':results_divided_pre, 'Recall':results_divided_rec}
    overall_df = pd.DataFrame(d)
    return overall_df

def fold_values(fold_f, fold_number):
    folds = pd.DataFrame(columns=["Model", "F1_Score"])
    models = []
    f1s = []
    p = []
    r = []

    for i in kfold_df[f"model_fold_{fold_number}"]:
        name = i.get("name")
        f1 = i.get("custom_metric_score").get("f1").get("f1_weighted")
        pr = i.get("custom_metric_score").get("precision").get("precision_weighted")
        re = i.get("custom_metric_score").get("recall").get("recall_weighted")
        models.append(name)
        f1s.append(f1)
        p.append(pr)
        r.append(re)


    folds["Model"] = models
    folds["F1_Score"] = f1s
    folds["Precision"] = p
    folds["Recall"] = r
    return folds

from lazytext.supervised import LazyTextPredict
import numpy as np
# Custom metric
def my_custom_metric(y_true, y_pred):
    test_list = y_true
    # Compute Evaluation Metrics
    f = f1_multiclass(test_list, y_pred)
    p = p_multiclass(test_list, y_pred)
    r = recall_multiclass(test_list, y_pred)
    a = accuracy_score(test_list, y_pred)
    ba = balanced_accuracy_score(test_list, y_pred)
    prs = precision_recall_fscore_support(test_list, y_pred)
    m = matthews_corrcoef(test_list,y_pred)

    results = {}
    results["acc"] = a
    results["f1"] = f
    results["precision"] = p
    results["recall"] = r
    results["bal_acc"] = ba
    results["prfs"] = prs
    results["mcc"] = m
    return results

# WHK

In [57]:
#Try with KFOLD
# man kann auch Custom Metrics oder Custom Parameter definieren
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(5, shuffle=True, random_state=42) 

In [72]:
df = pd.read_csv("../WHK/Data.csv", sep=";")
fold = 0

for train, test in kf.split(df["Antwort"],df['Resultat']):  
    fold+=1
    print(f"Fold #{fold}")
    
    train_text = df['Antwort'][train]
    train_labels = df['Resultat'][train]
    test_text = df['Antwort'][test]
    test_labels = df['Resultat'][test]
    
    print(train_text)

Fold #1
0                                                   Nein
2                                                   Nein
3                                                   Nein
5                                                   Nein
8      Ich hatte dann halt einfach keine Kraft mehr u...
                             ...                        
126    Hm, konzentrieren konnte ich mich gut noch, ka...
127                                                 Nein
128    Nein, es war nicht immer so, sondern eher so w...
129                                                 Nein
131    Nein, ich hasse Schmerzen über alles, deswegen...
Name: Antwort, Length: 105, dtype: object
Fold #2
0                                                   Nein
1                                                   Nein
4                                                   Nein
5                                                   Nein
6                                               Ja schon
                             .

In [73]:
fold = 0
kfold_df = pd.DataFrame()
df = pd.read_csv("../WHK/Data.csv", sep=";")
df['Resultat'] = df['Resultat'].map({'Ja': 1, 'Nein': 0})

for train, test in kf.split(df["Antwort"],df['Resultat']):  
    fold+=1
    print(f"Fold #{fold}")
    
    train_text = df['Antwort'][train]
    train_labels = df['Resultat'][train]
    test_text = df['Antwort'][test]
    test_labels = df['Resultat'][test]
    
        # Tokenize the words
    train_text_clean = train_text.apply(nltk.word_tokenize)
    test_text_clean = test_text.apply(nltk.word_tokenize)

    # Remove stop words
    stop_words=set(nltk.corpus.stopwords.words("german"))
    train_text_clean = train_text_clean.apply(lambda x: [item for item in x if item not in stop_words])
    test_text_clean = test_text_clean.apply(lambda x: [item for item in x if item not in stop_words])

    # Remove numbers, punctuation and special characters (only keep words)
    regex = '[a-z]+'
    train_text_clean = train_text_clean.apply(lambda x: [item for item in x if re.match(regex, item)])
    test_text_clean = test_text_clean.apply(lambda x: [item for item in x if re.match(regex, item)])

    # Lemmatization
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    train_text_clean = train_text_clean.apply(lambda x: [lem.lemmatize(item, pos='v') for item in x])
    test_text_clean = test_text_clean.apply(lambda x: [lem.lemmatize(item, pos='v') for item in x])

    # Join the words again to form sentences
    train_text_clean = train_text_clean.apply(lambda x: " ".join(x))
    test_text_clean = test_text_clean.apply(lambda x: " ".join(x))
    
    # Tfidf vectorization
    vectorizer = TfidfVectorizer()

    x_train = vectorizer.fit_transform(train_text)
    x_test = vectorizer.transform(test_text)
    y_train = train_labels
    y_test = test_labels

    lazy_text = LazyTextPredict(
        classification_type="binary",
        custom_metric=my_custom_metric
        )
    models = lazy_text.fit(x_train, x_test, y_train, y_test)
    kfold_df[f"model_fold_{fold}"] = models

Output()

Fold #1


Output()

Fold #2


Output()

Fold #3


Output()

Fold #4


Output()

Fold #5


In [74]:
kfold_df["model_fold_1"][0]

{'name': 'AdaBoostClassifier',
 'accuracy': 0.8518518518518519,
 'balanced_accuracy': 0.8055555555555556,
 'f1_score': 0.75,
 'custom_metric_score': {'acc': 0.8518518518518519,
  'f1': {'f1_micro': 0.8518518518518519,
   'f1_macro': 0.8223684210526316,
   'f1_weighted': 0.8464912280701754},
  'precision': {'precision_micro': 0.8518518518518519,
   'precision_macro': 0.8535714285714285,
   'precision_weighted': 0.8523809523809524},
  'recall': {'recall_micro': 0.8518518518518519,
   'recall_macro': 0.8055555555555556,
   'recall_weighted': 0.8518518518518519},
  'bal_acc': 0.8055555555555556,
  'prfs': (array([0.85      , 0.85714286]),
   array([0.94444444, 0.66666667]),
   array([0.89473684, 0.75      ]),
   array([18,  9])),
  'mcc': 0.6573757351339166},
 'time': 0.09433794021606445,
 'model': AdaBoostClassifier(),
 'confusion_matrix': array([[17,  1],
        [ 3,  6]]),
 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.85      0

In [75]:
fold1 = fold_values(kfold_df["model_fold_1"],1)
fold2 = fold_values(kfold_df["model_fold_2"],2)
fold3 = fold_values(kfold_df["model_fold_3"],3)
fold4 = fold_values(kfold_df["model_fold_4"],4)
fold5 = fold_values(kfold_df["model_fold_5"],5)

In [76]:
results = kfold_evaluation(fold1,fold2,fold3,fold4,fold5)
results.sort_values("F1 Score", ascending=False)

Unnamed: 0,Model,F1 Score,Precision,Recall
20,RandomForestClassifier,0.925732,0.934126,0.924501
12,LogisticRegression,0.917622,0.928908,0.916809
13,LogisticRegressionCV,0.909842,0.919046,0.908832
21,RidgeClassifier,0.909681,0.920791,0.908832
3,CalibratedClassifierCV,0.909681,0.920791,0.908832
4,ComplementNB,0.903986,0.924539,0.901994
11,LinearSVC,0.902489,0.915336,0.901425
19,Perceptron,0.902034,0.910499,0.90114
23,SVC,0.895945,0.91674,0.894302
8,ExtraTreesClassifier,0.895605,0.905095,0.894302


# MA

In [1]:
import pandas as pd
import json
import s3fs
from functools import reduce
#https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
from ipynb.fs.full.eval_metrics import *
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
torch.cuda.is_available()

True

In [2]:
# Opening JSON file
with open('../credentials.json', 'r') as openfile:
 
    # Reading from json file
    json_object = json.load(openfile)
    key = json_object["key"]
    secret = json_object["secret_key"]
    bucket_name = json_object["bucket_name"]

s3 = s3fs.S3FileSystem(anon=False,key=key,secret=secret)

In [28]:
train_test_number = [1,2,3,4,5]
train_file_name = "train.csv"
test_file_name = "test.csv"
kfold_df = pd.DataFrame()

for train_index in train_test_number:
    print(f"Fold Number: {train_index}")
    # Read data
    with s3.open(f"{bucket_name}/KFOLD{train_index}/{train_file_name}",'r') as file:
        data = pd.read_csv(file)
    with s3.open(f"{bucket_name}/KFOLD{train_index}/{test_file_name}",'r') as file:
        test_data = pd.read_csv(file)

    train_text = data['Text']
    train_labels = data['majority_vote']
    test_text = test_data['Text']
    test_labels = test_data['majority_vote']
    
        # Tokenize the words
    train_text_clean = train_text.apply(nltk.word_tokenize)
    test_text_clean = test_text.apply(nltk.word_tokenize)

    # Remove stop words
    stop_words=set(nltk.corpus.stopwords.words("german"))
    train_text_clean = train_text_clean.apply(lambda x: [item for item in x if item not in stop_words])
    test_text_clean = test_text_clean.apply(lambda x: [item for item in x if item not in stop_words])

    # Remove numbers, punctuation and special characters (only keep words)
    regex = '[a-z]+'
    train_text_clean = train_text_clean.apply(lambda x: [item for item in x if re.match(regex, item)])
    test_text_clean = test_text_clean.apply(lambda x: [item for item in x if re.match(regex, item)])

    # Lemmatization
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    train_text_clean = train_text_clean.apply(lambda x: [lem.lemmatize(item, pos='v') for item in x])
    test_text_clean = test_text_clean.apply(lambda x: [lem.lemmatize(item, pos='v') for item in x])

    # Join the words again to form sentences
    train_text_clean = train_text_clean.apply(lambda x: " ".join(x))
    test_text_clean = test_text_clean.apply(lambda x: " ".join(x))
    
    # Tfidf vectorization
    vectorizer = TfidfVectorizer()

    x_train = vectorizer.fit_transform(train_text)
    x_test = vectorizer.transform(test_text) #hier mit oder ohne fit?
    y_train = train_labels
    y_test = test_labels

    lazy_text = LazyTextPredict(
        classification_type="binary",
        custom_metric=my_custom_metric
        )
    models = lazy_text.fit(x_train, x_test, y_train, y_test)
    kfold_df[f"model_fold_{train_index}"] = models

Fold Number: 1


Output()

Fold Number: 2


Output()

Fold Number: 3


Output()

Fold Number: 4


Output()

Fold Number: 5


Output()

In [56]:
fold1 = fold_values(kfold_df["model_fold_1"],1)
fold2 = fold_values(kfold_df["model_fold_2"],2)
fold3 = fold_values(kfold_df["model_fold_3"],3)
fold4 = fold_values(kfold_df["model_fold_4"],4)
fold5 = fold_values(kfold_df["model_fold_5"],5)

results = kfold_evaluation(fold1,fold2,fold3,fold4,fold5)
results.sort_values("F1 Score", ascending=False)

Unnamed: 0,Model,F1 Score,Precision,Recall
18,PassiveAggressiveClassifier,0.715325,0.712826,0.725418
3,CalibratedClassifierCV,0.710047,0.70809,0.721936
13,LogisticRegressionCV,0.704803,0.706931,0.724534
14,MLPClassifier,0.703967,0.701386,0.717622
22,SGDClassifier,0.70319,0.699756,0.712442
19,Perceptron,0.702857,0.699447,0.711591
11,LinearSVC,0.701483,0.700334,0.718492
21,RidgeClassifier,0.694784,0.701384,0.722809
16,NearestCentroid,0.682714,0.735938,0.671033
17,NuSVC,0.675703,0.691322,0.715902
