In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

df = pd.read_csv('dataset.csv')
df = shuffle(df, random_state=42)

In [2]:
df.head()

Unnamed: 0.1,Unnamed: 0,genre,period,region,text,title,year
260,260,LEGA,P5,WMD,"Neue Sammlung von Gesetzen, Statuten und Veror...",NeueSammlungvonGesetzenStatutenu.Verordnungenf...,1872
228,228,NEWS,P5,WOD,Bern. Die Unglücksstäte an der Stockern=Sandst...,Berner Intelligenzblatt,1869
31,31,LEGA,P6,OOD,Verordnungsblatt\r\nfür den Dienstbereich des\...,VerordnungsblattMinisteriumCultus,1902
583,583,NARR,P4,OMD,"Wer sich vorgenommen, Untergebene zu tadeln, w...",Blutrosen von Augsburg,1842
600,600,SCIE,P4,NoD,Einleitung\r\n\r\nDie Erscheinungen des Lebens...,physiologische Chemie,1844


In [3]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=.2, random_state=42)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words

tf = TfidfVectorizer(stop_words=get_stop_words('de'), max_features=20000)

X_train = tf.fit_transform(df_train.text)
X_test = tf.transform(df_test.text)

In [5]:
y_train = df_train.genre.to_numpy()
y_test = df_test.genre.to_numpy()

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

naive_bayes = MultinomialNB().fit(X_train, y_train)

y_pred = naive_bayes.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.00      0.00      0.00        14
        HUMA       1.00      0.10      0.18        10
        LEGA       0.00      0.00      0.00        20
        NARR       1.00      0.12      0.22         8
        NEWS       0.41      1.00      0.58        48
        SCIE       1.00      0.14      0.24        22
        SERM       0.83      0.83      0.83         6

    accuracy                           0.45       128
   macro avg       0.61      0.31      0.29       128
weighted avg       0.51      0.45      0.33       128



  'precision', 'predicted', average, warn_for)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

log_reg = LogisticRegression().fit(X_train, y_train)

y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

        DRAM       1.00      0.07      0.13        14
        HUMA       1.00      0.40      0.57        10
        LEGA       1.00      0.20      0.33        20
        NARR       0.50      0.38      0.43         8
        NEWS       0.48      1.00      0.65        48
        SCIE       1.00      0.32      0.48        22
        SERM       0.71      0.83      0.77         6

    accuracy                           0.56       128
   macro avg       0.81      0.46      0.48       128
weighted avg       0.76      0.56      0.50       128



In [8]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

lin_svm = LinearSVC(C=2, loss='hinge', penalty='l2').fit(X_train, y_train)

y_pred = lin_svm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       1.00      1.00      1.00        14
        HUMA       0.90      0.90      0.90        10
        LEGA       1.00      0.90      0.95        20
        NARR       0.88      0.88      0.88         8
        NEWS       0.92      1.00      0.96        48
        SCIE       1.00      0.95      0.98        22
        SERM       1.00      0.83      0.91         6

    accuracy                           0.95       128
   macro avg       0.96      0.92      0.94       128
weighted avg       0.96      0.95      0.95       128



In [9]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

lin_svm_sgd = SGDClassifier(loss='hinge',
                            penalty='l2').fit(X_train, y_train)

y_pred = lin_svm_sgd.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Needed {lin_svm_sgd.n_iter_} iterations for convergence')

              precision    recall  f1-score   support

        DRAM       1.00      0.93      0.96        14
        HUMA       0.82      0.90      0.86        10
        LEGA       1.00      0.90      0.95        20
        NARR       0.78      0.88      0.82         8
        NEWS       0.94      1.00      0.97        48
        SCIE       1.00      0.95      0.98        22
        SERM       1.00      0.83      0.91         6

    accuracy                           0.95       128
   macro avg       0.93      0.91      0.92       128
weighted avg       0.95      0.95      0.95       128

Needed 12 iterations for convergence


In [10]:
from sklearn.model_selection import cross_val_score

ten_fold_cv_results = cross_val_score(
    
    LinearSVC(),
    X_train,
    y_train,
    cv=10,
    scoring='f1_macro'
)
ten_fold_cv_results, np.mean(ten_fold_cv_results)

  'precision', 'predicted', average, warn_for)


(array([0.80351593, 0.79154813, 0.9173559 , 0.89574315, 0.82892366,
        0.90285374, 0.94076503, 0.82414966, 0.82121212, 0.83333333]),
 0.8559400653193083)

In [47]:
from sklearn.model_selection import GridSearchCV

lin_svm_sgd_params = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.001, 0.0001, 0.00001, 0.00001],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001]
}

lin_svm_sgd_gridseach = GridSearchCV(SGDClassifier(loss='hinge'),
                                     param_grid=lin_svm_sgd_params,
                                     scoring='f1_macro',
                                     n_jobs=-1,
                                     verbose=1)

lin_svm_sgd_gridseach.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 336 candidates, totalling 1008 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   42.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1008 out of 1008 | elapsed:  3.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     val...ion_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [0.001, 0.0001, 1e-05, 1e-05],
                         'eta0': [0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06,
                                  1e-07],
                

In [48]:
lin_svm_sgd_gridseach.best_params_, lin_svm_sgd_gridseach.best_score_

({'alpha': 0.0001, 'eta0': 0.1, 'learning_rate': 'constant', 'penalty': 'l2'},
 0.8734023325967687)

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

dec_tree = DecisionTreeClassifier().fit(X_train, y_train)

y_pred = dec_tree.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.67      0.46      0.55        13
        HUMA       0.70      0.47      0.56        15
        LEGA       0.67      0.62      0.64        13
        NARR       0.62      0.50      0.55        16
        NEWS       0.67      0.79      0.72        38
        SCIE       0.44      0.39      0.41        18
        SERM       0.61      0.93      0.74        15

    accuracy                           0.62       128
   macro avg       0.62      0.59      0.60       128
weighted avg       0.63      0.62      0.61       128



In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rand_forest = RandomForestClassifier().fit(X_train, y_train)

y_pred = rand_forest.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        DRAM       0.92      0.92      0.92        13
        HUMA       0.50      0.20      0.29        15
        LEGA       0.79      0.85      0.81        13
        NARR       0.91      0.62      0.74        16
        NEWS       0.58      1.00      0.73        38
        SCIE       0.83      0.28      0.42        18
        SERM       0.75      0.60      0.67        15

    accuracy                           0.69       128
   macro avg       0.75      0.64      0.65       128
weighted avg       0.72      0.69      0.66       128





In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rand_forest = RandomForestClassifier().fit(X_train, y_train)

y_pred = rand_forest.predict(X_test)
print(classification_report(y_test, y_pred))