In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import spacy
from spacy.lang.en import English

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
import joblib

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from ast import literal_eval

In [42]:
data_train = pd.read_csv("../../data/data_with_features/data_train_with_features.csv").drop(["Unnamed: 0"], axis=1)
data_test = pd.read_csv("../../data/data_with_features/data_test_with_features.csv").drop(["Unnamed: 0"], axis=1)

data_train = data_train.sample(frac=1).reset_index(drop=True)
data_test = data_test.sample(frac=1).reset_index(drop=True)

In [43]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data_train["text"])

X_train, X_test, y_train, y_test = train_test_split(features, data_train["classification"], test_size=0.2, random_state=42)

model = svm.SVC(probability=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  Scientific       1.00      0.98      0.99        99
        news       0.92      1.00      0.96        98
     reviews       0.98      0.94      0.96       116
       story       1.00      0.98      0.99        87

    accuracy                           0.97       400
   macro avg       0.97      0.97      0.97       400
weighted avg       0.97      0.97      0.97       400



In [44]:
new_text = data_test["text"][12]
new_text_features = vectorizer.transform([new_text])
probabilities = model.predict_proba(new_text_features)
true_class = data_test["classification"][12]
predicted_class = model.predict(new_text_features)

# Wahrscheinlichkeiten und vorhergesagte Klasse ausgeben
for i, probs in enumerate(probabilities):
    class_probabilities = ["{:.2f}%".format(prob * 100) for prob in probs]
    print("Klasse {}: {}".format(predicted_class, class_probabilities))
print("Vorhergesagte Klasse:", predicted_class, " | Wahre Klasse:", true_class)

Klasse ['news']: ['0.66%', '96.92%', '1.84%', '0.58%']
Vorhergesagte Klasse: ['news']  | Wahre Klasse: news


In [48]:
data_train = pd.read_csv("../../data/data_with_features/data_train_with_features.csv").drop(["Unnamed: 0"], axis=1)
data_test = pd.read_csv("../../data/data_with_features/data_test_with_features.csv").drop(["Unnamed: 0"], axis=1)

data_train = data_train.sample(frac=1).reset_index(drop=True)
data_test = data_test.sample(frac=1).reset_index(drop=True)

In [49]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(data_train["text"])

joblib.dump(vectorizer, "../../models/classification/vectorizer_1.joblib")

X_train, X_test, y_train, y_test = train_test_split(features, data_train["classification"], test_size=0.2, random_state=42)

In [50]:
from keras.layers import LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.layers import Embedding

In [51]:
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(features.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(4, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [52]:
label_to_int = {label: i for i, label in enumerate(np.unique(data_train["classification"]))}
y_train = np.array([label_to_int[label] for label in y_train])
y_test = np.array([label_to_int[label] for label in y_test])

In [53]:
model.fit(X_train.toarray(), y_train, epochs=10, batch_size=16, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [33]:
loss, accuracy = model.evaluate(X_test.toarray(), y_test, verbose=1)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.012417732737958431
Test Accuracy: 0.9975000023841858


In [60]:
test_number = 78
new_text = data_test["text"][test_number]
new_class = data_test["classification"][test_number]
new_text_features = vectorizer.transform([new_text])

predictions = model.predict(new_text_features.toarray())
predicted_class = np.argmax(predictions, axis=1)
predicted_probability = np.max(predictions, axis=1)

int_to_label = {i: label for label, i in label_to_int.items()}

predicted_labels = [int_to_label[prediction] for prediction in predicted_class]
for label, probability in zip(predicted_labels, predicted_probability):
    print(f"Vorhergesagte Klasse: {label}, Wahrscheinlichkeit: {probability}, Wahre Klasse: {new_class}. ", predicted_class)

Vorhergesagte Klasse: reviews, Wahrscheinlichkeit: 0.9768052101135254, Wahre Klasse: reviews.  [2]


In [166]:
# mit Allem: 99% Scientific
# ohne Anhang & Vezeichnisse: 99% Scientific

In [37]:
model.save("../../models/classification/neuro_net_1.h5")

In [64]:
data_test["text"][12]

'After years of searching, which ended with a 400 or so page photocopied version with missing pages, I found this book. This book is the only in depth and detailed english language record of almost all vegetation on Okinawa and the Ryukyu islands that I know of. The information within is in both Japanese (an older style) and English. I know of no other place that this book exists aside from the post library on Torii Station.'

In [65]:
data_test["classification"][12]

'reviews'