Baseline Models

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on Drive the working directory; the relative
# workbook names passed to read_excel later are resolved against it.
%cd /content/drive/MyDrive/Thesis/

/content/drive/MyDrive/Thesis


In [None]:
import numpy as np
import pandas as pd
from pandas import read_excel
from sklearn.model_selection import train_test_split
import seaborn as sns
import re

In [None]:
# Workbook with the sentences and their topic labels (read from the working directory).
ANNOTATED_DATA_FILE = "complete data after manual annotation.xlsx"
df = read_excel(ANNOTATED_DATA_FILE)

In [None]:
# Fold the class-management topic into the pedagogy topic.
# Series.replace with a mapping swaps cells whose whole value equals the key.
# NOTE(review): the topic names shown in the classification reports further down
# start with "_" — confirm the key below matches the label text in the workbook,
# otherwise this replacement silently does nothing.
df['Topics'] = df['Topics'].replace({
    'klassenmanagment_klassenmanagement_unterrichtsstörung_unterrichtsstunde':
        'pädagogischen_pädagogisch_pädagogische_diagnostisch',
})

In [None]:
# Fold the concentration topic into the task topic.
# Series.replace with a mapping swaps cells whose whole value equals the key.
# NOTE(review): the topic names shown in the classification reports further down
# start with "_" — confirm the key below matches the label text in the workbook,
# otherwise this replacement silently does nothing.
df['Topics'] = df['Topics'].replace({
    'konzentrieren_konzentriert_bearbeitungszeit_aufgabenbearbeitung':
        'aufgabe_schreiben_musterlösung_thematik',
})

In [None]:
# Features stay a one-column DataFrame (double brackets); labels are a Series.
X, y = df[['Sentences']], df['Topics']

In [None]:
# Hold out 20% of the rows as the test split.
# A fixed random_state makes the shuffled split — and every score computed from
# it — repeatable across kernel restarts; without it each run trains and
# evaluates on a different partition, so the recorded numbers cannot be reproduced.
# The seed value matches the one passed to LogisticRegression further down.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=40
)

In [None]:
# Optional augmentation of the training split only; the test split is not touched.
# Original note: augmentation with pp and videos combined.
# NOTE(review): this cell rebinds X_train / y_train, so running it a second time
# appends the augmentation rows again — re-run the split cell before re-running it.
USE_DATA_AUGMENTATION = True

if USE_DATA_AUGMENTATION:
  df_aug = read_excel("data_aug_using_duplicates_after_MA.xlsx")
  X_aug, y_aug = df_aug[['Sentences']], df_aug['Topics']

  # Row-wise concatenation; features and labels are appended in the same order
  # so they stay aligned. Index labels of both parts are kept as they are.
  X_train_new = pd.concat([X_train, X_aug], axis=0)
  y_train_new = pd.concat([y_train, y_aug], axis=0)
  X_train, y_train = X_train_new, y_train_new


In [None]:
def clean_text(text):
    """Normalise one sentence for bag-of-words vectorisation.

    The substitutions run in this order:
      1. anything that looks like a markup tag (``<...>``) becomes a space;
      2. every character that is not a letter in A-Z / a-z / À-ž, a digit,
         one of ``, . ! ?`` or a space becomes a space;
      3. each of ``! ? . , ; -`` is padded with a space on both sides
         (``;`` and ``-`` have already been blanked by step 2, so in
         practice only ``! ? . ,`` are affected);
      4. runs of whitespace collapse to a single space.

    The result is not stripped, so it may start or end with a single space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    pipeline = (
        (re.compile(r"<[^>]+>"), " "),
        (re.compile(r"[^A-Za-zÀ-ž,.!?0-9 ]", re.IGNORECASE), " "),
        (re.compile("([!?.,;-])"), r" \1 "),
        (re.compile(r"\s+", re.IGNORECASE), " "),
    )
    cleaned = text
    for pattern, replacement in pipeline:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned

def _clean_if_text(value):
    """Run clean_text on str values; return any other value unchanged."""
    return clean_text(value) if isinstance(value, str) else value


# Clean the sentence column of both splits in place with the same helper.
for split_frame in (X_train, X_test):
    split_frame["Sentences"] = split_frame["Sentences"].map(_clean_if_text)

X_train.head()

Unnamed: 0,Sentences
10489,Vor der Bearbeitung der Aufgaben habe ich habe...
5671,Ich habe zuerst das zugehörige Material gesich...
17240,Warum also ändert sich der Standard von Schule...
16595,So wies zum Beispiel ein Kommentar von Derek ...
19009,Dort wurde lediglich das Wissen als essentiell...


In [None]:
def _sentences_as_unicode(frame):
    """Return the 'Sentences' column of *frame* as a list of unicode strings.

    The cast to dtype 'U' turns every cell into text, so non-string cells are
    stringified rather than dropped.
    """
    return list(frame['Sentences'].values.astype('U'))


temp_train = _sentences_as_unicode(X_train)
# Not the last expression of the cell, so this value is not displayed.
len(temp_train)
temp_test = _sentences_as_unicode(X_test)

In [None]:
# Using tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Default settings; the vectorizer is fitted on the training sentences only.
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(temp_train)
# The test sentences are transformed with the already-fitted vectorizer (no refit).
X_test_vectors_tfidf = tfidf_vectorizer.transform(temp_test)

# Shape of the training matrix: (number of training sentences, number of features).
X_train_vectors_tfidf.shape

(15758, 17303)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

# Hyper-parameter grid for the SVM, split per kernel.
# `gamma` has no effect on the linear kernel, so crossing it with kernel='linear'
# only repeats the same linear fit once per gamma value (the CV log shows identical
# scores for linear at gamma=1 and gamma=0.1). One sub-grid per kernel keeps every
# distinct candidate and drops the duplicates: 20 candidates instead of 32.
# Note: if a linear candidate wins, grid.best_params_ contains no 'gamma' key.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
]
# refit=True retrains the best candidate on the whole training set after the search;
# verbose=3 logs the score and duration of every fold.
grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)

In [None]:
# Run the cross-validated search on the TF-IDF training matrix.
# NOTE(review): the log below shows roughly a minute per fit, so this cell is slow to re-run.
grid.fit(X_train_vectors_tfidf, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.417 total time= 1.0min
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.406 total time= 1.0min
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.411 total time= 1.0min
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.410 total time= 1.0min
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.412 total time= 1.0min
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.296 total time= 1.4min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.290 total time= 1.4min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.296 total time= 1.5min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.294 total time= 1.4min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.287 total time= 1.4min
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.417 total time= 1.0min
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear

In [None]:
# Hyper-parameter combination with the best cross-validation score.
print(grid.best_params_)

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [None]:
# training svm classifier
from sklearn import svm
# The hyper-parameters are hard-coded to the combination printed from
# grid.best_params_ above, so this cell does not need the grid search to have run.
SVM = svm.SVC(C=10, kernel='rbf', gamma=0.1)
# fit() returns the estimator itself, so text_clf_svm and SVM name the same object.
text_clf_svm = SVM.fit(X_train_vectors_tfidf, y_train)

In [None]:
# Testing classifier and checking accuracy
predicted_svm = text_clf_svm.predict(X_test_vectors_tfidf)
# Fraction of test rows whose predicted label equals the true label.
np.mean(predicted_svm == y_test)

0.7390862944162436

In [None]:
# Reuse the SVM test predictions already stored in predicted_svm instead of calling
# predict() a second time on the same fitted model and the same test matrix;
# the values are identical and the repeated kernel evaluation is avoided.
y_pred = predicted_svm

In [None]:
from sklearn import metrics
# Accuracy of the predictions held in y_pred against the test labels.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7390862944162436


In [None]:
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             f1_score, cohen_kappa_score, confusion_matrix,
                             classification_report)

# Evaluation of the SVM predictions on the held-out split.
# zero_division=0: a class that yields a 0/0 ratio is scored 0.
_weighted = dict(average="weighted", zero_division=0)

accuracy = accuracy_score(y_test, predicted_svm)
recall = recall_score(y_test, predicted_svm, **_weighted)
precision = precision_score(y_test, predicted_svm, **_weighted)
f1_w = f1_score(y_test, predicted_svm, **_weighted)
# Macro F1 averages the per-class scores without weighting by class size.
f1_m = f1_score(y_test, predicted_svm, average="macro", zero_division=0)
cohens_kappa = cohen_kappa_score(y_test, predicted_svm)
cmatrix = confusion_matrix(y_test, predicted_svm)
clf_report = classification_report(y_test, predicted_svm, zero_division=0)

In [None]:
# Headline scores on one line, followed by the per-class report and the confusion matrix.
headline = (
    "Accuracy:", accuracy,
    "Recall:", recall,
    "Precision: ", precision,
    "F1 score-Weighted: ", f1_w,
    "F1 score-Macro: ", f1_m,
    "Cohens kappa: ", cohens_kappa,
)
print(*headline)
print("Classification Report: \n", clf_report, "\nConfusion Matrix: \n", cmatrix)

Accuracy: 0.7390862944162436 Recall: 0.7390862944162436 Precision:  0.730060587013986 F1 score-Weighted:  0.7205842795221769 F1 score-Macro:  0.6305251994356826 Cohens kappa:  0.5652599518625154
Classification Report: 
                                                                                      precision    recall  f1-score   support

_interventionsmaßnahme_interventionsmaßnaher_interventionsaufgabe_präventionsansatz       0.79      0.50      0.61        22
             _portfoliobearbeitung_portfolioaufgabe_vorstrukturierung_portfolioteil       0.74      0.77      0.76        22
                      _powerpoint_powerpointpräsentation_powerpointfolie_gesamtbild       0.89      0.84      0.86        19
                                          _verständlich_reflexion_beispiel_anwenden       0.75      0.89      0.81      2229
                   _vorlesungsvideo_videomaterial_vorlesungsfolie_videoaufzeichnung       0.96      0.74      0.84        31
                             

Naive Bayes Classifier

In [None]:
# training naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
# Default settings, trained on the same TF-IDF training matrix as the SVM.
nb = MultinomialNB()
# fit() returns the estimator itself, so text_clf and nb name the same object.
text_clf = nb.fit(X_train_vectors_tfidf, y_train)

In [None]:
# Naive Bayes predictions for the test split.
# NOTE: y_pred is rebound here; the SVM section assigned the same name earlier.
y_pred = text_clf.predict(X_test_vectors_tfidf)

In [None]:
# Testing classifier and checking accuracy
# Same predict call as the previous cell, stored under a model-specific name.
predicted_nb = text_clf.predict(X_test_vectors_tfidf)
# Fraction of test rows whose predicted label equals the true label.
np.mean(predicted_nb == y_test)

0.5662436548223351

In [None]:
# Relies on `metrics` imported in the SVM section and on y_pred from the Naive Bayes predict cell.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5662436548223351


In [None]:
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             f1_score, cohen_kappa_score, confusion_matrix,
                             classification_report)

# Evaluation of the Naive Bayes predictions on the held-out split.
# zero_division=0: a class that yields a 0/0 ratio is scored 0.
_weighted = dict(average="weighted", zero_division=0)

accuracy = accuracy_score(y_test, predicted_nb)
recall = recall_score(y_test, predicted_nb, **_weighted)
precision = precision_score(y_test, predicted_nb, **_weighted)
f1_w = f1_score(y_test, predicted_nb, **_weighted)
# Macro F1 averages the per-class scores without weighting by class size.
f1_m = f1_score(y_test, predicted_nb, average="macro", zero_division=0)
cohens_kappa = cohen_kappa_score(y_test, predicted_nb)
cmatrix = confusion_matrix(y_test, predicted_nb)
clf_report = classification_report(y_test, predicted_nb, zero_division=0)

In [None]:
# Headline scores on one line, followed by the per-class report and the confusion matrix.
headline = (
    "Accuracy:", accuracy,
    "Recall:", recall,
    "Precision: ", precision,
    "F1 score-Weighted: ", f1_w,
    "F1 score-Macro: ", f1_m,
    "Cohens kappa: ", cohens_kappa,
)
print(*headline)
print("Classification Report: \n", clf_report, "\nConfusion Matrix: \n", cmatrix)


Accuracy: 0.5662436548223351 Recall: 0.5662436548223351 Precision:  0.39988142751966527 F1 score-Weighted:  0.41143472498630734 F1 score-Macro:  0.04009783262130682 Cohens kappa:  0.004043752816789814
Classification Report: 
                                                                                      precision    recall  f1-score   support

_interventionsmaßnahme_interventionsmaßnaher_interventionsaufgabe_präventionsansatz       0.00      0.00      0.00        22
             _portfoliobearbeitung_portfolioaufgabe_vorstrukturierung_portfolioteil       0.00      0.00      0.00        22
                      _powerpoint_powerpointpräsentation_powerpointfolie_gesamtbild       0.00      0.00      0.00        19
                                          _verständlich_reflexion_beispiel_anwenden       0.57      1.00      0.72      2229
                   _vorlesungsvideo_videomaterial_vorlesungsfolie_videoaufzeichnung       0.00      0.00      0.00        31
                       

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# `multi_class='multinomial'` is no longer passed explicitly: the argument is
# deprecated in current scikit-learn releases (it triggers a FutureWarning and is
# scheduled for removal). With the newton-cg solver and a target that has more
# than two classes, the default already fits a multinomial model, so the
# trained classifier is unchanged.
regr = LogisticRegression(C=30.0, solver='newton-cg', random_state=40)
# fit() returns the estimator itself, so text_clf_reg and regr name the same object.
text_clf_reg = regr.fit(X_train_vectors_tfidf, y_train)

In [None]:
# Logistic-regression predictions for the test split.
# NOTE: y_pred is rebound here; earlier sections assigned the same name for other models.
y_pred = text_clf_reg.predict(X_test_vectors_tfidf)

In [None]:
# Testing classifier and checking accuracy
# Same predict call as the previous cell, stored under a model-specific name.
predicted_reg = text_clf_reg.predict(X_test_vectors_tfidf)

In [None]:
# Relies on `metrics` imported in the SVM section and on y_pred from the logistic-regression predict cell.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7208121827411168


In [None]:
# Evaluation of the logistic-regression predictions on the held-out split.
# The metric functions used here were imported in the earlier evaluation cells.
# zero_division=0: a class that yields a 0/0 ratio is scored 0.
_weighted = dict(average="weighted", zero_division=0)

accuracy = accuracy_score(y_test, predicted_reg)
recall = recall_score(y_test, predicted_reg, **_weighted)
precision = precision_score(y_test, predicted_reg, **_weighted)
f1_w = f1_score(y_test, predicted_reg, **_weighted)
# Macro F1 averages the per-class scores without weighting by class size.
f1_m = f1_score(y_test, predicted_reg, average="macro", zero_division=0)
cohens_kappa = cohen_kappa_score(y_test, predicted_reg)
cmatrix = confusion_matrix(y_test, predicted_reg)
clf_report = classification_report(y_test, predicted_reg, zero_division=0)

In [None]:
# Headline scores on one line, followed by the per-class report and the confusion matrix.
headline = (
    "Accuracy:", accuracy,
    "Recall:", recall,
    "Precision: ", precision,
    "F1 score-Weighted: ", f1_w,
    "F1 score-Macro: ", f1_m,
    "Cohens kappa: ", cohens_kappa,
)
print(*headline)
print("Classification Report: \n", clf_report, "\nConfusion Matrix: \n", cmatrix)


Accuracy: 0.7208121827411168 Recall: 0.7208121827411168 Precision:  0.7143444969830899 F1 score-Weighted:  0.704649178840528 F1 score-Macro:  0.5687417981914337 Cohens kappa:  0.5360631673440708
Classification Report: 
                                                                                      precision    recall  f1-score   support

_interventionsmaßnahme_interventionsmaßnaher_interventionsaufgabe_präventionsansatz       0.70      0.32      0.44        22
             _portfoliobearbeitung_portfolioaufgabe_vorstrukturierung_portfolioteil       0.71      0.45      0.56        22
                      _powerpoint_powerpointpräsentation_powerpointfolie_gesamtbild       0.92      0.63      0.75        19
                                          _verständlich_reflexion_beispiel_anwenden       0.74      0.88      0.81      2229
                   _vorlesungsvideo_videomaterial_vorlesungsfolie_videoaufzeichnung       1.00      0.52      0.68        31
                             