In [4]:
import pandas as pd
import spacy
from collections import Counter
import pandas as pd 
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

### AutoTrain

In [5]:
class trainMan:
    def __init__(self, path, 
                 max_depthVal=None, max_depthMax=200, max_depthMin=0, 
                 min_sample_splitVal = 2, min_sample_splitMax=10, min_sample_splitMin=2, 
                 clf=None, criterionVal="entropy"):
        self.path = path
        self.trainList = self.__data()
        self.max_depthVal = max_depthVal
        self.max_depthMax = max_depthMax
        self.max_depthMin=max_depthMin
        self.min_sample_splitMax = min_sample_splitMax
        self.min_sample_splitMin = min_sample_splitMin
        self.min_sample_splitVal = min_sample_splitVal
        self.clf = clf
        self.criterionVal = criterionVal
        self.results = np.empty((3, max_depthMax-max_depthMin, min_sample_splitMax-min_sample_splitMin, 3))
    def __data(self):
        self.data = pd.read_csv(self.path)
        columnsLength = self.data.shape[1]-1
        X = self.data.iloc[:,0:columnsLength]
        y = self.data[["group"]].values.ravel()
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        return [self.X_train, self.X_test, self.y_train, self.y_test]
    def get_X_train(self):
        """Gibt X_train zurück."""
        return self.X_train
    def out(self):
        if(self.clf == None):
            print("Modell ohne Parameter wird angelegt, da keines vorhanden:")
            self.clf = DecisionTreeClassifier(random_state=42)
            self.clf = self.clf.fit(self.trainList[0], self.trainList[2])
        predict = self.clf.predict(self.trainList[1])
        report = classification_report(self.trainList[3], predict, target_names=["Atheism", "Religion"])
        print(report)
        print("F-Score: ",f1_score(predict, self.trainList[3], average='macro'))
        print("Precision: ",precision_score(predict, self.trainList[3], average='macro'))
        print("Recall: ",recall_score(predict, self.trainList[3], average='macro'))
    def max_depth(self, min, max):
        self.max_depthMax = max
        self.max_depthMin = min
        f = 0
        for i in range(self.max_depthMax):
            if(i<self.max_depthMin):
                continue
            clf = DecisionTreeClassifier(random_state=42, criterion=self.criterionVal, max_depth=i, min_samples_split=self.min_sample_splitVal)
            clf = clf.fit(self.trainList[0], self.trainList[2])
            predict = clf.predict(self.trainList[1])
            if f1_score(predict, self.trainList[3], average='macro')>f:
                f = f1_score(predict, self.trainList[3], average='macro')
                self.max_depthVal = i
                self.clf =  DecisionTreeClassifier(random_state=42, criterion=self.criterionVal, max_depth=i, min_samples_split=self.min_sample_splitVal)
                self.clf = self.clf.fit(self.trainList[0], self.trainList[2])      
        return self.max_depthVal

### Laden Preprocess dateien

In [6]:
data_train = pd.read_csv("preprocessed/dataLemmaLowerStop_train.csv", on_bad_lines='skip', sep=';')
data_test = pd.read_csv("preprocessed/dataLemmaLowerStop_test.csv", on_bad_lines='skip', sep=';')

X_train = data_train['text']
y_train = data_train['group']
X_test =  data_test['text']
y_test = data_test['group']


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(2624,)
(2624,)
(656,)
(656,)


### Aufteilung data nach Zielklassen

In [7]:
# Trainingsdaten nach Zielklassen aufteilen und in X, y zerlegen
dataAth_train = data_train[data_train["group"] == 0]
X_Ath_train, y_Ath_train = dataAth_train['text'], dataAth_train['group']

dataGraphics_train = data_train[data_train["group"] == 1]
X_Graphics_train, y_Graphics_train = dataGraphics_train['text'], dataGraphics_train['group']

dataSpace_train = data_train[data_train["group"] == 2]
X_Space_train, y_Space_train = dataSpace_train['text'], dataSpace_train['group']

dataReli_train = data_train[data_train["group"] == 3]
X_Reli_train, y_Reli_train = dataReli_train['text'], dataReli_train['group']

# grouped_data_train enthält die DataFrames für jede Gruppe
grouped_data_train = [dataAth_train, dataGraphics_train, dataSpace_train, dataReli_train]

# Testdaten nach Zielklassen aufteilen und in X, y zerlegen
dataAth_test = data_test[data_test["group"] == 0]
X_Ath_test, y_Ath_test = dataAth_test['text'], dataAth_test['group']

dataGraphics_test = data_test[data_test["group"] == 1]
X_Graphics_test, y_Graphics_test = dataGraphics_test['text'], dataGraphics_test['group']

dataSpace_test = data_test[data_test["group"] == 2]
X_Space_test, y_Space_test = dataSpace_test['text'], dataSpace_test['group']

dataReli_test = data_test[data_test["group"] == 3]
X_Reli_test, y_Reli_test = dataReli_test['text'], dataReli_test['group']

# grouped_data_test enthält die DataFrames für jede Gruppe (Testdaten)
grouped_data_test = [dataAth_test, dataGraphics_test, dataSpace_test, dataReli_test]




### Option 1: Liste Wörter welche einen bestimmtem threshold haben erzeugen (für Weiterverarbietung)

Problematik:
- Diee max_tfidf_scores enhält jeweils den maximalen TF-IDF-Wert eines Wortes spiegelt nur seine Relevanz in einem einzigen Dokument wider, ohne zu berücksichtigen, ob das Wort in anderen Dokumenten relevant ist oder zur Klassifikation beiträgt.
=> Lösungs IDEE: Klassen trennen und den Durchschnitsswert nehmen, von den Documenten in denen der TFIDF-Wert nicht null ist

In [None]:
threshold = 0.4

# Funktion zur Berechnung der relevanten Wörter basierend auf TF-IDF
def get_top_words(X, threshold):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(X)

    features = vectorizer.get_feature_names_out()
    
    max_tfidf_scores = tfidf_matrix.max(axis=0).toarray().flatten()

    top_words = [features[i] for i in range(len(features)) if max_tfidf_scores[i] > threshold]

    return list(set(top_words)), max_tfidf_scores

top_words_train, max_tfidf_scores_train = get_top_words(X_train, threshold)


print("Relevante Wörter für Trainingsdaten:\n", top_words_train)
print("\nLänge (Train):\n", len(top_words_train))
print("\ntfidf_scores (Train):\n", max_tfidf_scores_train)



### Speichrn TFIDF-Matrix in csv
### Optional

In [None]:
#fidf_matrix_df.to_csv("tfidf_matrix.csv", index=False)
#print("TF-IDF-Matrix wurde gespeichert.")
#tfidf_matrix_df.head

### Option 2: Liste n-Wörter pro Kategorie mit dem höchsten tfidf Wert erzeugen (für Weiterverarbietung)

Weitere Ansätze 
- Ansatz mit avg_tfidf_scores ausprobieren

In [None]:
def get_top_n_words_per_category(n=200, grouped_data=None):
    # Initialisiere den TfidfVectorizer
    #vectorizer = TfidfVectorizer(min_df=1)
    vectorizer = TfidfVectorizer(
         max_df=0.8,
         min_df=2,
         ngram_range=(1, 2), #TODO Anpassen für kontext => bisher (1,1) bestes Ergebnis
         stop_words='english',
         max_features=5000
    )

    top_words_total = []
    top_words_by_group = {}

    # Iteriere durch jedes DataFrame in der gruppierten Datenliste
    for group_data in grouped_data:
        if 'text' not in group_data.columns:
            print("Textspalte fehlt in den Daten.")
            continue
        
        group_tfidf_matrix = vectorizer.fit_transform(group_data['text'])  # Berechne TF-IDF-Matrix

        features = vectorizer.get_feature_names_out()
        
        max_tfidf_scores = group_tfidf_matrix.max(axis=0).toarray().flatten()

        top_n_indices = max_tfidf_scores.argsort()[-n:][::-1]  # Indices der n höchsten Scores
        
        top_words_group = [features[i] for i in top_n_indices]
        top_words_by_group[group_data['group'].iloc[0]] = top_words_group  # Speichere die Top-Wörter pro Gruppe

        top_words_total.extend(top_words_group)

    top_words_total = list(set(top_words_total))

    return top_words_by_group, top_words_total

### Abspeichern Wörter-Header in CSV (test und train seperat)

In [None]:
top_words_by_group_train, top_words_total_train = get_top_n_words_per_category(200, grouped_data_train)

print("Relevante Wörter für Trainingsdaten:\n", top_words_total_train)
print("\nLänge (Train):\n", len(top_words_total_train))

#Abspeicher _train Datei
freqList_train = top_words_total_train+["group"]
freqListData_train = pd.DataFrame(columns=freqList_train)
freqListData_train.to_csv("csv/tfidf_mostCommon/20newsgroups_tfidf_200_per_category_train.csv", index=False)
freqListData_train.to_csv("csv/tfidf_mostCommon/20newsgroups_tfidf_200_per_category_test.csv", index=False)
freqListData_train.head()


Relevante Wörter für Trainingsdaten:
 ['mechanism', 'centaur', 'mithras', 'time', 'cjf', 'remainder', 'comp', 'graphig', 'version', 'baptize', 'jpg', 'stop', 'motivate', 'abekas', 'xv', 'love', 'breathe', 'pertain', 'car', 'left', 'exhibit', 'stan', 'seven', 'minivas', 'rich', 'renderman', 'paintbrush', 'tyour', 'graphics', 'ascii', 'tup', 'povray', 'pay', 'interrupt', 'camel', 'vmode', 'distance', 'methodology', 'newsgroup', 'sco', 'vpic60', 'servant', 'arm', 'hole', 'hillary', 'prince', 'piper', 'bhagwans', 'wrong', 'group', 'invasion', 'godless', 'ultrix', 'revelation', 'sure', 'positive', 'compile', 'tim', 'wow', 'respect', 'surrender', 'imagine', 'magi', 'weight', 'think', 'ezekiel', 'sei', 'cview', 'want', 'apparent', 'file', 'temperature', 'lucifer', 'color', 'depression', 'ksu', 'gravitational', 'raoul', 'david', 'tto', 'magellan', 'taoism', 'buggy', 'lewb', 'coke', 'usenet', 'rw', 'weitek', 'edge', 'chuck', 'groothuis', 'yo', 'theism', 'trail', 'elf', 'americans', 'presentatio

Unnamed: 0,mechanism,centaur,mithras,time,cjf,remainder,comp,graphig,version,baptize,...,print,delete,tiger,ics,arguer,envelope,irony,conform,caste,group


### Wörter Zählen alle (Nur klein schreibung)

In [None]:
nlp = spacy.load('en_core_web_sm')

# Funktion zur Berechnung der Wortfrequenzen und Speicherung in einer CSV
def calculate_and_save_word_frequencies(data, freqList, filename):
    freqListData_list = []

    for index, row in data.iterrows():
        text = str(row['text'])
        group = str(row['group'])
        doc = nlp(text)

        count_in_freqList = Counter(token.text.lower() for token in doc)

        haeufigkeiten = {word: count_in_freqList.get(word.lower(), 0) for word in freqList}
        haeufigkeiten['group'] = group

        freqListData_list.append(haeufigkeiten)

    freqListData = pd.DataFrame(freqListData_list)

    freqListData.to_csv(filename, index=False)

    return freqListData



train_data_file = "csv/tfidf_mostCommon/20newsgroups_tfidf_200_per_category_train.csv"
freqListData_train = calculate_and_save_word_frequencies(data_train, freqList_train, train_data_file)

test_data_file = "csv/tfidf_mostCommon/20newsgroups_tfidf_200_per_category_test.csv"
freqListData_test = calculate_and_save_word_frequencies(data_test, freqList_train, test_data_file)

print("Train-Daten Häufigkeiten:\n", freqListData_train)
print("\nTest-Daten Häufigkeiten:\n", freqListData_test)

Train-Daten Häufigkeiten:
       mechanism  centaur  mithras  time  cjf  remainder  comp  graphig  \
0             0        0        0     0    0          0     0        0   
1             0        0        0     0    0          0     0        0   
2             0        0        0     0    0          0     0        0   
3             0        0        0     0    0          0     0        0   
4             0        0        0     2    0          0     0        0   
...         ...      ...      ...   ...  ...        ...   ...      ...   
2619          0        0        0     0    0          0     0        0   
2620          0        0        0     0    0          0     0        0   
2621          0        0        0     1    0          0     0        0   
2622          0        0        0     0    0          0     0        0   
2623          0        0        0     0    0          0     0        0   

      version  baptize  ...  moment  print  delete  tiger  ics  arguer  \
0         

### CLF Trainieren

In [None]:
X_train = freqListData_train.drop("group", axis=1)  # Features
y_train = freqListData_train[["group"]].values.ravel()  # Zielwerte
X_test = freqListData_test.drop("group", axis=1)  # Features
y_test = freqListData_test[["group"]].values.ravel()  # Zielwerte


clf = DecisionTreeClassifier(
    random_state=42, 
    criterion="entropy", 
    max_depth=130
)

clf = clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

print("Macro Averaging:")
print("F-Score: ", f1_score(y_test, y_predict, average="macro"))
print("Precision: ", precision_score(y_test, y_predict, average="macro"))
print("Recall: ", recall_score(y_test, y_predict, average="macro"))

print("\nWeighted Averaging:")
print("F-Score: ", f1_score(y_test, y_predict, average="weighted"))
print("Precision: ", precision_score(y_test, y_predict, average="weighted"))
print("Recall: ", recall_score(y_test, y_predict, average="weighted"))

report = classification_report(y_test, y_predict, target_names=["atheism", "graphics", "space", "religion"])
print("\n"+report)


Macro Averaging:
F-Score:  0.6002000920941299
Precision:  0.6073401235468071
Recall:  0.5986424672718854

Weighted Averaging:
F-Score:  0.6118819717656845
Precision:  0.6170700317347014
Recall:  0.6128048780487805

              precision    recall  f1-score   support

     atheism       0.59      0.50      0.54       155
    graphics       0.74      0.67      0.71       190
       space       0.58      0.71      0.64       190
    religion       0.53      0.51      0.52       121

    accuracy                           0.61       656
   macro avg       0.61      0.60      0.60       656
weighted avg       0.62      0.61      0.61       656

