In [20]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np
import random
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV


In [None]:
# Load the matching (positive) Amazon/Otto product pairs from a gzip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
df_positive = pd.read_pickle(
    'daten/otto_amazon_variation_pairs.pkl',
    compression='gzip',
)
# Preview the first two rows.
df_positive.head(2)

In [None]:
# Load the non-matching (negative) Amazon/Otto product pairs from the Excel sheet.
df_negative = pd.read_excel(
    'daten/negative_Otto_amazon.xlsx',
)
# Preview the first two rows.
df_negative.head(2)

In [45]:
def _pairwise_abs_difference(examples_a, examples_b, model, batch_size=32):
    """Encode two aligned text collections and return |emb_a - emb_b| element-wise."""
    embeddings_a = np.asarray(model.encode(examples_a, batch_size=batch_size))
    embeddings_b = np.asarray(model.encode(examples_b, batch_size=batch_size))

    # Guard against silent NumPy broadcasting: a single-row side would otherwise
    # be subtracted from every row of the other side and yield bogus features.
    if embeddings_a.shape != embeddings_b.shape:
        raise ValueError(
            "Paired examples must produce embeddings of identical shape, got "
            f"{embeddings_a.shape} and {embeddings_b.shape}"
        )

    return np.abs(embeddings_a - embeddings_b)


def sentece_embeding(positive_examples_a, positive_examples_b, negative_examples_a, negative_examples_b, model, batch_size=32):
    """Build pair features as the absolute difference of sentence embeddings.

    Parameters
    ----------
    positive_examples_a, positive_examples_b
        Aligned texts of the positive pairs; element i of ``a`` is paired with
        element i of ``b``.
    negative_examples_a, negative_examples_b
        Aligned texts of the negative pairs, paired the same way.
    model
        Object exposing ``encode(texts, batch_size=...)`` that returns one
        embedding per input text.
    batch_size : int, default 32
        Batch size forwarded to every ``model.encode`` call.

    Returns
    -------
    tuple
        ``(positive_features, negative_features)``; each entry is
        ``np.abs(embeddings_a - embeddings_b)`` for the respective pairs.

    Raises
    ------
    ValueError
        If the two sides of a pair collection encode to different shapes.
    """
    positive_features = _pairwise_abs_difference(
        positive_examples_a, positive_examples_b, model, batch_size
    )
    negative_features = _pairwise_abs_difference(
        negative_examples_a, negative_examples_b, model, batch_size
    )

    return positive_features, negative_features

In [39]:
def featureXlabels(positive_features, negative_features, test_size, random_state=42, stratify=False):
    """Label the pair features and split them into train and test sets.

    Positive features are labelled 1 and negative features 0; both are
    concatenated (positives first) and passed to ``train_test_split``.

    Parameters
    ----------
    positive_features, negative_features
        Array-likes with one feature row per pair; they must be concatenable
        along axis 0.
    test_size
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : default 42
        Seed forwarded to ``train_test_split``; the default keeps the split
        that was previously hard-coded.
    stratify : bool, default False
        When True, the split preserves the positive/negative ratio in both
        partitions (``stratify=y``). The default keeps the original
        non-stratified split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    positive_labels = np.ones(len(positive_features))
    negative_labels = np.zeros(len(negative_features))

    X = np.concatenate([positive_features, negative_features], axis=0)
    y = np.concatenate([positive_labels, negative_labels], axis=0)

    # X and y are already ndarrays after concatenation, so the split parts need
    # no further conversion.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    return X_train, X_test, y_train, y_test



In [56]:
# Single-step pipeline whose 'clf' step is swapped out by the grid search.
pipeline = Pipeline(
    steps=[('clf', DecisionTreeClassifier())],  # Placeholder, will be replaced by GridSearchCV
)

In [57]:
# Candidate estimators as (short name, unfitted estimator) tuples, in insertion order.
# NOTE(review): no cell visible here reads `models`; the grid search takes its
# candidates from `param_grid` - confirm whether this list is still needed.
models = list({
    'dt': DecisionTreeClassifier(),
    'rf': RandomForestClassifier(),
    'svm': SVC(),
    'sgd': SGDClassifier(),
}.items())


In [55]:
# Define the parameter grid
# Each dict substitutes a different estimator for the pipeline's 'clf' step and
# lists the hyper-parameters to try for it (2 + 1 + 2 + 2 = 7 candidates).
# The randomised estimators are seeded so repeated searches give the same scores.
param_grid = [
    {
        'clf': [DecisionTreeClassifier(random_state=42)],
        'clf__max_depth': [5, 11]
    },
    {
        'clf': [RandomForestClassifier(random_state=42)]
        # You can add parameters for RandomForestClassifier here if needed
    },
    {
        'clf': [SVC()],
        'clf__kernel': ['linear', 'rbf']
    },
    {
        'clf': [SGDClassifier(random_state=42)],
        # NOTE(review): newer scikit-learn releases name the logistic loss
        # 'log_loss' instead of 'log' - confirm against the installed version.
        'clf__loss': ['hinge', 'log']
    }
]

In [43]:

# Load the sentence-embedding model from its local output directory
# (presumably a fine-tuned all-MiniLM-L6-v2 checkpoint, judging by the path - TODO confirm).
Sentencetransformer_model = SentenceTransformer(
    "sentenceT/ausgabe/all-MiniLM-L6-v2-similar_otto_amazon_products-2024-03-02/"
)

In [44]:
# Pull both name columns out of each frame as plain Python lists
# ("a" = Amazon product name, "b" = Otto variation name).
df_positive_a, df_positive_b = (
    df_positive[column].to_list()
    for column in ('amazon_product_name', 'otto_variation_name')
)
df_negative_a, df_negative_b = (
    df_negative[column].to_list()
    for column in ('amazon_product_name', 'otto_variation_name')
)

In [46]:
# Encode every pair and keep the absolute embedding difference as its feature vector.
positive_features, negative_features = sentece_embeding(
    positive_examples_a=df_positive_a,
    positive_examples_b=df_positive_b,
    negative_examples_a=df_negative_a,
    negative_examples_b=df_negative_b,
    model=Sentencetransformer_model,
)

In [48]:
# Label the features and hold out 20 % of the pairs for testing.
X_train, X_test, y_train, y_test = featureXlabels(
    positive_features=positive_features,
    negative_features=negative_features,
    test_size=0.2,
)

In [58]:
# 5-fold cross-validated search over every candidate in `param_grid`.
# No `scoring` is passed, so each estimator's default score is used.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] END .....clf=DecisionTreeClassifier(), clf__max_depth=5; total time=  28.4s
[CV] END .....clf=DecisionTreeClassifier(), clf__max_depth=5; total time=  27.9s
[CV] END .....clf=DecisionTreeClassifier(), clf__max_depth=5; total time=  27.9s
[CV] END .....clf=DecisionTreeClassifier(), clf__max_depth=5; total time=  27.9s
[CV] END .....clf=DecisionTreeClassifier(), clf__max_depth=5; total time=  27.9s
[CV] END ....clf=DecisionTreeClassifier(), clf__max_depth=11; total time=  56.3s
[CV] END ....clf=DecisionTreeClassifier(), clf__max_depth=11; total time=  56.3s
[CV] END ....clf=DecisionTreeClassifier(), clf__max_depth=11; total time=  56.2s
[CV] END ....clf=DecisionTreeClassifier(), clf__max_depth=11; total time=  56.4s
[CV] END ....clf=DecisionTreeClassifier(), clf__max_depth=11; total time=  56.4s
[CV] END .......................clf=RandomForestClassifier(); total time=10.1min
[CV] END .......................clf=RandomForestC

In [None]:
# Report the winning candidate and its cross-validation score from the search.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Beste Parameter:", best_params)
print("Beste Genauigkeit:", best_score)
