In [1]:
# !!! This section explains why we should not compare two sentences with these methods. !!! 
# It is not efficient to compare two sentences in Turkish.
# Word count based cosine similarity

import numpy as np

def sentence_to_word_dict(sentence):
    """
    This function takes a sentence as input and returns a dictionary with words as keys and their counts as values.
    
    Args:
        sentence: A string.
    
    Returns:
        A dictionary.
    """
    words = sentence.split()
    word_dict = {}
    for word in words:
        if word in word_dict:
            word_dict[word] += 1
        else:
            word_dict[word] = 1
    return word_dict

sentence_1 = "C'nin dikkat ve özen yükümlülüğüne aykırı davranmış olması nedeniyle kusurlu olduğu değerlendirilemez."
sentence_2 = "C'nin dikkat ve özen yükümlülüğüne aykırı davranmış olması nedeniyle kusurlu olduğu değerlendirilebilir."

dict_1 = sentence_to_word_dict(sentence_1)
dict_2 = sentence_to_word_dict(sentence_2)

word_space = np.unique(list(dict_1.keys()) + list(dict_2.keys()))

# One-hot encoding
binary_vector_1 = [1 if word in dict_1 else 0 for word in word_space]
binary_vector_2 = [1 if word in dict_2 else 0 for word in word_space]

print(binary_vector_1)
print(binary_vector_2)

cosine_similarity = np.dot(binary_vector_1, binary_vector_2) / (np.linalg.norm(binary_vector_1) * np.linalg.norm(binary_vector_2))
print("Cosine similarity (%) :", cosine_similarity * 100)


# TF-IDF based cosine similarity
# Not an efficient way
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = [sentence_1,sentence_2]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)
similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
print(similarity)


# Levenshtein distance
def levenshtein_distance(s1, s2):
    len_s1, len_s2 = len(s1) + 1, len(s2) + 1
    dp = np.zeros((len_s1, len_s2))
    for i in range(len_s1):
        dp[i][0] = i
    for j in range(len_s2):
        dp[0][j] = j

    for i in range(1, len_s1):
        for j in range(1, len_s2):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)

    return dp[-1][-1]


print(levenshtein_distance(sentence_1, sentence_2))

[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]
Cosine similarity (%) : 91.66666666666669
0.8477624970048978
5.0


In [4]:
# Load the model Turkish word2vec model
from gensim.models import Word2Vec
from tabulate import tabulate

model = Word2Vec.load("utils/word2vec/w2v_.model")
print(tabulate(model.wv.most_similar("mahkeme"), headers=["Kelime", "Benzerlik Skoru"]))
print("\nWord Vector:", model.wv.get_vector("umut"))

Kelime        Benzerlik Skoru
----------  -----------------
Mahkeme              0.860295
mahkemenin           0.813442
davanın              0.806494
tutuklama            0.799902
soruşturma           0.791518
temyiz               0.771838
mahkemede            0.771153
dava                 0.770335
yargılama            0.769724
savcılık             0.730116

Word Vector: [ 0.1495104  -1.4914255  -0.50925356 -0.9685314   2.1551907   0.10626572
  0.4027821   1.0281931   0.41044936 -1.1525857  -0.0205108   1.0924134
 -1.9218051   1.3797586  -0.63527036 -0.38006008 -0.6512365  -0.96633595
  1.1853794   0.7896848  -0.03258616  0.8834496  -1.6903982   0.9449919
  0.6057014   0.59224516 -1.0036951   2.0536163  -2.1637177  -0.65654767
  1.0522053   0.11371119  1.1112392  -0.43076926  0.13155091 -1.1467836
 -0.8198967   1.1959015  -0.5887494  -1.0079744  -0.25314665  0.5018188
 -0.76072204 -0.30214065 -0.13227591  0.6748753   0.7053673   1.5428567
 -0.08245109  0.76109725 -0.6433578  -1.2249595

In [6]:
from sentence_similarity_comperators import SentenceComparator_Word2Vec,\
                                            SentenceComparator_Ollama,\
                                            SentenceComparator_semantic,\
                                            SentenceComparator_bert_cosine,\
                                            SentenceComparator_SBERT,\
                                            SentenceComparator_NLI,\
                                            SentenceComparator_sentiment_analysis

  from tqdm.autonotebook import tqdm, trange





In [10]:
import pandas as pd
import os

def create_log(log_name, additional_info=""):
    log_name = "log/" + log_name
    with open(log_name, 'w') as f:
        f.write(f"Log file created. ({log_name})\nAdditional Info: {additional_info}\n")

def append_to_log(log_name, message):
    log_name = "log/" + log_name
    with open(log_name, 'a') as f:
        f.write("\n" + message)

def create_excel_file(file_name, sheet_name, data):    
    file_name = "log/" + file_name
    # if exist, remove the file
    try:
        os.remove(file_name)
    except OSError:
        pass
    
    df = pd.DataFrame(data)
    df.to_excel(file_name, sheet_name=sheet_name, index=False)

def append_to_excel(file_name, sheet_name, data):
    file_name = "log/" + file_name
    # Data is one row
    excel_df = pd.read_excel(file_name, sheet_name=sheet_name)
    excel_df = pd.concat([excel_df, pd.DataFrame([data])], ignore_index=True)
    excel_df.to_excel(file_name, sheet_name=sheet_name, index=False)
    
def excel_to_df(file_name, sheet_name):
    file_name = "log/" + file_name
    excel_df = pd.read_excel(file_name, sheet_name=sheet_name)
    return excel_df

        

In [9]:
import time

def test_model(model, model_name):
    """
    This function tests the given model with the given name.

    Args:
        model(SentenceComparator): A SentenceComparator object.
        model_name(str): A string.
    """

    # Start the timer
    start = time.time()
    computation_count = 0

    # Define the sentences to compare
    test_sentences = [
        "C'nin dikkat ve özen yükümlülüğüne aykırı davranmış olması nedeniyle kusurlu olduğu değerlendirilebilir.",
        "C'nin dikkat ve özen yükümlülüğüne aykırı davranmış olması nedeniyle kusurlu olduğu değerlendrilemez.",
        "C kişisi marketten alışveriş yapmıştır ve kasada ödeme yapmadan çıkmıştır.",
        "C kişisi kasada ödeme yapmadan marketten çıkmıştır.",
        "C kasaya ödeme yapması gerekirken yapmamıştır.",
        "C markete girdi ve sonra ödeme yapmadan çıktı.",
        "Şahıs aldığı ürünleri parasını ödemeden çıkmıştır.",
        "C kişisi ödeme yapmayı unutarak marketten çıkmıştır.",
        "C kişisi kesin unutkan birisidir ve ödeme yapmayı unutmuştur.",
        "C kişisi hırsızdır ve hırsızlık suçu işlediği için bu durudman şüphe bile edilemez.",
        "C'nin dikkat ve özen yükümlülüğüne aykırı davranmış olması nedeniyle kusurlu olduğu değerlendrilemez.",
        "C kişisi ödeme yapmadı sonra da marketten çıkarken ödemeyi unuttu.",
        "C kişisi kötü bir insan.",
        "Ben C kişisinin kötü birisi olduğunu biliyorum.",
        "C kişisi iyi bir insan değil.",
        "Kötü bir insan olan C kişisi, ödeme yapmayı unuttuğunu iddia etmektedir.",
        "Kusurlu olan C kişisi, ödeme yapmayı unuttuğunu iddia etmektedir.",
        "C kişisi marketten çıkarken ödeme yapmayı unutmuştur.",
        "C kişisi marketten satın aldığı ürünleri kasada ödeme yapmadan çıkarmıştır.",
        "C'nin kasada ödeme yapmadığı, güvenlik kameralarıyla doğrulanmıştır.",
        "C kişisinin kasada ödeme yapmadan çıkması bilinçli bir eylem olarak değerlendirilebilir.",
        "C kasada ödeme yapmadığı için sorumlu tutulmalıdır.",
        "C kişisinin ödeme yapmadığına dair hiçbir kanıt bulunmamaktadır.",
        "Market çalışanları, C'nin ödeme yapmadığını fark etmiştir.",
        "C kişisi ödeme yapmayı unuttuğunu savunmaktadır.",
        "C'nin kasada ödeme yapmadığı iddiası asılsızdır.",
        "C'nin kasada ödeme yapmaması kasıtlı bir davranış olarak değerlendirilemez.",
        "C, dikkat eksikliği nedeniyle ödeme yapmayı unutmuş olabilir.",
        "C kişisi ödeme yapmadan çıkmayı bir hata olarak tanımlamıştır.",
        "C'nin kasada ödeme yapmadığı, güvenlik kayıtlarıyla teyit edilmiştir.",
        "C'nin kasadan ödeme yapmadan ayrılması bilinçli bir davranış olarak nitelendirilebilir.",
        "C, kasada ödeme yapmadığı için sorumluluk almalıdır.",
        "C'nin ödeme yapmadığına dair herhangi bir kanıt yoktur.",
        "Market çalışanları, C’nin kasada ödeme yapmadığını fark etti.",
        "C kişisi, ödeme yapmayı unuttuğunu iddia ediyor.",
        "C'nin kasada ödeme yapmadığı iddiası gerçeği yansıtmamaktadır.",
        "C'nin ödeme yapmaması kasıtlı olarak değerlendirilemez.",
        "C'nin dikkatsizliği yüzünden ödemeyi unutmuş olabileceği düşünülüyor.",
        "C kişisi, ödeme yapmadan ayrılmayı bir hata olarak kabul etmiştir."
    ]


    # Create log and excel files
    create_log(f"{model_name}_log.txt", "Score is calculated in the range of 0-1. Higher score indicates higher similarity.")  
    create_log("model_exec_times.txt", f"This file contains the execution times of the models. Number of test_sentences: {len(test_sentences)}")

    if model_name == "nli_model":
        create_excel_file(f"{model_name}_log.xlsx", "Results", {"Sentence1": [], "Sentence2": [], "label": [], "score": []})
    elif model_name == "sentiment_analysis":
        create_excel_file(f"{model_name}_log.xlsx", "Results", {"Sentence1": [], "Sentence2": [], "sentiment_1": [], "sentiment_2": []})
    else:
        create_excel_file(f"{model_name}_log.xlsx", "Results", {"Sentence1": [], "Sentence2": [], "Result": []})

    # Compare the sentences
    compared_sentence_pairs = []
    for index, sentence in enumerate(test_sentences):
        for index2, sentence2 in enumerate(test_sentences):
            if index != index2 and ( ( sentence, sentence2 ) not in compared_sentence_pairs\
                               and   ( sentence2, sentence ) not in compared_sentence_pairs):

                # Calculate the similarity is a essential function of SentenceComparator classes
                result = model.calculate_similarity(sentence, sentence2)
                
                append_to_log(f"{model_name}_log.txt", f"\nSentence1: {sentence}\nSentence2: {sentence2}\nResult: {result}")
                if model_name == "nli_model":
                    append_to_excel(f"{model_name}_log.xlsx", "Results", {"Sentence1": sentence, "Sentence2": sentence2, "label": result["label"], "score": result["score"]})
                elif model_name == "sentiment_analysis":
                    append_to_excel(f"{model_name}_log.xlsx", "Results", {"Sentence1": sentence, "Sentence2": sentence2, "sentiment_1": result[0], "sentiment_2": result[1]})
                else:
                    append_to_excel(f"{model_name}_log.xlsx", "Results", {"Sentence1": sentence, "Sentence2": sentence2, "Result": result})
                
                computation_count += 1
                compared_sentence_pairs.append((sentence, sentence2))
        #break # Delete this line for nested for :)

    end = time.time()
    append_to_log("model_exec_times.txt", f"{model_name} Avg comparison time: {(end - start) / computation_count} seconds, Total time: {end - start} seconds")
    append_to_log(f"{model_name}_log.txt", f"\nTotal time: {end - start} seconds")
    print(f"Total time: {end - start} seconds") 


In [59]:
#ignore warnings
import warnings
warnings.filterwarnings("ignore")

#nli_model = SentenceComparator_NLI("microsoft/deberta-large-mnli")
#test_model(nli_model, "nli_model")
#
#semantic_similarity = SentenceComparator_semantic("paraphrase-MiniLM-L6-v2")
#test_model( semantic_similarity, "semantic_similarity")
#
#bert_cosine_similarity = SentenceComparator_bert_cosine("bert-base-multilingual-cased")
#test_model(bert_cosine_similarity, "bert_cosine_similarity")
#
#sbert_similarity = SentenceComparator_SBERT("paraphrase-multilingual-mpnet-base-v2")
#test_model(sbert_similarity, "sbert_similarity")
#
#word2vec_sim = SentenceComparator_Word2Vec("utils/word2vec/w2v_.model")
#test_model(word2vec_sim, "word2vec_sim")
#
#sentiment_analysis = SentenceComparator_sentiment_analysis()
#test_model(sentiment_analysis, "sentiment_analysis")
#
#sys_prompt= "Sen bir text-miner algoritmasın.\
#                Cümleleri sadece anlamsal olarak değerlendir.\
#                İstenen dönüş: değerlendirme:<benzer anlam->1, farklı anlam->0>.\
#                Bu formate göre bir dönüş sağla ve sadece anlama odaklan."
#
#ollama_model_llama3 = SentenceComparator_Ollama(
#    llama_version="llama3.1",
#    modelfile_system= sys_prompt,
#    temperature=0.4
#)
#test_model(ollama_model_llama3, "ollama_model_llama3.1")


In [95]:
########################################

# For bert_cos_sim
bert_cos_sim = excel_to_df("bert_cosine_similarity_log.xlsx", "Results")
bert_cos_sim["Result"] = bert_cos_sim["Result"].str.replace(r'[\[\]]','',regex=True).astype(float)

########################################

# For sbert_similarity
sbert_cos_df = excel_to_df("sbert_similarity_log.xlsx", "Results")
sbert_cos_df["Result"] = sbert_cos_df["Result"].str.replace(r'[\[\]()tensor]','',regex=True).astype(float)

########################################

# For nli_model_log.xlsx
nli_df = excel_to_df("nli_model_log.xlsx", "Results")

# Dummy encoding
dummy = pd.get_dummies(nli_df["label"])
nli_df.drop("label", axis=1, inplace=True)
nli_df = pd.concat([nli_df, dummy], axis=1)

nli_df.rename(columns={"score":"Result"}, inplace=True)

########################################

# For semantic_similarity_log.xlsx
semantic_df = excel_to_df("semantic_similarity_log.xlsx", "Results")
semantic_df["Result"] = semantic_df["Result"].str.replace(r'[\[\]()tensor]','',regex=True).astype(float)

########################################

# For word2vec_sim_log.xlsx
word2vec_df = excel_to_df("word2vec_sim_log.xlsx", "Results")

########################################

# Sentiment Analysis
sentiment_df = excel_to_df("sentiment_analysis_log.xlsx", "Results")

"""
There are two options for processing sentiment analysis results.

1. We can calculate sentiment score as a difference between sentiment_1 and sentiment_2. 
If they are equal, the score will be 1. Otherwise, the score will be 0. With this approach, we have a binary classification problem.

2. We can use sentiment_1 and sentiment_2 as two separate features. With this approach, 
we have a multi-class classification problem. The first sentence have two labels, and the second sentence have two labels.
Threfore, we have 4 labels in total.


# Dummy encoding Option 2
dummy_1 = pd.get_dummies(sentiment_df["sentiment_1"])
dummy_2 = pd.get_dummies(sentiment_df["sentiment_2"])

rename_all_columns = lambda df, suffix: df.rename(columns={col: col + suffix for col in df.columns})

dummy_1 = rename_all_columns(dummy_1, "_setnence1")
dummy_2 = rename_all_columns(dummy_2, "_setnence2")

sentiment_df.drop(["sentiment_1", "sentiment_2"], axis=1, inplace=True)
sentiment_df = pd.concat([sentiment_df, dummy_1, dummy_2], axis=1)
"""

# Dummy encoding Option 1
sentiment_df["Result"] = (sentiment_df["sentiment_1"] == sentiment_df["sentiment_2"]) * 1
sentiment_df.drop(["sentiment_1", "sentiment_2"], axis=1, inplace=True)

########################################

# For ollama_model_llama3.1_log.xlsx
ollama_df = excel_to_df("ollama_model_llama3.1_log.xlsx", "Results")
ollama_df["Result"] = ollama_df["Result"].str.replace(r'[\[\]()tensorDdeğerlendirme:Cüaıbzkfakı .23456789]','',regex=True).astype(int)
ollama_df

# Concatenate all the results
#all_results = pd.concat([bert_cos_sim["Sentence1"],bert_cos_sim["Sentence2"],bert_cos_sim["Result"], sbert_cos_df["Result"], nli_df["Result"], semantic_df["Result"], word2vec_df["Result"], sentiment_df["Result"], ollama_df["Result"]],
#                        axis=1, 
#                        keys=["Sentence1", "Sentence2", "bert_cos_sim", "sbert_cos_sim", "nli_model", "semantic_similarity", "word2vec_sim", "sentiment_analysis", "ollama_model_llama3.1"])


def concat_columns_except_sentences(df_list, df_list_names):
    initial_df = df_list[0].iloc[:,:2]
    for index, df in enumerate(df_list):
        df_except_sentences = df[ df.columns.difference(["Sentence1", "Sentence2"]) ] * 1
        df_except_sentences.columns = [f"{df_list_names[index]}_{col}" for col in df_except_sentences.columns]
        initial_df = pd.concat([initial_df, df_except_sentences], axis=1)
    return initial_df

all_results = concat_columns_except_sentences(
    [bert_cos_sim, sbert_cos_df, nli_df, semantic_df, word2vec_df, sentiment_df, ollama_df], 
    ["bert_cos_sim", "sbert_cos_sim", "nli_model", "semantic_similarity", "word2vec_sim", "sentiment_analysis", "ollama_model_llama3.1"])
             
all_results

Unnamed: 0,Sentence1,Sentence2,bert_cos_sim_Result,sbert_cos_sim_Result,nli_model_CONTRADICTION,nli_model_ENTAILMENT,nli_model_NEUTRAL,nli_model_Result,semantic_similarity_Result,word2vec_sim_Result,sentiment_analysis_Result,ollama_model_llama3.1_Result
0,C'nin dikkat ve özen yükümlülüğüne aykırı davr...,C'nin dikkat ve özen yükümlülüğüne aykırı davr...,0.964577,0.9383,0,1,0,0.769148,0.9611,0.946022,0,0
1,C'nin dikkat ve özen yükümlülüğüne aykırı davr...,C kişisi marketten alışveriş yapmıştır ve kasa...,0.636648,0.1996,1,0,0,0.589680,0.7214,0.398693,1,1
2,C'nin dikkat ve özen yükümlülüğüne aykırı davr...,C kişisi kasada ödeme yapmadan marketten çıkmı...,0.625543,0.1646,1,0,0,0.747958,0.7042,0.312992,1,1
3,C'nin dikkat ve özen yükümlülüğüne aykırı davr...,C kasaya ödeme yapması gerekirken yapmamıştır.,0.657169,0.4852,1,0,0,0.710494,0.6675,0.391442,0,1
4,C'nin dikkat ve özen yükümlülüğüne aykırı davr...,C markete girdi ve sonra ödeme yapmadan çıktı.,0.587133,0.2732,1,0,0,0.737624,0.6462,0.396546,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
699,C'nin kasada ödeme yapmadığı iddiası gerçeği y...,C'nin dikkatsizliği yüzünden ödemeyi unutmuş o...,0.697819,0.7362,1,0,0,0.844054,0.6441,0.515694,0,1
700,C'nin kasada ödeme yapmadığı iddiası gerçeği y...,"C kişisi, ödeme yapmadan ayrılmayı bir hata ol...",0.718892,0.6233,1,0,0,0.514596,0.6645,0.496692,0,1
701,C'nin ödeme yapmaması kasıtlı olarak değerlend...,C'nin dikkatsizliği yüzünden ödemeyi unutmuş o...,0.703241,0.6767,1,0,0,0.601376,0.7230,0.457836,1,0
702,C'nin ödeme yapmaması kasıtlı olarak değerlend...,"C kişisi, ödeme yapmadan ayrılmayı bir hata ol...",0.764747,0.6354,1,0,0,0.867559,0.6490,0.658151,1,0


In [228]:
# Save the results to an excel file
#all_results.to_excel("log/all_results.xlsx", index=False)

# Read the results from the excel file
#all_results = pd.read_excel("log/all_results.xlsx")

# Calculate the correlation matrix
all_results_parameters = all_results.drop(["Sentence1", "Sentence2"], axis=1)


In [229]:
all_results_parameters.describe()

Unnamed: 0,bert_cos_sim_Result,sbert_cos_sim_Result,nli_model_CONTRADICTION,nli_model_ENTAILMENT,nli_model_NEUTRAL,nli_model_Result,semantic_similarity_Result,word2vec_sim_Result,sentiment_analysis_Result,ollama_model_llama3.1_Result
count,704.0,704.0,704.0,704.0,704.0,704.0,704.0,704.0,704.0,704.0
mean,0.719333,0.55939,0.443182,0.099432,0.457386,0.636065,0.672279,0.520255,0.691761,0.301136
std,0.084696,0.179509,0.497114,0.299454,0.498535,0.151172,0.147471,0.131406,0.462094,0.459078
min,0.433617,0.0898,0.0,0.0,0.0,0.337423,0.167,0.244097,0.0,0.0
25%,0.667096,0.444375,0.0,0.0,0.0,0.509205,0.5957,0.428049,0.0,0.0
50%,0.726638,0.55655,0.0,0.0,0.0,0.633499,0.7041,0.497618,1.0,0.0
75%,0.773496,0.694625,1.0,0.0,1.0,0.758388,0.77425,0.599536,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,0.986145,1.0,1.0,1.0,1.0


In [230]:
positive_sample_indexes = all_results_parameters[all_results_parameters["ollama_model_llama3.1_Result"] == 0].index

random_n_index = np.random.choice(positive_sample_indexes, 400)

all_results_parameters.drop(random_n_index, inplace=True)

In [231]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# Normalize lib
from sklearn.preprocessing import StandardScaler
# Test train split
from sklearn.model_selection import train_test_split

# Define the features and target
y = all_results_parameters["ollama_model_llama3.1_Result"]
X = all_results_parameters.drop("ollama_model_llama3.1_Result", axis=1)

normalized_X = StandardScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(normalized_X, y, test_size=0.2, random_state=42)

In [232]:
# Calculate the accuracy
from sklearn.metrics import accuracy_score

# LOGISTIC REGRESSION
# Create the model
log_reg = LogisticRegression()

# Fit the model
log_reg.fit(X_train, y_train)

# Predict the target
y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("Logistic Reg, Accuracy: ", accuracy)

Logistic Reg, Accuracy:  0.6588235294117647


In [233]:
X

Unnamed: 0,bert_cos_sim_Result,sbert_cos_sim_Result,nli_model_CONTRADICTION,nli_model_ENTAILMENT,nli_model_NEUTRAL,nli_model_Result,semantic_similarity_Result,word2vec_sim_Result,sentiment_analysis_Result
0,0.964577,0.9383,0,1,0,0.769148,0.9611,0.946022,0
1,0.636648,0.1996,1,0,0,0.589680,0.7214,0.398693,1
2,0.625543,0.1646,1,0,0,0.747958,0.7042,0.312992,1
3,0.657169,0.4852,1,0,0,0.710494,0.6675,0.391442,0
4,0.587133,0.2732,1,0,0,0.737624,0.6462,0.396546,1
...,...,...,...,...,...,...,...,...,...
698,0.687288,0.8754,0,1,0,0.416837,0.8487,0.561863,0
699,0.697819,0.7362,1,0,0,0.844054,0.6441,0.515694,0
700,0.718892,0.6233,1,0,0,0.514596,0.6645,0.496692,0
702,0.764747,0.6354,1,0,0,0.867559,0.6490,0.658151,1


In [234]:
log_reg.coef_[0]

array([ 0.70672523,  0.33109902,  0.0062056 ,  0.12326919, -0.08404978,
        0.24898955,  0.199046  , -0.59667092,  0.07934688])

In [235]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

def print_conf_matrix(y_test, y_pred):
    conf_matrix = confusion_matrix(y_test, y_pred)

    precision = conf_matrix[1][1] / (conf_matrix[1][1] + conf_matrix[0][1])
    recall = conf_matrix[1][1] / (conf_matrix[1][1] + conf_matrix[1][0])

    print("Precision: ", precision)
    print("Recall: ", recall)

    print("True Positive: ", conf_matrix[1][1])
    print("True Negative: ", conf_matrix[0][0])
    print("False Positive: ", conf_matrix[0][1])
    print("False Negative: ", conf_matrix[1][0])
    
print_conf_matrix(y_test, y_pred)

Precision:  0.7083333333333334
Recall:  0.6938775510204082
True Positive:  34
True Negative:  22
False Positive:  14
False Negative:  15


In [219]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score

# Assuming X_train, y_train, X_test, y_test are defined
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

#model.fit(X_train, y_train, epochs=100, batch_size=5)


In [237]:
def keras_model_tuner(hp):
    hidden_layer_num = hp.Int('hidden_layer_num', min_value=0, max_value=3, step=1)
    layer_unit = hp.Int('layer_unit', min_value=32, max_value=128, step=32)

    model = models.Sequential()
    model.add(layers.Dense(layer_unit, activation='relu', input_shape=(X_train.shape[1],)))

    for i in range(hidden_layer_num):
        model.add(layers.Dense(layer_unit, activation='relu'))

    model.add(layers.Dense(1, activation='sigmoid'))    

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

from kerastuner import RandomSearch

tuner = RandomSearch(
    keras_model_tuner,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='models',
    project_name='ollama_model_50pos_50neg_normalized'
)


tuner.search(X_train, y_train, epochs=30, batch_size = 4, validation_data=(X_test, y_test))

Trial 9 Complete [00h 00m 27s]
val_accuracy: 0.6745098233222961

Best val_accuracy So Far: 0.6901960968971252
Total elapsed time: 00h 04m 12s

Search: Running Trial #10

Value             |Best Value So Far |Hyperparameter
1                 |3                 |hidden_layer_num
64                |128               |layer_unit

Epoch 1/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5340 - loss: 0.6849 - val_accuracy: 0.5765 - val_loss: 0.6694
Epoch 2/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6227 - loss: 0.6173 - val_accuracy: 0.6000 - val_loss: 0.6606
Epoch 3/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6658 - loss: 0.5802 - val_accuracy: 0.6353 - val_loss: 0.6398
Epoch 4/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6441 - loss: 0.5973 - val_accuracy: 0.6706 - val_loss: 0.6499
Epoch 5/30
[1m84/84[0m [32m━━

In [196]:
from kerastuner.tuners import RandomSearch

tuner = RandomSearch(
    keras_model_tuner,
    objective='val_accuracy',
    max_trials=3,
    executions_per_trial=5,
    directory='log',
    project_name='ollama_model8'
)

tuner.reload()
model = tuner.get_best_models(num_models=1)[0]

y_pred = model.predict(X_test)
y_pred = [1 if x > 0.5 else 0 for x in y_pred]

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: ", accuracy)

print_conf_matrix(y_test, y_pred)

Reloading Tuner from log\ollama_model8\tuner0.json
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Accuracy:  0.6046511627906976
Precision:  0.625
Recall:  0.6521739130434783
True Positive:  30
True Negative:  22
False Positive:  18
False Negative:  16


In [197]:
def add_output_columns(df, y_test, y_pred):
    """
    This function adds the Confidence, Real Value, and Prediction columns to the dataframe.

    Args:
        df: A pandas dataframe.

    Returns:
        df: A pandas dataframe.
    """
    if type(df) != pd.DataFrame:
        df = pd.DataFrame(df) 
    df["Confidence"] = 1.0
    df["Real Value"] = 5
    df["Prediction"] = 5
    df["Accuracy"] = 5
    #print(df.head())
    df.reset_index(drop=True, inplace=True)
    
    for index, val in enumerate(y_pred):
        
        real =  y_test.iloc[index]
        pred = 0 if val < 0.5 else 1
        confidence = (val-0.5)*2 if pred == 1 else (0.5-val)*2
        
        df["Confidence"][index] = confidence
        df["Real Value"][index] = real
        df["Prediction"][index] = pred
        df["Accuracy"][index] = 1 if real == pred else 0

    return df

def calculate_accuracy(x,y,predictor):
    """
    This function calculates the accuracy of the predictor.

    Args:
        x: A pandas dataframe.
        y: A pandas dataframe.
        predictor: A predictor model.

    Returns:
        accuracy: A float.
    """
    y_pred = predictor.predict(x)
    y_pred = [1 if val > 0.5 else 0 for val in y_pred]
    accuracy = accuracy_score(y, y_pred)
    return accuracy

In [198]:
# Test
print("Test Accuracy: ", calculate_accuracy(X_test, y_test, model))
# Train
print("Train Accuracy: ", calculate_accuracy(X_train, y_train, model))
# Entire
print("Entire Accuracy: ", calculate_accuracy(normalized_X, y, model)) 

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Test Accuracy:  0.6046511627906976
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 755us/step
Train Accuracy:  0.6501457725947521
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 847us/step
Entire Accuracy:  0.6410256410256411


In [200]:
#disable warnings
import warnings
warnings.filterwarnings("ignore")

# TEST DATASET

high_accuracy_limit = 0.8

keras_predict_df_test = pd.DataFrame(model.predict(X_test), columns=["Prediction"])

result_df_test = add_output_columns(X_test, y_test, keras_predict_df_test["Prediction"])

faults_test = result_df_test[result_df_test["Accuracy"] == 0]

high_acc = result_df_test[result_df_test["Confidence"] > high_accuracy_limit]["Accuracy"].value_counts()

high_acc = high_acc = [high_acc[1], 0] if len(high_acc) == 1 else high_acc

print("Error rate: %", 100* high_acc[0] / (high_acc[0] + high_acc[1]), "      Number of high confidence predictions: ", high_acc[0] + high_acc[1])

print("Number of faults: ", faults_test.shape[0], "   Faults from high confidence predictions: ", faults_test[ faults_test["Confidence"] > high_accuracy_limit ].shape[0])
faults_test

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Error rate: % 100.0       Number of high confidence predictions:  2
Number of faults:  34    Faults from high confidence predictions:  0


Unnamed: 0,0,1,2,3,4,5,6,7,8,Confidence,Real Value,Prediction,Accuracy
0,-0.618687,0.002055,1.105802,-0.346561,-0.891616,-1.051163,-1.413313,-1.222307,0.684966,0.178935,1,0,0
2,0.981597,0.941479,-0.904321,-0.346561,1.121559,1.527866,0.423428,0.551337,0.684966,0.325356,0,1,0
10,-0.127971,0.892964,-0.904321,2.885496,-0.891616,-0.765766,1.021323,-0.584252,0.684966,0.377188,0,1,0
13,0.335093,0.62503,1.105802,-0.346561,-0.891616,-0.708254,1.279648,1.337684,-1.459927,0.179028,0,1,0
14,0.47323,-0.651793,1.105802,-0.346561,-0.891616,-0.719385,0.213713,-0.133534,-1.459927,0.069303,0,1,0
15,-1.259652,-0.856328,-0.904321,-0.346561,1.121559,-1.03431,-0.895276,-0.834946,0.684966,0.456316,1,0,0
16,0.29368,0.824051,1.105802,-0.346561,-0.891616,-0.255231,0.345653,0.551726,0.684966,0.202754,0,1,0
17,-0.205655,0.829564,-0.904321,2.885496,-0.891616,-0.029443,1.155347,0.056034,0.684966,0.426191,0,1,0
18,-0.885727,0.085853,1.105802,-0.346561,-0.891616,-1.424523,0.011637,-0.350788,-1.459927,0.121288,1,0,0
22,-0.728865,-0.610445,-0.904321,-0.346561,1.121559,0.754404,-0.103637,0.534341,0.684966,0.361639,1,0,0


In [202]:
faults_test.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Confidence,Real Value,Prediction,Accuracy
count,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0
mean,-0.111762,0.059796,0.278104,-0.061379,-0.240295,-0.039639,-0.013587,0.03368,0.180285,0.259462,0.470588,0.529412,0.0
std,0.997712,1.037934,1.004165,0.930516,0.955972,0.981881,0.967461,0.998958,0.923508,0.173302,0.50664,0.50664,0.0
min,-1.797458,-2.671776,-0.904321,-0.346561,-0.891616,-1.474644,-3.050755,-1.394792,-1.459927,0.007974,0.0,0.0,0.0
25%,-0.872411,-0.600797,-0.904321,-0.346561,-0.891616,-0.797092,-0.743719,-0.765254,0.684966,0.128287,0.0,0.0,0.0
50%,-0.16983,0.254277,1.105802,-0.346561,-0.891616,-0.169266,0.182811,-0.174218,0.684966,0.198855,0.0,1.0,0.0
75%,0.581877,0.792765,1.105802,-0.346561,1.121559,0.746169,0.532799,0.551629,0.684966,0.39541,1.0,1.0,0.0
max,2.31062,2.147735,1.105802,2.885496,1.121559,2.147658,1.316452,3.055103,0.684966,0.768824,1.0,1.0,0.0


In [203]:
result_df_test.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Confidence,Real Value,Prediction,Accuracy
count,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0
mean,0.025358,0.128861,0.007246,0.179588,-0.119119,-0.05263,0.138005,0.053645,0.161213,0.234797,0.534884,0.55814,0.604651
std,1.047066,1.021581,1.006573,1.200192,0.984731,1.089204,0.923242,1.086535,0.926858,0.199757,0.501707,0.499521,0.491793
min,-1.797458,-2.671776,-0.904321,-0.346561,-0.891616,-1.825107,-3.050755,-1.755123,-1.459927,0.00768,0.0,0.0,0.0
25%,-0.72375,-0.607964,-0.904321,-0.346561,-0.891616,-1.00583,-0.260402,-0.732861,0.684966,0.071697,0.0,0.0,0.0
50%,0.001916,0.136022,-0.904321,-0.346561,-0.891616,-0.080594,0.257809,-0.229966,0.684966,0.187721,1.0,1.0,1.0
75%,0.625252,0.828186,1.105802,-0.346561,1.121559,0.868875,0.752757,0.64175,0.684966,0.378065,1.0,1.0,1.0
max,3.270909,2.328012,1.105802,2.885496,1.121559,2.21758,2.190727,3.547197,0.684966,0.864057,1.0,1.0,1.0


In [204]:
print_conf_matrix(result_df_test["Real Value"], result_df_test["Prediction"])

Precision:  0.625
Recall:  0.6521739130434783
True Positive:  30
True Negative:  22
False Positive:  18
False Negative:  16


In [206]:
# ENTIRE DATASET
# Predicts
keras_predict_df = pd.DataFrame(model.predict(normalized_X), columns=["Prediction"])

#sort the dataframe by ID
result_df = add_output_columns( 
    df = normalized_X,
    y_test = y,
    y_pred = keras_predict_df["Prediction"]
)
faults = result_df[ result_df["Accuracy"] == 0 ]

# Accuracy counts of the model where the confidence is greater than high_accuracy_limit
high_acc = result_df[result_df["Confidence"] > high_accuracy_limit]["Accuracy"].value_counts()

if len(high_acc) == 1:
    high_acc = [high_acc[1], 0]

print("Error rate: %", 100* high_acc[0] / (high_acc[0] + high_acc[1]), "      Number of high confidence predictions: ", high_acc[0] + high_acc[1])

print("Number of faults: ", faults.shape[0], "   Faults from high confidence predictions: ", faults[ faults["Confidence"] > high_accuracy_limit ].shape[0])
faults

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 923us/step
Error rate: % 100.0       Number of high confidence predictions:  14
Number of faults:  154    Faults from high confidence predictions:  0


Unnamed: 0,0,1,2,3,4,5,6,7,8,Confidence,Real Value,Prediction,Accuracy
0,-1.065948,-2.084636,1.105802,-0.346561,-0.891616,-0.325306,0.256073,-0.913435,0.684966,0.104033,1,0,0
1,-1.198494,-2.277593,1.105802,-0.346561,-0.891616,0.689877,0.136632,-1.549180,0.684966,0.055956,1,0,0
2,-0.821015,-0.510108,1.105802,-0.346561,-0.891616,0.449582,-0.118220,-0.967224,-1.459927,0.121077,1,0,0
3,-1.448623,-1.952322,1.105802,-0.346561,-0.891616,-0.878272,0.349125,-1.628139,0.684966,0.049463,1,0,0
7,-0.789533,-1.281384,1.105802,-0.346561,-0.891616,-0.111795,0.315793,-1.308072,-1.459927,0.052908,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,0.813810,0.456882,1.105802,-0.346561,-0.891616,-0.753365,0.750500,-0.347018,0.684966,0.201854,0,1,0
416,0.300665,0.917222,-0.904321,-0.346561,1.121559,-1.011831,0.441483,0.783276,0.684966,0.096689,1,0,0
420,0.148116,0.371429,1.105802,-0.346561,-0.891616,0.721466,-0.699449,1.052366,0.684966,0.041267,0,1,0
421,-0.618687,0.002055,1.105802,-0.346561,-0.891616,-1.051163,-1.413313,-1.222307,0.684966,0.178935,1,0,0


In [158]:
result_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Confidence,Real Value,Prediction,Accuracy
count,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0,504.0
mean,-5.639228e-17,3.383537e-16,3.5245180000000005e-17,-2.1147110000000002e-17,-3.5245180000000005e-17,-2.1147110000000002e-17,8.458842000000001e-17,1.127846e-16,1.409807e-16,0.572476,0.420635,0.430556,0.823413
std,1.000994,1.000994,1.000994,1.000994,1.000994,1.000994,1.000994,1.000994,1.000994,0.31556,0.494151,0.495646,0.381698
min,-3.473657,-2.660679,-0.8520724,-0.3355336,-0.9572616,-1.97162,-3.035407,-2.139442,-1.581139,0.004024,0.0,0.0,0.0
25%,-0.5744611,-0.6732226,-0.8520724,-0.3355336,-0.9572616,-0.847364,-0.5471043,-0.6426903,-1.581139,0.29857,0.0,0.0,1.0
50%,0.1195077,0.01184927,-0.8520724,-0.3355336,-0.9572616,-0.01163927,0.179427,-0.1811255,0.6324555,0.605217,0.0,0.0,1.0
75%,0.6205919,0.7465837,1.173609,-0.3355336,1.044646,0.7909264,0.6812458,0.5593387,0.6324555,0.863125,1.0,1.0,1.0
max,3.224327,2.293064,1.173609,2.980328,1.044646,2.249869,2.182045,3.643392,0.6324555,0.999964,1.0,1.0,1.0


In [207]:
high_confidence = result_df[ result_df["Confidence"] > high_accuracy_limit ]

high_confidence["Accuracy"].value_counts()

Accuracy
1    14
Name: count, dtype: int64

In [209]:
high_confidence.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,Confidence,Real Value,Prediction,Accuracy
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0
mean,-0.909004,-0.56701,-0.617161,0.576884,0.25877,0.6193,-1.437072,-0.319951,-0.693894,0.833915,0.285714,0.285714,1.0
std,2.381493,1.772461,0.729949,1.515212,1.033871,1.135221,2.177646,1.994235,1.066538,0.019445,0.468807,0.468807,0.0
min,-3.234867,-2.338787,-0.904321,-0.346561,-0.891616,-0.980405,-3.254915,-2.060255,-1.459927,0.800247,0.0,0.0,1.0
25%,-2.498286,-1.740208,-0.904321,-0.346561,-0.891616,-0.209275,-2.875935,-1.939765,-1.459927,0.82533,0.0,0.0,1.0
50%,-2.061127,-1.376208,-0.904321,-0.346561,1.121559,0.310105,-2.65945,-1.303686,-1.459927,0.838223,0.0,0.0,1.0
75%,1.275645,1.161863,-0.904321,2.077482,1.121559,1.769475,0.498425,1.509869,0.684966,0.847431,0.75,0.75,1.0
max,3.270909,2.328012,1.105802,2.885496,1.121559,2.21758,2.190727,3.547197,0.684966,0.864057,1.0,1.0,1.0


In [210]:
high_confidence

Unnamed: 0,0,1,2,3,4,5,6,7,8,Confidence,Real Value,Prediction,Accuracy
33,3.270909,2.328012,-0.904321,2.885496,-0.891616,2.21758,2.190727,3.547197,0.684966,0.864057,1,1,1
220,-1.509495,-2.338787,-0.904321,-0.346561,1.121559,0.842419,-2.795209,-1.755123,-1.459927,0.849083,0,0,1
221,-2.387759,-1.784174,1.105802,-0.346561,-0.891616,-0.243738,-3.254915,-2.060255,-1.459927,0.84274,0,0,1
223,-2.811472,-1.056451,-0.904321,-0.346561,1.121559,-0.240831,-2.828541,-1.897164,-1.459927,0.856455,0,0,1
224,-1.371192,-1.608308,-0.904321,-0.346561,1.121559,-0.014721,-2.687574,-0.412978,-1.459927,0.807007,0,0,1
227,-2.524212,-2.000837,1.105802,-0.346561,-0.891616,0.352234,-3.063255,-2.01068,-1.459927,0.848994,0,0,1
229,-3.234867,-1.378965,-0.904321,-0.346561,1.121559,0.267976,-2.891733,-1.203644,0.684966,0.832037,0,0,1
230,-1.869958,-1.373452,-0.904321,-0.346561,1.121559,-0.114605,-2.8959,-1.403728,-1.459927,0.835012,0,0,1
249,-2.252297,-1.917039,-0.904321,-0.346561,1.121559,1.340385,-1.828576,0.16295,-1.459927,0.800247,0,0,1
251,-2.420506,-1.292961,-0.904321,-0.346561,1.121559,-0.980405,-2.631326,-1.995278,-1.459927,0.825317,0,0,1
