In [1]:
!pip install datasets
from datasets import load_dataset

dataset = load_dataset("PiC/phrase_similarity")





Downloading data:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/202k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/403k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7004 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Features that could be important:

- Greedy word-wise similarity of phrase
- Lesk overlap with content words in context sentence


In [2]:
import pandas as pd


train = pd.DataFrame(dataset['train'])
test = pd.DataFrame(dataset['test'])
val = pd.DataFrame(dataset['validation'])

In [3]:
train

Unnamed: 0,phrase1,phrase2,sentence1,sentence2,label,idx
0,newly formed camp,recently made encampment,newly formed camp is released from the membran...,recently made encampment is released from the ...,0,0
1,one data,a particular statistic,"According to one data, in 1910, on others – in...","According to a particular statistic, in 1910, ...",1,1
2,particular structure,specific edifice,Note that Fact 1 does not assume any particula...,Note that Fact 1 does not assume any specific ...,0,2
3,involved people,participating individuals,Assessment-Center are usually group-processes ...,Assessment-Center are usually group-processes ...,1,3
4,different cross,opposing inquiries,"At the end of the 1980s, a different cross had...","At the end of the 1980s, a opposing inquiries ...",0,4
...,...,...,...,...,...,...
6999,similar notice,comparable notification,Nereus and Achilleus a similar notice was admi...,Nereus and Achilleus a comparable notification...,1,6999
7000,color map,painted chart,Color quantization is the process of creating ...,Color quantization is the process of creating ...,0,7000
7001,dutch tool,device from the people of the Netherlands,petra is a dutch tool with which patients and ...,petra is a device from the people of the Nethe...,1,7001
7002,secondary concern,less serious issue,"However, granting this access, especially to a...","However, granting this access, especially to a...",1,7002


In [4]:
import spacy
!python -m spacy download en_core_web_lg

nlp = spacy.load("en_core_web_lg")
doc = nlp("Assessment-Center are usually group-processes with high validity and acceptance of the involved people.")
for token in doc:
    print(token.text, token.pos_, token.dep_)

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:02[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
Assessment NOUN compound
- PUNCT punct
Center PROPN nsubj
are AUX ROOT
usually ADV advmod
group NOUN compound
- PUNCT punct
processes NOUN attr
with ADP prep
high ADJ amod
validity NOUN pobj
and CCONJ cc
acceptance NOUN conj
of ADP prep
the DET det
involved ADJ amod
people NOUN pobj
. PUNCT punct


In [5]:
import nltk
nltk.download('brown')

from nltk.corpus import brown    # Consider changing corpus (more tokens, etc.)

[nltk_data] Downloading package brown to /home/hari/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [6]:
import gensim as gs
from gensim.models import FastText

md = FastText(brown.sents(), min_count=1, vector_size=100, window=5)

In [7]:
p1 = 'former charge'
p2 = 'previous responsibility'

def avg_sim(p1, p2):
  p1_nlp = nlp(p1)
  p2_nlp = nlp(p2)


  avg_similarity = 0.0
  for t1 in p1_nlp:
    max_similarity = 0.0
    if t1.tag_ in ['PUNCT']:
      continue
    for t2 in p2_nlp:
      if t2.tag_ in ['PUNCT']:
        continue
      sim = md.wv.similarity(t1.text, t2.text)
      if sim > max_similarity:
        max_similarity = sim
    avg_similarity += max_similarity
    # print(avg_similarity)

  avg_similarity /= len([token for token in p1_nlp if token.tag_ != 'PUNCT'])
  if avg_similarity > 1:
    return 1

  return avg_similarity

In [8]:
avg_sim(p1, p2)

0.8866112232208252

In [9]:
avg_sim('smooth one', 'polished item')

0.6802492141723633

In [10]:
avg_sim('blue bag', 'blue ranch flavored bag')

1.0

In [14]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /home/hari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/hari/nltk_data...


In [15]:
def content_words_similarity(phrase, sentence):
  phrase_nlp = nlp(phrase)
  sent_nlp = nlp(sentence)

  root_phrase = phrase.split()[-1]

  content_words = []
  for token in sent_nlp:
    if token.tag_ in ('NN', 'NNS', 'NNP', 'NNPS'):
      content_words.append(token.text)

  # print(content_words)
  ents = []
  for ent in sent_nlp.ents:
    ents.append(ent.root.text)

  content_words = [word for word in content_words if word not in ents and word != root_phrase]

  avg_similarity = None
  for synset1 in wn.synsets(root_phrase):
    avg_similarity = 0.0
    for word in content_words:
      word_similarity = 0.0
      for synset2 in wn.synsets(word):
        sim = synset1.wup_similarity(synset2)
        if sim > word_similarity:
          word_similarity = sim
      avg_similarity += word_similarity
      # print(avg_similarity)

  if avg_similarity is None:
    return None

  try:
    avg_similarity /= len(content_words)
  except ZeroDivisionError:
    return None
  # print(content_words)

  return avg_similarity

In [16]:
content_words_similarity('one ensemble', 'Since 2004, the music department has taken one ensemble on a short UK tour in the summer term.')

0.26904761904761904

In [17]:
content_words_similarity('a group', 'Since 2004, the music department has taken a group on a short UK tour in the summer term.')

0.18672438672438674

In [18]:
train.shape

(7004, 6)

In [19]:
train_data = []
for _, row in train.head(7000).iterrows():
  train_data.append([avg_sim(row.phrase1, row.phrase2),
                     content_words_similarity(row.phrase1, row.sentence1),
                     content_words_similarity(row.phrase2, row.sentence2),
                     row.label])

test_data = []
for _, row in test.head(1000).iterrows():
  test_data.append([avg_sim(row.phrase1, row.phrase2),
                     content_words_similarity(row.phrase1, row.sentence1),
                     content_words_similarity(row.phrase2, row.sentence2),
                    row.label])



In [20]:
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

train_data = train_data.dropna()
test_data = test_data.dropna()

X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values

X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1].values

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score

model = LogisticRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

confusion_matrix(y_pred, y_test)

array([[269, 266],
       [218, 222]])

In [24]:
y_test

array([0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,

In [25]:
y_pred

array([1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,

In [26]:
X_train.shape

(6845, 3)

In [27]:
X_test.shape

(975, 3)

In [28]:
precision_score(y_pred, y_test)

0.45491803278688525

In [29]:
recall_score(y_pred, y_test)

0.5045454545454545

In [30]:
from sklearn.metrics import accuracy_score

accuracy_score(y_pred, y_test)

0.5035897435897436

In [31]:
from sklearn.svm import SVC

model = SVC()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

confusion_matrix(y_pred, y_test)

array([[401, 366],
       [ 86, 122]])

In [32]:
from sklearn.metrics import accuracy_score

accuracy_score(y_pred, y_test)

0.5364102564102564

In [33]:
pd.DataFrame(zip(y_test, y_pred))

Unnamed: 0,0,1
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0
...,...,...
970,0,0
971,0,0
972,1,0
973,1,0


In [34]:
list(y_test).count(0)

487

In [35]:
val_data = []
for _, row in val.head(2000).iterrows():
  val_data.append([avg_sim(row.phrase1, row.phrase2),
                     content_words_similarity(row.phrase1, row.sentence1),
                     content_words_similarity(row.phrase2, row.sentence2),
                    row.label])

val_data = pd.DataFrame(val_data)

val_data = val_data.dropna()

X_val = val_data.iloc[:, :-1].values
y_val = val_data.iloc[:, -1].values

In [36]:
X_val

array([[0.81324235, 0.17249417, 0.32492997],
       [0.96105299, 0.29444444, 0.30707283],
       [0.88428345, 0.18492618, 0.51470588],
       ...,
       [0.94342157, 0.40357143, 0.61912519],
       [0.94793642, 0.16233766, 0.35      ],
       [0.74468341, 0.18636364, 0.17189523]])

In [37]:
val_data.describe()

Unnamed: 0,0,1,2,3
count,972.0,972.0,972.0,972.0
mean,0.857349,0.263864,0.285261,0.502058
std,0.078738,0.110261,0.115464,0.500253
min,0.366946,0.0,0.0,0.0
25%,0.816156,0.192216,0.200162,0.0
50%,0.870231,0.240823,0.267253,1.0
75%,0.917461,0.311957,0.346151,1.0
max,0.993685,1.0,0.933333,1.0


In [38]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

In [39]:
confusion_matrix(y_pred, y_test)

array([[223, 284],
       [264, 204]])

In [40]:
accuracy_score(y_pred, y_test)

0.4379487179487179

In [41]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(3,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer with sigmoid activation for binary classification
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val))

2024-02-15 07:44:31.531970: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-15 07:44:31.587903: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-15 07:44:31.865034: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-15 07:44:31.865175: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-15 07:44:31.919652: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x7f110024e6d0>

In [42]:
model.evaluate(X_test, y_test)



[0.6918063163757324, 0.5005128383636475]

In [43]:
y_pred = model.predict(X_test)

y_pred = [1 if i>0.5 else 0 for [i] in y_pred]



In [44]:
THRESHOLD = 0.5

y_pred = model.predict(X_test)
y_pred = [1 if i>THRESHOLD else 0 for [i] in y_pred]

confusion_matrix(y_pred, y_test)



array([[293, 293],
       [194, 195]])

In [45]:
accuracy_score(y_pred, y_test)

0.5005128205128205

# Unconstrained

Same features, except use similarity from `all-MiniLM-L6-v2` transformer instead of w2v word similarity, etc

In [54]:
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer

bert = SentenceTransformer('all-MiniLM-L6-v2')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [55]:
import numpy as np

def similarity(w1, w2):
  v1, v2 = tuple(bert.encode([w1, w2]))
  return np.dot(v1, v2) / (np.dot(v1, v1) * np.dot(v2, v2)) ** 0.5

def avg_sim(p1, p2):
  p1_nlp = nlp(p1)
  p2_nlp = nlp(p2)


  avg_similarity = 0.0
  for t1 in p1_nlp:
    max_similarity = 0.0
    if t1.tag_ in ['PUNCT']:
      continue
    for t2 in p2_nlp:
      if t2.tag_ in ['PUNCT']:
        continue
      sim = similarity(t1.text, t2.text)
      if sim > max_similarity:
        max_similarity = sim
    avg_similarity += max_similarity
    # print(avg_similarity)

  avg_similarity /= len([token for token in p1_nlp if token.tag_ != 'PUNCT'])
  if avg_similarity > 1:
    return 1

  return avg_similarity

In [56]:
similarity('good', 'bad')

0.587149346640746

In [57]:
def content_words_similarity(phrase, sentence):
  phrase_nlp = nlp(phrase)
  sent_nlp = nlp(sentence)

  root_phrase = phrase.split()[-1]

  content_words = []
  for token in sent_nlp:
    if token.tag_ in ('NN', 'NNS', 'NNP', 'NNPS'):
      content_words.append(token.text)

  # print(content_words)
  ents = []
  for ent in sent_nlp.ents:
    ents.append(ent.root.text)

  content_words = [word for word in content_words if word not in ents and word != root_phrase]

  sim = 1
  for word in content_words:
    sim *= (similarity(root_phrase, word))

  if sim == 1:
    return None

  return sim

In [58]:
train_data = []
for _, row in train.head(7000).iterrows():
  train_data.append([avg_sim(row.phrase1, row.phrase2),
                     content_words_similarity(row.phrase1, row.sentence1),
                     content_words_similarity(row.phrase2, row.sentence2),
                     row.label])

test_data = []
for _, row in test.head(1000).iterrows():
  test_data.append([avg_sim(row.phrase1, row.phrase2),
                     content_words_similarity(row.phrase1, row.sentence1),
                     content_words_similarity(row.phrase2, row.sentence2),
                    row.label])



In [59]:
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

train_data = train_data.dropna()
test_data = test_data.dropna()

X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values

X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1].values

In [60]:
model = LogisticRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

confusion_matrix(y_pred, y_test)

array([[310, 281],
       [189, 216]])

In [61]:
from sklearn.metrics import accuracy_score

accuracy_score(y_pred, y_test)

0.5281124497991968

In [62]:
from sklearn.svm import SVC

model = SVC()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

confusion_matrix(y_pred, y_test)

array([[435, 434],
       [ 64,  63]])

In [63]:
from sklearn.metrics import accuracy_score

accuracy_score(y_pred, y_test)

0.5

In [64]:
val_data = []
for _, row in val.head(2000).iterrows():
  val_data.append([avg_sim(row.phrase1, row.phrase2),
                     content_words_similarity(row.phrase1, row.sentence1),
                     content_words_similarity(row.phrase2, row.sentence2),
                    row.label])

val_data = pd.DataFrame(val_data)

val_data = val_data.dropna()

X_val = val_data.iloc[:, :-1].values
y_val = val_data.iloc[:, -1].values

In [72]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

In [73]:
confusion_matrix(y_pred, y_test)

array([[265, 255],
       [234, 242]])

In [74]:
accuracy_score(y_pred, y_test)

0.5090361445783133

In [67]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(3,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer with sigmoid activation for binary classification
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x7f10f87a1b50>

In [68]:
model.evaluate(X_test, y_test)



[0.691016435623169, 0.5401606559753418]

In [69]:
y_pred = model.predict(X_test)

y_pred = [1 if i>0.5 else 0 for [i] in y_pred]



In [70]:
THRESHOLD = 0.5

y_pred = model.predict(X_test)
y_pred = [1 if i>THRESHOLD else 0 for [i] in y_pred]

confusion_matrix(y_pred, y_test)



array([[350, 309],
       [149, 188]])

In [71]:
accuracy_score(y_pred, y_test)

0.5401606425702812