<a href="https://colab.research.google.com/github/Anushree-B/Lie-detector/blob/main/Lie_detector_neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import nltk

In [2]:
tf.__version__

'2.17.0'

In [3]:
df = pd.read_csv("Data/politifact_cleaned.csv")

In [4]:
df.head()

Unnamed: 0,Quote,word_count,word_length,adv_count,adj_count,noun_count,verb_count,det_count,sentiment,named_entities_count,Truth value
0,violent crime is near a record year low,8,4.0,0.0,1.0,4.0,1.0,1.0,-0.8591,0,
1,under joe biden there are record numbers of ...,11,5.181818,0.0,3.0,4.0,1.0,0.0,0.0,1,
2,today the gap between african american and wh...,18,4.5,0.0,5.0,4.0,2.0,2.0,0.0,4,
3,republicans have shown themselves willing to ...,29,5.862069,1.0,3.0,10.0,5.0,2.0,-0.7351,3,
4,says in the presidential election i won...,14,4.714286,0.0,1.0,5.0,3.0,1.0,0.5719,1,


# Applying neural network model

In [5]:
# Drop rows with NaN values
df = df.dropna(subset = 'Truth value')
print(df['Truth value'].isna().sum())
print(df['Truth value'].value_counts())

0
Truth value
2.0    394
0.0    256
1.0    243
Name: count, dtype: int64


In [6]:
X = df['Quote']
y = df['Truth value']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42,stratify=y)

In [7]:
# Vectorize the text

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [8]:
# Import the machine learning model and train it

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, precision_score

models = [LogisticRegression(max_iter=1000), DecisionTreeClassifier(), 
        RandomForestClassifier(n_estimators=200), SVC(), MLPClassifier(),
        MultinomialNB(), BernoulliNB()]
# Logistic Regression
for model in models:
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)
    print("Model: ", model)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("F1 Score: ", f1_score(y_test, y_pred, average = 'weighted'))
    print("Recall: ", recall_score(y_test, y_pred, average = 'weighted'))
    print("Precision: ", precision_score(y_test, y_pred, average = 'weighted'))
    print(classification_report(y_test, y_pred))


Model:  LogisticRegression(max_iter=1000)
Accuracy:  0.4972067039106145
F1 Score:  0.4606722140464893
Recall:  0.4972067039106145
Precision:  0.4886224281237226
              precision    recall  f1-score   support

         0.0       0.44      0.29      0.35        51
         1.0       0.50      0.22      0.31        49
         2.0       0.51      0.80      0.62        79

    accuracy                           0.50       179
   macro avg       0.48      0.44      0.43       179
weighted avg       0.49      0.50      0.46       179

Model:  DecisionTreeClassifier()
Accuracy:  0.44692737430167595
F1 Score:  0.4369627108446084
Recall:  0.44692737430167595
Precision:  0.4329490377592928
              precision    recall  f1-score   support

         0.0       0.42      0.39      0.40        51
         1.0       0.32      0.24      0.28        49
         2.0       0.52      0.61      0.56        79

    accuracy                           0.45       179
   macro avg       0.42      0.4

TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.

from keras.utils import to_categorical
# One-hot encode the integer labels
y_one_hot = to_categorical(y, num_classes=3)

In [7]:
x = df.drop(columns = ['Truth value'])
x.shape

(893, 9)

In [8]:
y_one_hot.shape

(893, 3)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y_one_hot, test_size = 0.15, random_state = 42)

In [10]:
X_train

Unnamed: 0,word_count,word_length,adv_count,adj_count,noun_count,verb_count,det_count,sentiment,named_entities_count
1046,20,4.300000,1.0,3.0,4.0,3.0,2.0,0.2960,0
1056,23,4.521739,1.0,1.0,8.0,5.0,3.0,-0.7239,0
464,17,5.588235,0.0,4.0,7.0,2.0,1.0,-0.1027,1
389,11,3.545455,1.0,1.0,3.0,2.0,1.0,0.0000,0
822,15,4.266667,1.0,1.0,5.0,5.0,0.0,0.0000,1
...,...,...,...,...,...,...,...,...,...
264,10,4.600000,0.0,2.0,5.0,1.0,0.0,-0.3818,1
428,12,4.083333,1.0,1.0,1.0,4.0,1.0,0.0000,0
1074,15,4.666667,0.0,2.0,3.0,2.0,2.0,-0.1280,1
593,14,4.214286,0.0,2.0,4.0,1.0,2.0,0.0000,1


In [11]:
y_train

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [12]:
y_test

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1

In [13]:
x.head()

Unnamed: 0,word_count,word_length,adv_count,adj_count,noun_count,verb_count,det_count,sentiment,named_entities_count
153,12,4.5,1.0,0.0,3.0,5.0,0.0,0.25,0
154,11,5.454545,1.0,2.0,4.0,1.0,0.0,-0.7184,2
155,16,4.6875,0.0,0.0,7.0,5.0,0.0,-0.6124,2
156,10,4.7,1.0,2.0,3.0,2.0,0.0,0.0,0
157,23,4.478261,0.0,0.0,7.0,7.0,1.0,0.0,3


In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [15]:
# Building ann
#inititalising the ann
ann = tf.keras.models.Sequential()
# adding the input layer

ann.add(tf.keras.layers.Dense(units=9,input_shape = (9,), activation='relu'))
# adding the 1st hidden layer

ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
# adding the seconf hidden layer

ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
# adding the output layer

ann.add(tf.keras.layers.Dense(units=3, activation='softmax'))
# training the ann

# compiling the ann

ann.compile(optimizer = 'adam' , loss = 'categorical_crossentropy', metrics = ['accuracy']) # adam = commmon, stochastic gradient descrnt

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# train ann on the training set

ann.fit(X_train, y_train, batch_size = 8, epochs = 100, validation_split=0.1, callbacks=[tf.keras.callbacks.EarlyStopping(patience=4)])

Epoch 1/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.3517 - loss: 1.1046 - val_accuracy: 0.3421 - val_loss: 1.1392
Epoch 2/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4105 - loss: 1.1002 - val_accuracy: 0.3553 - val_loss: 1.1294
Epoch 3/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4282 - loss: 1.0762 - val_accuracy: 0.3553 - val_loss: 1.1305
Epoch 4/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4354 - loss: 1.0731 - val_accuracy: 0.3684 - val_loss: 1.1261
Epoch 5/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4551 - loss: 1.0645 - val_accuracy: 0.3684 - val_loss: 1.1240
Epoch 6/100
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4931 - loss: 1.0491 - val_accuracy: 0.3684 - val_loss: 1.1247
Epoch 7/100
[1m86/86[0m [32m━━━

<keras.src.callbacks.history.History at 0x1f2068cded0>

In [17]:
from sklearn.metrics import accuracy_score
y_pred = ann.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)
accuracy_score = accuracy_score(y_true, y_pred_classes)
print("Accuracy score:", accuracy_score)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Accuracy score: 0.41044776119402987


# Predicting result with new quote

In [18]:
nltk.download('averaged_perceptron_tagger') #used for tagging words with their parts of speech (POS)
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download VADER lexicon (if not already downloaded)
nltk.download('vader_lexicon')
import spacy
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [20]:
def sentiment_score(text):
  # Create a Vader SentimentIntensityAnalyzer object
  analyzer = SentimentIntensityAnalyzer()
  # Get sentiment scores (compound score for overall sentiment)
  sentiment = analyzer.polarity_scores(text)
  return sentiment["compound"]

def named_entities(text):
  # Create a spaCy document
  doc = nlp(text)
  # Extract named entities and their labels (PERSON, ORG, etc.)
  entities = [(entity.text, entity.label_) for entity in doc.ents]
  return entities

In [21]:
def preprocess_quote(quote):
  adv_count = 0
  adj_count = 0
  noun_count = 0
  verb_count = 0
  det_count = 0
  words = quote.split()
  tagged_words = nltk.pos_tag(words)
  for word, tag in tagged_words:
    if tag.startswith('RB'):
      adv_count += 1
    elif tag.startswith('JJ'):
      adj_count += 1
    elif tag.startswith('NN'):
      noun_count += 1
    elif tag.startswith('VB'):
      verb_count += 1
    elif tag.startswith('DT'):
      det_count += 1
  word_count = len(quote.split())
  word_length = sum(len(word) for word in quote.split())
  sentiment = sentiment_score(quote)
  named_entitity = named_entities(quote)
  named_entities_count = len(named_entitity)

  return word_count,word_length,adv_count,adj_count,noun_count,verb_count,det_count,sentiment, named_entities_count

In [22]:
word_count,word_length,adv_count,adj_count,noun_count,verb_count,det_count,sentiment, named_entities_count = preprocess_quote("Hello i am anushree")

In [23]:
noun_count

2

In [43]:
new_df = pd.DataFrame(preprocess_quote("This is a project has a good accuracy and a bear made it.")).transpose()
new_df.columns = ['word_count','word_length','adv_count','adj_count','noun_count','verb_count','det_count','sentiment','named_entities_count']
new_df.head()

Unnamed: 0,word_count,word_length,adv_count,adj_count,noun_count,verb_count,det_count,sentiment,named_entities_count
0,13.0,45.0,0.0,1.0,4.0,3.0,4.0,0.4404,0.0


In [44]:
# Saving the pickle file for the scaler
import pickle
with open('scaler.pkl', 'wb') as file:
    pickle.dump(sc, file)

In [45]:
check = new_df
# Applying the scaling to the new data
check = sc.transform(check)
check

array([[-4.92575394e-01,  6.90387827e+01, -8.49699119e-01,
        -5.50795497e-01, -5.03405597e-01, -3.15970413e-02,
         2.77748773e+00,  1.10917783e+00, -1.25509348e+00]])

In [46]:
# Make predictions
new_quote_prediction = ann.predict(new_df)
new_quote_prediction_class = np.argmax(new_quote_prediction)

# Print the prediction
if new_quote_prediction_class == 0:
  print("The new quote is likely to be true.")
elif new_quote_prediction_class == 1:
  print("The new quote is likely to be mostly true or half-true.")
else:
  print("The new quote is likely to be false or pants-on-fire.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
The new quote is likely to be false or pants-on-fire.
