In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-reviews/fake_reviews.csv


In [2]:
import numpy as np
import regex as re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import RobertaTokenizer, TFRobertaModel

from collections import Counter

import warnings
warnings.filterwarnings("ignore")



In [3]:
# Detect hardware and pick the matching tf.distribute strategy.
try:
    # TPU detection. No parameters necessary if the TPU_NAME environment variable is set.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable name; the `experimental` alias is deprecated.
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU could be resolved: fall back to the default strategy (CPU / single GPU).
    strategy = tf.distribute.get_strategy()

print('Number of replicas:', strategy.num_replicas_in_sync)

Number of replicas: 1


In [4]:
# --- Notebook configuration -------------------------------------------------
MODEL_NAME = 'roberta-base'      # checkpoint name handed to from_pretrained()
MAX_LEN = 256                    # number of token ids per encoded review
ARTIFACTS_PATH = '../artifacts/'

# Scale the global batch size with the number of replicas in the strategy.
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
EPOCHS = 3

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(ARTIFACTS_PATH, exist_ok=True)

In [4]:
# Read the fake-reviews dataset and preview its first rows.
DATA_PATH = '/kaggle/input/fake-reviews/fake_reviews.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,review_headline,review_body,fake_review,cleaned_review_body
0,★ THESE REALLY DO WORK GREAT WITH SOME TWEAKING ★,"These Really Do Work Great, But You Do Need To...",0,work great need know thing year paid dollar pr...
1,Favorite for winter. Very warm!,I love this dress. Absolute favorite for winte...,0,love dress absolute favorite winter heavy mate...
2,Great Socks for the money.,"Nice socks, great colors, just enough support ...",0,nice sock great color support wearing good pai...
3,Slick hat!,"I bought this for my husband and WOW, this is ...",0,bought husband slick high quality craftsmanshi...
4,I would do it again!,Perfect dress and the customer service was awe...,0,perfect dress customer service awesomei


In [5]:
!pip install neattext
import neattext as nt
def text_preprocessing(text):
    """Run the first-pass neattext clean-up steps on one review string.

    The steps are applied in order, each receiving the previous step's output.

    Args:
        text: Raw review text.

    Returns:
        The text after all clean-up steps.
    """
    cleaning_steps = (
        nt.fix_contractions,        # I'm -> I am
        nt.remove_urls,             # strip URLs
        nt.remove_non_ascii,        # strip non-ASCII characters
        nt.remove_numbers,          # strip numbers
        nt.remove_multiple_spaces,  # collapse repeated spaces
    )
    for step in cleaning_steps:
        text = step(text)
    return text

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3
[0m

In [6]:
# Missing review bodies are NaN, and str(NaN) would inject the literal word "nan"
# into the cleaned text. Treat missing bodies as empty text before cleaning.
df['cleaned_review_body'] = (
    df['review_body']
    .fillna('')
    .apply(lambda body: text_preprocessing(str(body)))
)

In [7]:
def postprocessing(text):
    """Run the second-pass neattext clean-up on an already pre-processed string.

    Order: stopwords, special characters, emojis, then short words
    (``remove_shortwords`` is called with a length argument of 3).

    Args:
        text: Text produced by the first cleaning pass.

    Returns:
        The text after all four removal steps.
    """
    without_stopwords = nt.remove_stopwords(text)
    without_specials = nt.remove_special_characters(without_stopwords)
    without_emojis = nt.remove_emojis(without_specials)
    return nt.remove_shortwords(without_emojis, 3)

In [8]:
# Second cleaning pass over the cleaned column; each value is coerced to str first.
df['cleaned_review_body'] = df['cleaned_review_body'].map(
    lambda cleaned: postprocessing(str(cleaned))
)

In [9]:
# Report per-column null counts when any value in the frame is missing.
null_counts = df.isnull().sum()
print('Feature ', end=' ')
if null_counts.gt(0).any():
    print('Missing Data\n')
    print(null_counts)
else:
    print('NO missing data')

Feature  Missing Data

review_headline         2
review_body            36
fake_review             0
cleaned_review_body     0
dtype: int64


In [10]:
# Fill missing values in every text column with that column's most frequent value.
for column in ['review_headline', 'review_body', 'cleaned_review_body']:
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130000 entries, 0 to 129999
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   review_headline      130000 non-null  object
 1   review_body          130000 non-null  object
 2   fake_review          130000 non-null  int64 
 3   cleaned_review_body  130000 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.0+ MB


In [11]:
# Report the frame size, then drop exact duplicate rows (keeping the first occurrence).
print('Data Size {}'.format(df.shape))
duplicate_mask = df.duplicated()
if duplicate_mask.any():
    print('Duplicate rows found')
    print('Number of duplicate rows= ', int(duplicate_mask.sum()))
    df.drop_duplicates(keep='first', inplace=True)
    df.reset_index(drop=True, inplace=True)
    print('Dropping duplicates\n')
    print(df.shape)
else:
    print('NO duplicate data')

Data Size (130000, 4)
Duplicate rows found
Number of duplicate rows=  11366
Dropping duplicates

(118634, 4)


In [12]:
# Only the label and the cleaned text are used from here on; drop the raw text columns.
raw_text_columns = ['review_headline', 'review_body']
df.drop(columns=raw_text_columns, inplace=True)
df.head()

Unnamed: 0,fake_review,cleaned_review_body
0,0,work great need know things years first paid d...
1,0,love dress absolute favorite winter heavy mate...
2,0,nice socks great colors support wearing good p...
3,0,bought husband slick high quality craftsmanshi...
4,0,perfect dress customer service awesomei again


In [13]:
MAX_LEN = 256
def roberta_encode(texts, tokenizer, max_len=None):
    """Encode texts as fixed-width id/mask arrays for the RoBERTa Keras model.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in the
    start and end special-token ids, and right-padded to ``max_len``.

    Args:
        texts: Sized sequence of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)`` (e.g. a HuggingFace tokenizer).
        max_len: Width of every encoded row. Defaults to the module-level
            ``MAX_LEN`` (looked up at call time).

    Returns:
        dict of int32 arrays, each of shape ``(len(texts), max_len)``:
        ``'input_word_ids'`` (token ids, padded with 1), ``'input_mask'``
        (1 on real tokens, 0 on padding) and ``'input_type_ids'`` (all zeros).

    Raises:
        ValueError: If ``max_len`` is smaller than 2, which leaves no room for
            the two special tokens.
    """
    if max_len is None:
        max_len = MAX_LEN
    if max_len < 2:
        raise ValueError('max_len must be at least 2 to hold the start and end tokens')

    # Special-token ids are hard-coded for the roberta-base vocabulary:
    # <s> = 0 (start), <pad> = 1, </s> = 2 (end).
    start_id, pad_id, end_id = 0, 1, 2

    n_texts = len(texts)
    input_ids = np.full((n_texts, max_len), pad_id, dtype='int32')
    attention_mask = np.zeros((n_texts, max_len), dtype='int32')
    token_type_ids = np.zeros((n_texts, max_len), dtype='int32')  # Not used in text classification

    for row, text in enumerate(texts):
        # Truncate before id conversion so start + tokens + end always fits in max_len
        tokens = tokenizer.tokenize(text)[:max_len - 2]
        row_ids = [start_id] + list(tokenizer.convert_tokens_to_ids(tokens)) + [end_id]

        input_ids[row, :len(row_ids)] = row_ids
        # Attend only to the real (non-padding) positions
        attention_mask[row, :len(row_ids)] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

In [14]:
# Transform categories into consecutive integer ids, numbered in order of first appearance.
X_data = df['cleaned_review_body'].to_numpy().reshape(-1)

# pd.factorize returns a fresh codes array, so the DataFrame column is never written to.
# Assigning into the array returned by to_numpy() can alias the column's data, and that
# array is read-only when pandas Copy-on-Write is enabled.
y_data, unique_labels = pd.factorize(df['fake_review'])
assert (y_data >= 0).all(), 'fake_review contains missing labels'

category_to_name = dict(enumerate(unique_labels))
category_to_id = {label: category_id for category_id, label in category_to_name.items()}

# Display dictionary
category_to_name

{0: 0, 1: 1}

In [28]:
# Hold out 20% of the samples as the test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=777,
)


In [29]:
# Build the tokenizer here: no earlier cell in the notebook defines `tokenizer`, so a
# fresh-kernel run would otherwise stop with a NameError on the next line.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

# Replace the raw text arrays with the dict of model inputs returned by roberta_encode
X_train = roberta_encode(X_train, tokenizer)
X_test = roberta_encode(X_test, tokenizer)

# Labels as int32 arrays
y_train = np.asarray(y_train, dtype='int32')
y_test = np.asarray(y_test, dtype='int32')

In [30]:
def build_model(n_categories):
    """Build and compile the RoBERTa-based text classifier.

    The model takes three int32 inputs of width ``MAX_LEN`` named
    ``input_word_ids``, ``input_mask`` and ``input_type_ids``, feeds them to the
    pretrained ``MODEL_NAME`` encoder, and classifies from the encoder's first
    output via Dropout -> Flatten -> Dense(256, relu) -> Dense(softmax).

    Args:
        n_categories: Number of output classes (units of the softmax layer).

    Returns:
        A compiled ``tf.keras.Model`` (Adam at 1e-5, sparse categorical
        cross-entropy loss, accuracy metric), created inside ``strategy.scope()``.
    """
    with strategy.scope():
        input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
        input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
        input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')

        # Import RoBERTa model from HuggingFace
        roberta_model = TFRobertaModel.from_pretrained(MODEL_NAME)
        x = roberta_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)

        # Huggingface transformers have multiple outputs, embeddings are the first one,
        # so let's slice out the first position
        x = x[0]

        x = tf.keras.layers.Dropout(0.1)(x)
        # NOTE(review): Flatten feeds every sequence position (padding included) into the
        # Dense layer; classifying from the first token only is the usual alternative.
        # TODO confirm whether this head is intended.
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(256, activation='relu')(x)
        x = tf.keras.layers.Dense(n_categories, activation='softmax')(x)

        model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=x)
        model.compile(
            # `learning_rate` is the supported argument name; `lr` is a deprecated alias
            # that newer Keras optimizers reject.
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'])

        return model

In [31]:
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
categories = df['fake_review'].unique()
n_categories = len(categories)
print('n_categories', n_categories)

# Reuse the `strategy` selected by the hardware-detection cell near the top of the
# notebook. Detecting and initializing the TPU a second time is redundant, and
# re-initializing a TPU system can invalidate state already placed on it.
print('Number of replicas:', strategy.num_replicas_in_sync)

# build_model() enters strategy.scope() itself, so no extra scope is needed here.
model = build_model(n_categories)
model.summary()

n_categories 2
Number of replicas: 1


Downloading tf_model.h5:   0%|          | 0.00/657M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  124645632  ['input_word_ids[0][0]',         
 el)                            thPoolingAndCrossAt               'input_mask[0][0]',         

In [32]:
# Training hyper-parameters for this run (these replace the earlier config values).
EPOCHS = 2
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

fit_options = dict(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    validation_data=(X_test, y_test),  # the held-out split is used as validation data
)

with strategy.scope():
    print('Training...')
    history = model.fit(X_train, y_train, **fit_options)

Training...
Epoch 1/2
Epoch 2/2


In [33]:
# model.evaluate returns [loss, accuracy] for the compiled loss and metric list.
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % test_accuracy_pct)

Accuracy: 81.33%


In [37]:
from sklearn.metrics import classification_report
preds = model.predict(X_test)
if len(preds.shape) > 1 and preds.shape[1] > 1:
    # One probability per class: pick the most likely class for each sample
    y_pred = preds.argmax(axis=1)
else:
    # Single probability per sample: threshold at 0.5 and flatten a possible
    # (n, 1) column so the labels are 1-D like y_test
    y_pred = (preds > 0.5).astype(int).ravel()

# Predictions and labels must line up one-to-one. Fail loudly on a mismatch
# instead of silently truncating y_test and scoring misaligned samples.
y_true = y_test
assert y_true.shape[0] == y_pred.shape[0], (
    'prediction/label count mismatch: %d vs %d' % (y_pred.shape[0], y_true.shape[0])
)

# Generate classification report
report = classification_report(y_true, y_pred)

# Print the classification report
print(report)

              precision    recall  f1-score   support

           0       0.81      1.00      0.90     19296
           1       0.00      0.00      0.00      4431

    accuracy                           0.81     23727
   macro avg       0.41      0.50      0.45     23727
weighted avg       0.66      0.81      0.73     23727

