In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.metrics import accuracy_score

In [2]:
# Load the labelled review corpus; 'text_' is the review body, 'label' the class.
data = pd.read_csv('/kaggle/input/fake-reviews/fake reviews dataset.csv')
review_texts = data['text_']

# Build the vocabulary from every review, then turn each review into a
# sequence of integer token ids padded into one 2-D feature matrix.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_texts)
X = pad_sequences(tokenizer.texts_to_sequences(review_texts))

# Encode the string labels as integers for the model.
label_encoder = LabelEncoder()
label_encoder.fit(data['label'])
y = label_encoder.transform(data['label'])

In [3]:
n_splits = 3
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Shuffled k-fold cross-validation: train a fresh model on each training split
# and report its accuracy on the held-out split.
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    print(f"Training fold {fold}/{n_splits}")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # NOTE: this name is rebound every iteration, so after the loop it refers
    # to the model trained on the LAST fold only; later cells use that model.
    catboost_regressor = CatBoostRegressor(verbose=False)
    catboost_regressor.fit(X_train, y_train)

    # The regressor returns unbounded real-valued scores, not class labels.
    y_pred = catboost_regressor.predict(X_test)

    # Threshold at 0.5 instead of rounding: round() can yield labels outside
    # {0, 1} (e.g. -1.0 or 2.0) that are always scored as wrong, and it sends
    # exactly 0.5 to 0, whereas filter_original_reviews treats 0.5 as class 1.
    y_pred_labels = (y_pred >= 0.5).astype(int)

    accuracy = accuracy_score(y_test, y_pred_labels)
    print(f"Validation Accuracy: {accuracy}")

Training fold 1/3
Validation Accuracy: 0.8918979077014394
Training fold 2/3
Validation Accuracy: 0.8932254952882689
Training fold 3/3
Validation Accuracy: 0.8876604585590265


In [4]:
def filter_original_reviews(reviews, threshold=0.5):
    """Return the reviews that the trained model scores as original ('OR').

    All reviews are tokenized with the notebook-level ``tokenizer``, padded or
    truncated to the training sequence length (``X.shape[1]``) and scored by
    ``catboost_regressor`` in a single batched ``predict`` call.

    A score below ``threshold`` is treated as computer-generated ('CG') and the
    review is dropped; everything else is treated as 'OR' and kept.  This
    assumes the label column contains only 'CG' and 'OR', which
    ``LabelEncoder`` would encode as 0 and 1 respectively -- TODO confirm
    against the dataset.

    Args:
        reviews: Iterable of review strings (a list, tuple, generator, ...).
        threshold: Score cut-off between 'CG' and 'OR'. Defaults to 0.5.

    Returns:
        A list with the reviews classified as 'OR', in their input order.
        An empty input yields an empty list without touching the model.
    """
    # Materialize once: the reviews are needed both for scoring and for output.
    reviews = list(reviews)
    if not reviews:
        return []

    review_seqs = tokenizer.texts_to_sequences(reviews)
    reviews_padded = pad_sequences(review_seqs, maxlen=X.shape[1])

    # One predict call for the whole batch instead of one call per review.
    scores = catboost_regressor.predict(reviews_padded)

    # "not score < threshold" mirrors the original rule exactly: a review is
    # only discarded when its score is strictly below the cut-off.
    return [
        review
        for review, score in zip(reviews, scores)
        if not score < threshold
    ]

In [5]:
# Smoke-test the filter on a handful of hand-written example reviews.
reviews = [
    "This product is amazing!",
    "The quality of this product is top-notch.",
    "I am very satisfied with this product.",
    "This review was written by a computer.",
]
original_reviews = filter_original_reviews(reviews)
print("Original Reviews:", original_reviews)

Original Reviews: ['The quality of this product is top-notch.', 'I am very satisfied with this product.', 'This review was written by a computer.']


In [6]:
# Persist the model currently bound to `catboost_regressor` (the one left over
# from the final cross-validation fold) to the Kaggle working directory.
model_path = '/kaggle/working/catboost_model.cbm'
catboost_regressor.save_model(model_path)