In [4]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [26]:
import math
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('omw-1.4')

stopwords = set(stopwords.words('english'))
sns.set_theme()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vrisandubey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vrisandubey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vrisandubey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/vrisandubey/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/vrisandubey/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
def preprocess_text(s):
    tokens = word_tokenize(s.lower())
    filtered_tokens = [token for token in tokens if token not in stopwords]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [12]:
reviews_df = pd.read_csv('../data/fake_reviews.csv')
reviews_df = reviews_df.rename(columns = {'text_': 'text'})
reviews_df['category'] = reviews_df['category'].apply(lambda s: s[:-2].replace('_', ' '))
reviews_df['rating'] = reviews_df['rating'].astype(int)

reviews_df['text_no_stop'] = reviews_df['text'].apply(lambda s: ' '.join([token for token in word_tokenize(s.lower()) if token not in stopwords]))
reviews_df['text_no_punct'] = reviews_df['text'].apply(lambda s: s.lower().translate(str.maketrans('', '', string.punctuation)))

reviews_df['lemma_text'] = reviews_df['text_no_punct'].apply(preprocess_text)

In [13]:
tfidf_df = pd.DataFrame()
tfidf_df['label'] = reviews_df['label']
tfidf_df['lemma_text'] = reviews_df['lemma_text']
tfidf_df['text'] = reviews_df['text'] 


sent_analyzer = SentimentIntensityAnalyzer()
tfidf_df['sentiment'] = reviews_df['lemma_text'].apply(lambda s: sent_analyzer.polarity_scores(s))
tfidf_df['neg_sentiment'] = tfidf_df.sentiment.apply(lambda dc: dc['neg'])
tfidf_df['pos_sentiment'] = tfidf_df.sentiment.apply(lambda dc: dc['pos'])
tfidf_df['neu_sentiment'] = tfidf_df.sentiment.apply(lambda dc: dc['neu'])
tfidf_df['comp_sentiment'] = tfidf_df.sentiment.apply(lambda dc: dc['compound'])

tfidf = TfidfVectorizer(sublinear_tf=True, analyzer='word', max_features=2000, tokenizer=word_tokenize)
tfidf_X = tfidf.fit_transform(tfidf_df['lemma_text']).toarray()

tfidf_val_df = pd.DataFrame(tfidf_X, columns=tfidf.get_feature_names_out())

tfidf_val_df['label'] = tfidf_df.label
tfidf_val_df['pos_sentiment'] = tfidf_df['pos_sentiment']
tfidf_val_df['neg_sentiment'] = tfidf_df['neg_sentiment']
tfidf_val_df['neu_sentiment'] = tfidf_df['neu_sentiment']
tfidf_val_df['comp_sentiment'] = tfidf_df['comp_sentiment']
tfidf_val_df['rating_sentiment_diff'] = np.abs(reviews_df['rating'] - tfidf_df['pos_sentiment'] * 5)

tfidf_X = tfidf_val_df.drop(columns=['label'])
y = tfidf_val_df.label
  
X_train, X_test, y_train, y_test = train_test_split(tfidf_X, y, test_size=0.2, random_state=42)



In [40]:
def run_ensemble(models, X_train, y_train, X_test, y_test, weights):
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.fit_transform(y_test)

    mdl_preds = []
    for i, model in enumerate(models, 1):
        model.fit(X_train, y_train_encoded)
        preds = np.array(model.predict(X_test))
        print(f'Model {i} Accuracy: {np.mean(preds == y_test_encoded)}')

        mdl_preds.append(np.array(preds))
    
    tot_preds = np.zeros(X_test.shape[0], dtype=float)
    for i, pred in enumerate(mdl_preds):
        tot_preds += weights[i] * pred

    final_preds = np.where(tot_preds > 0.5, 1, 0)
    print(f'Ensemble Accuracy: {np.mean(final_preds == y_test_encoded)}')

    return label_encoder.inverse_transform(final_preds)

In [39]:
Best_Parameters_XGB= {'colsample_bytree': 0.8, 'gamma': 2, 'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.8}
Best_Parameters_CB= {'depth': 8, 'iterations': 800, 'l2_leaf_reg': 3, 'learning_rate': 0.1}

models = [
   lgb.LGBMClassifier(objective='binary', metric='binary_logloss', learning_rate= 0.05, max_depth=-1, n_estimators=800, num_leaves=63, verbose=0),
   XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', **Best_Parameters_XGB),
   CatBoostClassifier(task_type='CPU', verbose=0, **Best_Parameters_CB)
]

run_ensemble(models, X_train, y_train, X_test, y_test, weights=[1/3, 1/3, 1/3])

Model 1 Accuracy: 0.8861135155187337
Model 2 Accuracy: 0.8590330159515271
Model 3 Accuracy: 0.8739953011005317
Ensemble Accuracy:0.880178063558798


array(['CG', 'CG', 'OR', ..., 'CG', 'OR', 'OR'], dtype=object)

In [46]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 0

lgb_weights = np.random.uniform(0.33, 0.5, 6)
best_acc = -1
best_weight = []

for i, w in enumerate(lgb_weights, 1):
    final_accs = []
    weights = [w, (1 - w) / 2, (1 - w) / 2]
    print(f'Starting CV with weights ({i} / {len(lgb_weights)}): {weights}')
    for train_index, val_index in kf.split(X_train):
        fold += 1
        train_X = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]

        val_X = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]

        print(f'Fold {fold}')
        final_preds = run_ensemble(models, train_X, train_y, val_X, val_y, weights)
        print('-----------------------------------------------------')
        
        final_accs.append(np.mean(final_preds == val_y))

    print()
    mean_acc = np.mean(final_accs)
    print(f'CV {i} Mean Accuracy: {mean_acc}')
    
    if mean_acc > best_acc:
        best_acc = mean_acc
        best_weight = weights
    
    print(f'Overall Best Accuracy: {best_acc} with weights = {best_weight}')

    print()

Starting CV with weights (1 / 6): [0.3670983510672348, 0.3164508244663826, 0.3164508244663826]
Fold 1
Model 1 Accuracy: 0.8752511980213326
Model 2 Accuracy: 0.8441799350749729
Model 3 Accuracy: 0.8610295254289689
Ensemble Accuracy: 0.8682949451228938
-----------------------------------------------------
Fold 2
Model 1 Accuracy: 0.8735507806461587
Model 2 Accuracy: 0.8554645231102179
Model 3 Accuracy: 0.863193692997372
Ensemble Accuracy: 0.8679857783274076
-----------------------------------------------------
Fold 3
Model 1 Accuracy: 0.8789611995671665
Model 2 Accuracy: 0.8568557736899057
Model 3 Accuracy: 0.8727778636574431
Ensemble Accuracy: 0.8752511980213326
-----------------------------------------------------
Fold 4
Model 1 Accuracy: 0.8735507806461587
Model 2 Accuracy: 0.8540732725305302
Model 3 Accuracy: 0.8704591126912969
Ensemble Accuracy: 0.874169114237131
-----------------------------------------------------
Fold 5
Model 1 Accuracy: 0.8715411964754985
Model 2 Accuracy: 0.846

KeyboardInterrupt: 