In [4]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

stopwords = set(stopwords.words('english'))
sns.set_theme()

[nltk_data] Downloading package punkt to /Users/vikram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vikram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/vikram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/vikram/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
reviews_df = pd.read_csv('../data/fake_reviews.csv')
reviews_df = reviews_df.rename(columns = {'text_': 'text'})
reviews_df['category'] = reviews_df['category'].apply(lambda s: s[:-2].replace('_', ' '))
reviews_df['rating'] = reviews_df['rating'].astype(int)

reviews_df['text_no_stop'] = reviews_df['text'].apply(lambda s: ' '.join([token for token in word_tokenize(s.lower()) if token not in stopwords]))
reviews_df['text_no_punct'] = reviews_df['text'].apply(lambda s: s.lower().translate(str.maketrans('', '', string.punctuation)))

In [6]:
def preprocess_text(s):
    tokens = word_tokenize(s.lower())
    filtered_tokens = [token for token in tokens if token not in stopwords]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [7]:
reviews_df['lemma_text'] = reviews_df['text_no_punct'].apply(preprocess_text)

In [8]:
reviews_df.head()

Unnamed: 0,category,rating,label,text,text_no_stop,text_no_punct,lemma_text
0,Home and Kitchen,5,CG,"Love this! Well made, sturdy, and very comfor...","love ! well made , sturdy , comfortable . love...",love this well made sturdy and very comfortab...,love well made sturdy comfortable love itvery ...
1,Home and Kitchen,5,CG,"love it, a great upgrade from the original. I...","love , great upgrade original . 've mine coupl...",love it a great upgrade from the original ive...,love great upgrade original ive mine couple year
2,Home and Kitchen,5,CG,This pillow saved my back. I love the look and...,pillow saved back . love look feel pillow .,this pillow saved my back i love the look and ...,pillow saved back love look feel pillow
3,Home and Kitchen,1,CG,"Missing information on how to use it, but it i...","missing information use , great product price !",missing information on how to use it but it is...,missing information use great product price
4,Home and Kitchen,5,CG,Very nice set. Good quality. We have had the s...,nice set . good quality . set two months,very nice set good quality we have had the set...,nice set good quality set two month


In [9]:
tfidf_df = pd.DataFrame()
tfidf_df['label'] = reviews_df['label']
tfidf_df['text'] = reviews_df['lemma_text']
tfidf_df.head()

Unnamed: 0,label,text
0,CG,love well made sturdy comfortable love itvery ...
1,CG,love great upgrade original ive mine couple year
2,CG,pillow saved back love look feel pillow
3,CG,missing information use great product price
4,CG,nice set good quality set two month


In [10]:
X = tfidf_df.drop(columns=['label'])
y = tfidf_df['label']

tfidf = TfidfVectorizer(sublinear_tf=True, analyzer='word', max_features=2000, tokenizer=word_tokenize)
tfidf_X = tfidf.fit_transform(tfidf_df['text']).toarray()

X_train, X_test, y_train, y_test = train_test_split(tfidf_X, y, test_size=0.2, random_state=42)

In [12]:
tfidf_X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.07533347, 0.        , 0.        , ..., 0.0904964 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
svm = SVC(kernel = 'rbf')
svm.fit(X_train, y_train)

In [None]:
train_preds = svm.predict(X_train)
test_preds = svm.predict(X_test)

In [35]:
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

          CG       0.97      0.97      0.97     16200
          OR       0.97      0.97      0.97     16145

    accuracy                           0.97     32345
   macro avg       0.97      0.97      0.97     32345
weighted avg       0.97      0.97      0.97     32345



In [36]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

          CG       0.89      0.86      0.88      4016
          OR       0.87      0.90      0.88      4071

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087



In [None]:
train_accs = []
test_accs = []

for C in [0.001, 0.01, 0.1]:
    print(f'Started Training C = {C}')
    svm = SVC(kernel = 'rbf', C=C, verbose=True)
    svm.fit(X_train, y_train)
    print('Done Training...Starting to predict...')
    train_preds = svm.predict(X_train)
    test_preds = svm.predict(X_test)
    
    train_accs.append(np.mean((np.array(y_train) == train_preds)))
    test_accs.append(np.mean((np.array(y_test) == test_preds)))
    print(f'Training Accuracy: {train_accs[-1]}')
    print(f'Test Accuracy: {test_accs[-1]}')
    print('Done Predicting')
    
    print(f'Done with C = {C}')
    print()

Started Training C = 0.001
[LibSVM]................
*
optimization finished, #iter = 16145
obj = -31.919241, rho = -0.823455
nSV = 32290, nBSV = 32290
Total nSV = 32290
Done Training...Starting to predict...


In [13]:
lgb_mdl = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=100
)

lgb_mdl.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 16145, number of negative: 16200
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.241273 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 157388
[LightGBM] [Info] Number of data points in the train set: 32345, number of used features: 1999
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499150 -> initscore=-0.003401
[LightGBM] [Info] Start training from score -0.003401


LGBMClassifier(feature_fraction=0.9, learning_rate=0.05,
               metric='binary_logloss', objective='binary')

In [14]:
train_preds = lgb_mdl.predict(X_train)
test_preds = lgb_mdl.predict(X_test)



In [15]:
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

          CG       0.85      0.83      0.84     16200
          OR       0.83      0.85      0.84     16145

    accuracy                           0.84     32345
   macro avg       0.84      0.84      0.84     32345
weighted avg       0.84      0.84      0.84     32345



In [16]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

          CG       0.82      0.81      0.82      4016
          OR       0.81      0.83      0.82      4071

    accuracy                           0.82      8087
   macro avg       0.82      0.82      0.82      8087
weighted avg       0.82      0.82      0.82      8087

