In [4]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [6]:
# Read the cleaned-text CSV and discard the 'Unnamed: 0' column in one chained
# expression (presumably a saved index column -- confirm against the CSV writer).
df = pd.read_csv('../../datasets/Fake_finder/clean_text.csv').drop(columns=['Unnamed: 0'])

# Fraction of missing values per column; left as the last expression so the
# notebook displays it as the cell output.
df.isna().mean()

clean_text    0.004604
label         0.000000
dtype: float64

In [7]:
# Keep only fully populated rows; rebinding `df` gives the same frame as the
# in-place drop.
df = df.dropna()

In [8]:
# Split the raw text BEFORE vectorising so that the TF-IDF vocabulary, the IDF
# weights and the min_df / max_df pruning are learned from training rows only.
# Fitting the vectoriser on the full corpus first would leak information from
# the held-out rows into the features and inflate the test score.
# The split arguments (stratify on the label, random_state=42) are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], stratify=df['label'], random_state=42
)

tf = TfidfVectorizer(max_df=0.8, min_df=3, stop_words='english', ngram_range=(1, 2))

X_train = tf.fit_transform(text_train)  # fit on training text only
X_test = tf.transform(text_test)        # reuse the fitted vocabulary / IDF

# Full-corpus feature matrix in df's row order, built with the train-fitted
# vectoriser; later cells score every row of df through `X`.
X = tf.transform(df['clean_text'])
y = df['label']

In [None]:
# Build the classifier from the directly imported XGBClassifier class rather
# than through the `xgb` module alias. The assignment below rebinds `xgb` to the
# fitted estimator, so `xgb.XGBClassifier(...)` would raise AttributeError the
# second time this cell is run. The variable name `xgb` is kept because the
# following cells call xgb.score / xgb.predict / xgb.predict_proba on it.
xgb = XGBClassifier(max_depth=8, n_jobs=-1, verbosity=1)

xgb.fit(X_train, y_train)

In [None]:
# Report the estimator's .score() on the training split and on the held-out split.
train_score = xgb.score(X_train, y_train)
print(f'Training score is {train_score}')

test_score = xgb.score(X_test, y_test)
print(f'Test score is {test_score}')

In [None]:
# Score every row of the feature matrix X (training and held-out rows alike)
# and attach the results to df.
probs = xgb.predict_proba(X)
pred_label = xgb.predict(X)

df['pred'] = pred_label

# predict_proba returns one column per class, so take each column with a slice
# instead of copying it element by element in a Python loop.
# NOTE(review): the names assume column 0 is the "true" class and column 1 the
# "fake" class -- confirm against the label encoding / xgb.classes_.
prob_true = probs[:, 0]
prob_fake = probs[:, 1]

df['prob_true'] = prob_true
df['prob_fake'] = prob_fake

In [None]:
# Compare the stored predictions with the ground-truth labels.
# NOTE(review): df['pred'] was produced for every row of df, which includes the
# rows the model was fitted on, so these figures are not held-out estimates.
y_true = df["label"]
y_hat = df["pred"]

print(f'The f1 score is {f1_score(y_true, y_hat)}')

print()

print(f'The precision score is {precision_score(y_true, y_hat)}')


print(f'The recall score is {recall_score(y_true, y_hat)}')