In [7]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv('../../datasets/Fake_finder/clean_text.csv')

df.drop(columns = 'Unnamed: 0', inplace = True)

df.dropna(inplace = True)

In [4]:
tf = TfidfVectorizer(max_df=0.8, min_df = 3, stop_words = 'english', ngram_range=(1,2))

X = tf.fit_transform(df['clean_text'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [6]:
rf = RandomForestClassifier(max_depth = 8, n_jobs=-1, verbose = 1)

rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=1,
                       warm_start=False)

In [10]:
cross_val_score(rf, X_train, y_train, n_jobs= -1, verbose=1).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    5.5s finished


0.720453274043939

In [18]:
cross_val_score(rf, X_test, y_test, n_jobs= -1, verbose=1).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.5s finished


0.7196195597798899

In [19]:
words = tf.vocabulary_.keys()

In [22]:
feature_df = pd.DataFrame(words)

In [23]:
feature_df['Importance'] = rf.feature_importances_

In [24]:
feature_df.columns= ['Word', 'Importance']

In [25]:
feature_df.sort_values(by = 'Importance', ascending= False)

Unnamed: 0,Word,Importance
201123,used building,0.029989
321980,11 began,0.027042
445886,received notification,0.023532
416570,seeking unfiltered,0.022224
290068,supporting government,0.019550
...,...,...
182155,using american,0.000000
182154,east using,0.000000
182153,trying sort,0.000000
182152,devoted trying,0.000000
