In [109]:
from sklearn.pipeline import Pipeline

from sklearn.datasets import fetch_openml

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd


In [110]:
X1,y = fetch_openml(data_id=43592,return_X_y=True,as_frame=True,
                   parser='auto')

In [111]:
X = X1['text'].copy()
y = X1['label'].copy()

X = X.str.replace("nn"," ",regex=False)
X = X.str.replace(" n[A-Z]"," ",regex=True)

In [112]:
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,random_state=42,shuffle=True,test_size=.2)

In [113]:
y

0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
        ... 
6330    REAL
6331    FAKE
6332    FAKE
6333    REAL
6334    REAL
Name: label, Length: 6335, dtype: object

In [114]:
model = Pipeline(
    steps=[
        ('binarizer',CountVectorizer(ngram_range=(1,1),analyzer='word',max_df=.8,min_df=.0005,binary=True)),
        ('learner',RandomForestClassifier(random_state=42))
    ]
).fit(Xtrain,ytrain)

In [115]:
yfit = model.predict(Xtrain)
ypred = model.predict(Xtest)

In [116]:
from sklearn.metrics import accuracy_score

In [117]:
accuracy_score(ytest,ypred)

0.9321231254932912

In [118]:
Xtest.values[0]



In [119]:
ytest

1357    FAKE
2080    FAKE
2718    FAKE
812     FAKE
4886    FAKE
        ... 
1512    FAKE
57      REAL
6092    REAL
3403    REAL
292     REAL
Name: label, Length: 1267, dtype: object

In [120]:
new_text = pd.Series(
    ['Today in Brooklyn, a man was gunned down by two armed female attackers. The assailants are still at large. Anyone with information should contact the NYPD.']
)

In [121]:
model.predict(new_text)

array(['FAKE'], dtype=object)

In [122]:
new_text = pd.Series(
    [
        """
        In his more than three decades in politics, Benjamin Netanyahu has accrued almost as many nicknames as he has election wins.

        There’s “The Magician” for his uncanny ability to grab victory from the jaws of defeat. “King Bibi” for staying atop Israeli politics longer than anyone else. And, universally, though not necessarily affectionately: plain old “Bibi”. But there is another one he revelled in, and which now appears in tatters: “Mr Security.” How did it all go so wrong?

        """
    ]
)
model.predict(new_text)

array(['FAKE'], dtype=object)

In [123]:
clf = model['learner']
vect = model['binarizer']

In [124]:
imps = pd.DataFrame(
    {'token':vect.get_feature_names_out(),
     'imp':clf.feature_importances_}
)

In [125]:
imps[imps['imp']>0].sort_values(by='imp',ascending=False).head(40)

Unnamed: 0,token,imp
18415,obama,0.008519
22473,republican,0.007886
18545,october,0.007397
307,2016,0.006738
23875,sen,0.005904
11735,gop,0.005343
22475,republicans,0.004968
27494,tuesday,0.004612
4337,candidates,0.003903
23330,said,0.003795


In [108]:
Xtrain[Xtrain.str.lower().str.contains("nthe")].values[0]

"Time: Investigating Hillary is an Attack on All Women November 1, 2016 nGood morning. It's Tuesday. nWho's up for another silly attempt to claim that Hillary Clinton is only being investigated for her rogue email setup because she's a woman? This gem comes from Robin Lakoff, a Berkeley professor in sustained incoherence and special pleading. nHillary Clintons Emailgate Is an Attack on Women n'It's not about emails; it's about public communication by a woman nI am mad. I am mad because I am scared. And if you are a woman, you should be, too. Emailgate is a bitch hunt, but the target is not Hillary Clinton. Its us. nThe only reason the whole email flap has legs is because the candidate is female. Can you imagine this happening to a man? nHis name was General Petraeus. Thank you. Have a nice day. nClinton is guilty of SWF (Speaking While Female), and emailgate is just a reminder to us all that she has no business doing what shes doing and must be punished, for the sake of all decent wome