In [156]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [157]:
ColOfDataset = [
    "title",
    "text",
    "language",
    "spam_score",
    "replies_count",
    "participants_count",
    "likes",
    "comments",
    "shares",
    "type"]
df = pd.read_csv("fake_news_dataset.csv",usecols = ColOfDataset)
df.head()

Unnamed: 0,title,text,language,spam_score,replies_count,participants_count,likes,comments,shares,type
0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,0.0,0,1,0,0,0,bias
1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,0.0,0,1,0,0,0,bias
2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,0.0,0,1,0,0,0,bias
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,0.068,0,0,0,0,0,bias
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,0.865,0,0,0,0,0,bias


In [158]:
set(df["type"]) # Fake news is defines by type = fake

{'bias', 'bs', 'conspiracy', 'fake', 'hate', 'junksci', 'satire', 'state'}

In [159]:
print(set(df["language"])) #multiple language news is present

{'ignore', 'russian', 'portuguese', 'norwegian', 'english', 'greek', 'italian', 'turkish', 'polish', 'arabic', 'dutch', 'finnish', 'spanish', 'chinese', 'german', 'french'}


In [160]:
#using only english news
df_eng = df[df["language"] == "english"]
df_eng = df_eng.dropna()
df_eng = df_eng.drop("language", axis=1)
df_eng.head()

Unnamed: 0,title,text,spam_score,replies_count,participants_count,likes,comments,shares,type
0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,0.0,0,1,0,0,0,bias
1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,0.0,0,1,0,0,0,bias
2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,0.0,0,1,0,0,0,bias
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,0.068,0,0,0,0,0,bias
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,0.865,0,0,0,0,0,bias


In [161]:
# Define a convenience function to convert categorical features into numerical
features = 0
FeatureMap = {}
def add_feature(name):
    if name not in FeatureMap:
        global features
        FeatureMap[name] = features
        features += 1

add_feature("fake")
add_feature("real")
print(FeatureMap)

{'fake': 0, 'real': 1}


In [162]:
# Function to convert type column to fake and real
def ArticleType(row):
    if row["type"] == "fake":
        return FeatureMap["fake"]
    else:
        return FeatureMap["real"]

#type = 1 => real news
#type = 0 => fake news
df_eng["type"] = df_eng.apply(ArticleType, axis=1)
df_eng.head()

Unnamed: 0,title,text,spam_score,replies_count,participants_count,likes,comments,shares,type
0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,0.0,0,1,0,0,0,1
1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,0.0,0,1,0,0,0,1
2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,0.0,0,1,0,0,0,1
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,0.068,0,0,0,0,0,1
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,0.865,0,0,0,0,0,1


In [163]:
trainSet,testSet = train_test_split(df_eng,test_size=0.2)

In [164]:
VectorizerText = TfidfVectorizer()
VectorizerTitle = TfidfVectorizer()

In [165]:
vectorized_text = VectorizerText.fit_transform(trainSet.pop("text").values)

In [166]:
vectorized_title = VectorizerTitle.fit_transform(trainSet.pop("title").values)

In [167]:
from scipy import sparse

spam_score_train = sparse.csr_matrix(trainSet["spam_score"].values).transpose()
replies_count_train = sparse.csr_matrix(trainSet["replies_count"].values).transpose()
participants_count_train = sparse.csr_matrix(trainSet["participants_count"].values).transpose()
likes_train = sparse.csr_matrix(trainSet["likes"].values).transpose()
comments_train = sparse.csr_matrix(trainSet["comments"].values).transpose()
shares_train = sparse.csr_matrix(trainSet["shares"].values).transpose()

In [168]:
from scipy.sparse import hstack

X_train = hstack(
    [
        vectorized_text,
        vectorized_title,
        spam_score_train,
        replies_count_train,
        participants_count_train,
        likes_train,
        comments_train,
        shares_train,
    ]
)
y_train = trainSet.pop("type").values

In [169]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

In [170]:
clf.fit(X_train, y_train)

RandomForestClassifier()

In [171]:
vectorized_text_test = VectorizerText.transform(testSet.pop("text").values)
vectorized_title_test = VectorizerTitle.transform(testSet.pop("title").values)

In [172]:
spam_score_test = sparse.csr_matrix(testSet["spam_score"].values).transpose()
replies_count_test = sparse.csr_matrix(testSet["replies_count"].values).transpose()
participants_count_test = sparse.csr_matrix(testSet["participants_count"].values).transpose()
likes_test = sparse.csr_matrix(testSet["likes"].values).transpose()
comments_test = sparse.csr_matrix(testSet["comments"].values).transpose()
shares_test = sparse.csr_matrix(testSet["shares"].values).transpose()

In [173]:
X_test = hstack(
    [
        vectorized_text_test,
        vectorized_title_test,
        spam_score_test,
        replies_count_test,
        participants_count_test,
        likes_test,
        comments_test,
        shares_test,
    ]
)
y_test = testSet.pop("type").values

In [174]:
clf.score(X_test, y_test)

0.997431506849315

In [185]:
df.loc[(df['type'] == 'fake')]

Unnamed: 0,title,text,language,spam_score,replies_count,participants_count,likes,comments,shares,type
57,The Amish In America Commit Their Vote To Dona...,18 SHARE The Amish in America have committed t...,english,0.0,0,0,0,0,0,fake
58,Obama Signs Executive Order Declaring Investig...,64 SHARE President Obama has signed an Executi...,english,0.009,0,0,0,0,0,fake
379,Comment on HALLOWEEN IN THE CASTRO RETURNS IN ...,"adobochron 10 Comments \nSan Francisco, Califo...",english,0.0,0,1,0,0,0,fake
380,Comment on Tutorial: Riding The Philippine Jee...,"adobochron 1 Comment \nMANILA, Philippines (Th...",english,0.0,0,1,0,0,0,fake
381,Comment on What White House Executive Chef Com...,"adobochron 1 Comment Comerford \nWASHINGTON, D...",english,0.0,0,1,0,0,0,fake
382,Comment on Philippines Voids Building Permit O...,adobochron 7 Comments A rendering of the Trump...,english,0.0,0,1,0,0,0,fake
383,Comment on Hillary Clinton Campaign Logo Has A...,"adobochron 1 Comment \nSAN FRANCISCO, Californ...",english,0.0,0,1,0,0,0,fake
384,Comment on Philippine Government To ‘Take Back...,"adobochron 2 Comments \nMANILA, Philippines ( ...",english,0.0,0,1,0,0,0,fake
385,"Comment on If Elected President, Donald Trump ...",adobochron 2 Comments The Trump hotels in Las ...,english,0.0,0,1,0,0,0,fake
386,Comment on WHITE HOUSE EXECUTIVE CHEF REVEALS ...,"adobochron 3 Comments Comerford \nWashington, ...",english,0.0,0,1,0,0,0,fake
