In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

In [2]:
df =pd.read_csv("news.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
df.shape

(6335, 4)

In [6]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE


In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [8]:
df.label = le.fit_transform(df.label)

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


In [10]:
y = df.label
 
df.drop("label", axis=1) 
 
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.33, random_state=53)

In [11]:
count_vectorizer = CountVectorizer(stop_words='english') 

count_train = count_vectorizer.fit_transform(X_train) 

count_test = count_vectorizer.transform(X_test)

In [12]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 

tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

tfidf_test = tfidf_vectorizer.transform(X_test)

In [13]:
count_train.A.shape

(4244, 56922)

In [14]:
len(count_vectorizer.get_feature_names())

56922

#### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression() 


In [16]:
clf.fit(tfidf_train, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
tfidf_train.shape

(4244, 56922)

In [18]:
y_train.shape,tfidf_test.shape

((4244,), (2091, 56922))

In [19]:
pred = clf.predict(tfidf_test)

In [20]:
from sklearn.metrics import accuracy_score
print(" Acuracy is",accuracy_score(y_test, pred, normalize=True, sample_weight=None)*100,"%")

 Acuracy is 91.39167862266858 %


In [21]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,pred))

[[962  46]
 [134 949]]


##### Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
clf1=RandomForestClassifier()
clf1.fit(tfidf_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
pred = clf1.predict(tfidf_test)
from sklearn.metrics import accuracy_score
print(" Acuracy is",accuracy_score(y_test, pred, normalize=True, sample_weight=None)*100,"%")

 Acuracy is 91.10473457675752 %


In [24]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,pred))

[[923  85]
 [101 982]]


In [25]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier()

In [26]:
linear_clf = PassiveAggressiveClassifier()
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:", score*100)

accuracy: 93.87852702056433


In [28]:
print(confusion_matrix(y_test,pred))

[[ 956   52]
 [  76 1007]]
