In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.svm import LinearSVC


## Read the Dataset

In [22]:
# Read the news articles from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="fake_or_real_news.csv")
# A bare trailing expression renders the frame as the cell output.
df

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


## Text Encoding
Machine learning models work with numerical values rather than strings, so we replace the label 'FAKE' with 0 and 'REAL' with 1.

In [23]:
# Integer code for each string label: FAKE -> 0, REAL -> 1.
d = {"FAKE": 0, "REAL": 1}
# Translate each label explicitly and cast to int64 instead of relying on
# Series.replace, whose implicit object -> int downcasting is deprecated in
# recent pandas releases. Values that are not keys of `d` pass through
# unchanged, so re-running this cell on already-encoded labels is harmless,
# while an unexpected label makes the cast raise instead of slipping through.
df["label"] = df["label"].map(lambda label: d.get(label, label)).astype("int64")

## After Encoding



In [70]:
# Show the first rows to confirm that the label column now holds 0/1 codes.
df.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


## Assigning Data to X , y

In [25]:
# Features: the article body text.
X = df["text"]
# Target: the label column.
y = df["label"]

In [26]:
y  # display the target labels (one entry per article)

0       0
1       0
2       1
3       0
4       1
       ..
6330    1
6331    0
6332    0
6333    1
6334    1
Name: label, Length: 6335, dtype: int64

## Splitting The Data Into Train and Test
We split the data into a training set and a test set, using 80% of the articles for training and 20% for testing.

In [27]:
# Hold out 20% of the articles for evaluation. A fixed random_state makes the
# shuffle repeatable, so the same rows land in the train and test sets on every
# run; without it the split (and everything computed from it) changes each time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [52]:
X_test  # display the held-out article texts

2180    Kenyan refugee kills co-worker, self 3 others ...
1999    As demonstrations erupted in Cologne on Saturd...
792       Sean Adl-Tabatabai in News , World // 0 Comm...
5381    There is an path for Democrats to regain the p...
346     Podesta wiki leaks...We prefer Muslims over Ch...
                              ...                        
619     With Trump's call for a temporary ban on Musli...
733     — Bernie Sanders (@BernieSanders) October 27, ...
500     Anonymous hacker Deric Lostutter faces 16 year...
1301    Subscribe My daughter and me \nYesterday was N...
972     Getting 10 Minutes of Sunlight Per Day Can Sto...
Name: text, Length: 1267, dtype: object

##  Convert Text Data into numerical representations
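These lines of code use the TfidfVectorizer from scikit-learn to convert the text in X_train and X_test into numerical TF-IDF features. Common English stop words are ignored, and `max_df=0.7` drops any term that appears in more than 70% of the documents. The vectorizer is fitted on the training data only and then applied unchanged to the test data, so no information from the test set leaks into the vocabulary. The resulting transformed data is stored in X_train_vectorized and X_test_vectorized.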

In [29]:
# TF-IDF featuriser: skips English stop words and any term whose document
# frequency exceeds 0.7.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
)

# Fit on the training texts only, then apply the fitted vectorizer to the
# test texts.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [30]:
# Inspect the vectorized training data (rendered as the cell output).
X_train_vectorized

<5068x61592 sparse matrix of type '<class 'numpy.float64'>'
	with 1333516 stored elements in Compressed Sparse Row format>

## Creating the Model and Fitting It to the Data

In [31]:
# Linear support-vector classifier. random_state pins the data shuffling done
# by the solver when the dual formulation is used, so refitting on the same
# data gives the same model.
clf = LinearSVC(random_state=42)
# Train on the TF-IDF features of the training articles.
clf.fit(X_train_vectorized, y_train)

LinearSVC()

## Calculating the Accuracy Score of the Classifier

In [32]:
# Score the fitted classifier on the held-out test features and labels.
clf.score(X=X_test_vectorized, y=y_test)

0.930544593528019

## Test the model

In [64]:
# Pick a test article whose true label is REAL (encoded as 1). Selecting by
# label rather than by a fixed position guarantees the sample really is a REAL
# article no matter how the random split ordered the test set.
Real_text = X_test[y_test == 1].iloc[0]

In [68]:
# Vectorize the single article with the already-fitted vectorizer; it is passed
# as a one-element list.
vectorized_text = vectorizer.transform([Real_text])
# Predicted class for the article: 1 means REAL, 0 means FAKE.
clf.predict(vectorized_text)    

array([1], dtype=int64)

In [67]:
# Pick a test article whose true label is FAKE (encoded as 0). Selecting by
# label rather than by a fixed position guarantees the sample really is a FAKE
# article no matter how the random split ordered the test set.
fake_text = X_test[y_test == 0].iloc[0]

In [69]:
# Vectorize the single article with the already-fitted vectorizer; it is passed
# as a one-element list.
vectorized_text = vectorizer.transform([fake_text])   
# Predicted class for the article: 1 means REAL, 0 means FAKE.
clf.predict(vectorized_text)

array([0], dtype=int64)