In [5]:
import numpy as np
import pandas as pd
import re
import nltk.corpus
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [1]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') is read from it
# by the cells below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

**Data pre-processing**

In [16]:
# Load the training set. Lines the parser cannot handle are skipped silently
# (on_bad_lines='skip'), so the loaded row count can be lower than the number
# of lines in the file.
TRAIN_CSV_PATH = '/content/train.csv'
new_data = pd.read_csv(TRAIN_CSV_PATH, engine='python', on_bad_lines='skip')

In [17]:
new_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [18]:
new_data.shape

(7352, 5)

In [19]:
#counting missing value
new_data.isnull().sum()

Unnamed: 0,0
id,0
title,205
author,698
text,14
label,0


In [22]:
# Fill missing 'title' and 'author' values with each column's most frequent
# value. (Alternative considered: new_data.fillna('') to use empty strings.)
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')
for column in ('title', 'author'):
  # The imputer is re-fitted per column, so every column gets its own mode.
  new_data[column] = imputer.fit_transform(new_data[[column]]).ravel()


In [23]:
new_data.isnull().sum()

Unnamed: 0,0
id,0
title,0
author,0
text,14
label,0


In [24]:
# Remove, in place, every row that still has a missing value in any column.
# The surviving rows keep their original index labels (no renumbering).
new_data.dropna(axis=0, how='any', inplace=True)

In [26]:
new_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [27]:
# Build the model's text field: "<author> <title>" for every row.
new_data['content'] = new_data['author'].str.cat(new_data['title'], sep=' ')

In [28]:
print(new_data['content'])

0       Darrell Lucus House Dem Aide: We Didn’t Even S...
1       Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2       Consortiumnews.com Why the Truth Might Get You...
3       Jessica Purkiss 15 Civilians Killed In Single ...
4       Howard Portnoy Iranian woman jailed for fictio...
                              ...                        
7347    Christopher Mele How to Save Money (and Hassle...
7348    Jeff Poor Bill Cosby Spokesman to Gloria Allre...
7349    Nicholas Fandos Trump Calls Press ‘Dishonest,’...
7350       The Saker CrossTalk: Bullhorns Electioneering!
7351    Breitbart London Barcelona: Police Open Fire o...
Name: content, Length: 7338, dtype: object


In [29]:
# Split the frame into candidate features (every column except the target)
# and the target column itself.
X = new_data.drop(columns=['label'])
Y = new_data['label']

In [30]:
print(X)
print(Y)

        id                                              title  \
0        0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1        1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2        2                  Why the Truth Might Get You Fired   
3        3  15 Civilians Killed In Single US Airstrike Hav...   
4        4  Iranian woman jailed for fictional unpublished...   
...    ...                                                ...   
7347  7347  How to Save Money (and Hassles) on Your Black ...   
7348  7348  Bill Cosby Spokesman to Gloria Allred: ’Go Bac...   
7349  7349  Trump Calls Press ‘Dishonest,’ Then Utters Fal...   
7350  7350               CrossTalk: Bullhorns Electioneering!   
7351  7351  Barcelona: Police Open Fire on Stolen Gas Truc...   

                  author                                               text  \
0          Darrell Lucus  House Dem Aide: We Didn’t Even See Comey’s Let...   
1        Daniel J. Flynn  Ever get the feeling your life circ

In [33]:
# Remove the 'id' column from the feature frame. X is rebound instead of being
# mutated in place, and errors='ignore' makes the cell safe to re-run: the
# in-place version raised KeyError on a second execution because 'id' was
# already gone.
X = X.drop(columns='id', errors='ignore')

**Stemming**

In [34]:
# Shared Porter stemmer instance; stemming() calls port.stem on each token.
port=PorterStemmer()


In [36]:
# Build the stop-word lookup once. stopwords.words('english') returns a list,
# so calling it for every token rebuilt that list and made each membership
# test a linear scan; a frozenset gives constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
  """Normalise a text string into space-separated Porter stems.

  Steps: replace every character that is not an ASCII letter with a space
  (this drops digits, punctuation and accented letters alike), lower-case the
  result, split on whitespace, discard English stop words, stem what is left
  with ``port`` and join the stems with single spaces.

  Args:
    content: the raw text to normalise (a ``str``).

  Returns:
    A ``str`` of stemmed tokens separated by single spaces; empty when no
    token survives the filtering.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', content)  # keep ASCII letters only
  tokens = letters_only.lower().split()
  stems = [port.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS]
  return ' '.join(stems)


In [37]:
# Replace each raw "author title" string with its stemmed form.
new_data['content'] = new_data['content'].map(stemming)

In [38]:
print(new_data['content'])

0       darrel lucu hous dem aid even see comey letter...
1       daniel j flynn flynn hillari clinton big woman...
2                  consortiumnew com truth might get fire
3       jessica purkiss civilian kill singl us airstri...
4       howard portnoy iranian woman jail fiction unpu...
                              ...                        
7347    christoph mele save money hassl black friday s...
7348    jeff poor bill cosbi spokesman gloria allr go ...
7349    nichola fando trump call press dishonest utter...
7350                    saker crosstalk bullhorn election
7351    breitbart london barcelona polic open fire sto...
Name: content, Length: 7338, dtype: object


In [39]:
# Only the stemmed 'content' text is used as model input; 'label' is the target.
X = new_data['content'].to_numpy()
Y = new_data['label'].to_numpy()

In [40]:
print(X)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'nichola fando trump call press dishonest utter falsehood new york time'
 'saker crosstalk bullhorn election'
 'breitbart london barcelona polic open fire stolen ga truck speed wrong way street']


In [41]:
print(Y)

[1 0 1 ... 0 1 0]


In [42]:
Y.shape

(7338,)

**Convert text into a TF-IDF matrix**

In [43]:
# Turn the stemmed 'content' strings into a sparse TF-IDF feature matrix.
# The vectoriser reads straight from new_data['content'] rather than from X,
# so re-running this cell no longer tries to vectorise its own numeric output.
# NOTE(review): the vocabulary and IDF weights are learned from every row,
# including the rows later held out for testing; consider fitting on the
# training split only.
tf = TfidfVectorizer()
X = tf.fit_transform(new_data['content'].values)

In [45]:
print(X)

  (0, 2295)	0.34603928837387665
  (0, 5600)	0.34603928837387665
  (0, 4417)	0.222432377802365
  (0, 2419)	0.26914716989384746
  (0, 184)	0.273612557362823
  (0, 3142)	0.23794750078734786
  (0, 8418)	0.2605146859666486
  (0, 1885)	0.24650426783674267
  (0, 5423)	0.29244185722461014
  (0, 4842)	0.25258381943244756
  (0, 1585)	0.3828961821148579
  (0, 9810)	0.28731837346213057
  (1, 2275)	0.27268888090869137
  (1, 3492)	0.7224360358196773
  (1, 4307)	0.18983646939465887
  (1, 1788)	0.18949770627065488
  (1, 968)	0.286402912300127
  (1, 10451)	0.28966582126173024
  (1, 1427)	0.3779546605663311
  (1, 1214)	0.15164775289732926
  (2, 1989)	0.4613645235153618
  (2, 1872)	0.3201007240203953
  (2, 9765)	0.42086170284649527
  (2, 6034)	0.4919860347112601
  (2, 3782)	0.34366624258290107
  :	:
  (7335, 9561)	0.10513349330168473
  (7335, 7369)	0.2931506452790562
  (7335, 1405)	0.25025124427531387
  (7335, 6480)	0.2941742057033471
  (7335, 3280)	0.36054856792077794
  (7335, 3271)	0.4279464905624996
 

**Splitting**

In [46]:
# Hold out 20% of the rows for testing; stratify on Y so both parts keep the
# same class ratio, and fix the seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [47]:
# Train a logistic-regression classifier (default settings) on the training split.
lg=LogisticRegression()
model=lg.fit(x_train,y_train)



In [50]:
# Accuracy on the same rows the model was fitted on; compare it with the
# held-out score to gauge over-fitting.
x_train_pred = model.predict(x_train)
# accuracy_score takes (y_true, y_pred); use the same order as the test-set call.
training_data_accuracy = accuracy_score(y_train, x_train_pred)
print(training_data_accuracy)

0.9841567291311755


In [48]:
# Predicted labels for the held-out test split.
y_pred= model.predict(x_test)

In [51]:
print(y_pred)
accuracy_score(y_test,y_pred)

[0 1 1 ... 1 1 0]


0.9564032697547684

In [52]:
#show the confusion matrix
confusion_matrix(y_test,y_pred)

array([[678,  57],
       [  7, 726]])

In [53]:
# Show the per-class precision / recall / F1 report for the test split.
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.92      0.95       735
           1       0.93      0.99      0.96       733

    accuracy                           0.96      1468
   macro avg       0.96      0.96      0.96      1468
weighted avg       0.96      0.96      0.96      1468



**Making a predictive system**

In [56]:
def prediction(input_data):
  """Classify one raw text string and print the verdict.

  The text is normalised with ``stemming``, vectorised with the already
  fitted ``tf`` and passed to ``model``. A predicted label of 0 is reported
  as real news; any other label is reported as fake.

  Args:
    input_data: the raw text to classify (a ``str``).

  Returns:
    None. The verdict is printed, not returned.
  """
  features = tf.transform([stemming(input_data)])
  predicted_label = model.predict(features)[0]
  verdict = 'The news is Real' if predicted_label == 0 else 'The news is Fake'
  print(verdict)

In [59]:
prediction("Rebecca Solnit :Les incendies de forêt en Californie continuent de ravager des milliers d’hectares, forçant des milliers de résidents à évacuer leurs maisons. Les pompiers luttent contre les flammes depuis plusieurs jours, mais les conditions météorologiques difficiles compliquent leurs efforts. Les autorités locales ont déclaré l’état d’urgence et demandent aux habitants de suivre les consignes d’évacuation pour assurer leur sécurité.")

The news is Fake
