# Objective:
* To develop a machine learning model that can classify news articles as either fake (0) or real (1) based on their content (text)

# Import Libraries

In [2]:
import numpy as np 
import pandas as pd
import re 
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 

# Read the file

In [3]:
df = pd.read_csv(r"E:\\NLP\\Fake news.csv",delimiter = ';',nrows = 1000)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


# To verify Null values in the file and know the information of file

In [4]:
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   title       1000 non-null   object
 2   text        1000 non-null   object
 3   label       1000 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 31.4+ KB


# Import Stopwords

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gkris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Lowering the text

In [9]:
df['cleaned_text'] = df['text'].str.lower()
df['cleaned_text']

0      ramallah, west bank (reuters) - palestinians s...
1      beijing (reuters) - u.s. president-elect donal...
2      while the controversy over trump s personal ta...
3      beijing (reuters) - a trip to beijing last wee...
4      there has never been a more uncourageous perso...
                             ...                        
995    before lavar ball, the ungrateful father of ac...
996    kigali (reuters) - a rwandan opposition offici...
997    (reuters) - the u.s. supreme court on monday u...
998    niamey/nairobi (reuters) - when doundou chefou...
999    new york (reuters) - a u.s. house of represent...
Name: cleaned_text, Length: 1000, dtype: object

# Removing punctuation

In [10]:
exclude = string.punctuation
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

df['cleaned_text'] = df['text'].apply(remove_punc)
df['cleaned_text']

0      RAMALLAH West Bank Reuters  Palestinians switc...
1      BEIJING Reuters  US Presidentelect Donald Trum...
2      While the controversy over Trump s personal ta...
3      BEIJING Reuters  A trip to Beijing last week b...
4      There has never been a more UNCOURAGEOUS perso...
                             ...                        
995    Before Lavar Ball the ungrateful father of acc...
996    KIGALI Reuters  A Rwandan opposition official ...
997    Reuters  The US Supreme Court on Monday upheld...
998    NIAMEYNAIROBI Reuters  When Doundou Chefou fir...
999    NEW YORK Reuters  A US House of Representative...
Name: cleaned_text, Length: 1000, dtype: object

# Performing Stemming

In [11]:
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])  # Use 'word' instead of reusing 'text'
df['cleaned_text'] = df['cleaned_text'].apply(stem_words)
df['cleaned_text']

0      ramallah west bank reuter palestinian switch o...
1      beij reuter us presidentelect donald trump’ ca...
2      while the controversi over trump s person tax ...
3      beij reuter a trip to beij last week by zimbab...
4      there ha never been a more uncourag person in ...
                             ...                        
995    befor lavar ball the ungrat father of accus cr...
996    kigali reuter a rwandan opposit offici and eig...
997    reuter the us suprem court on monday upheld fe...
998    niameynairobi reuter when doundou chefou first...
999    new york reuter a us hous of repres committe l...
Name: cleaned_text, Length: 1000, dtype: object

In [12]:
df['cleaned_text'][0]

'ramallah west bank reuter palestinian switch off christma light at jesu tradit birthplac in bethlehem on wednesday night in protest at us presid donald trump s decis to recogn jerusalem as israel s capit a christma tree adorn with light outsid bethlehem s church of the nativ where christian believ jesu wa born and anoth in ramallah next to the burial site of former palestinian leader yasser arafat were plung into dark the christma tree wa switch off on the order of the mayor today in protest at trump s decis said fadi ghatta bethlehem s municip media offic he said it wa unclear whether the illumin would be turn on again befor the main christma festiv in a speech in washington trump said he had decid to recogn jerusalem as israel s capit and move the us embassi to the citi isra prime minist benjamin netanyahu said trump s move mark the begin of a new approach to the israelipalestinian conflict and said it wa an histor landmark arab and muslim across the middl east condemn the us decis 

# Converting Dependent and Independent variable

In [13]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
df['label']

0      1
1      1
2      0
3      1
4      0
      ..
995    0
996    1
997    1
998    1
999    1
Name: label, Length: 1000, dtype: int64

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

# Performing Metics

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(x,y, test_size = 0.2, random_state=42)

In [16]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 158860 stored elements and shape (800, 20517)>

In [17]:
X_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 41667 stored elements and shape (200, 20517)>

In [18]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,Y_train)

In [19]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
y_pred =model.predict(X_test)
print("Accuracy:", accuracy_score(Y_test,y_pred))
print("Classification Report:",classification_report(Y_test,y_pred))
print("Confusion Matrix:", confusion_matrix(Y_test, y_pred))

Accuracy: 0.91
Classification Report:               precision    recall  f1-score   support

           0       0.95      0.85      0.90        91
           1       0.88      0.96      0.92       109

    accuracy                           0.91       200
   macro avg       0.92      0.90      0.91       200
weighted avg       0.91      0.91      0.91       200

Confusion Matrix: [[ 77  14]
 [  4 105]]
