#**Downloading dataset from kaggle to colab**


**upload json file**

In [1]:
# install en_core_web_lg
!spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 75 bytes


In [3]:
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Dataset URL: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
License(s): CC-BY-NC-SA-4.0
Downloading fake-and-real-news-dataset.zip to /content
 93% 38.0M/41.0M [00:01<00:00, 34.4MB/s]
100% 41.0M/41.0M [00:01<00:00, 23.8MB/s]


In [4]:
!unzip /content/fake-and-real-news-dataset.zip

Archive:  /content/fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


#**Fake News Classification**

In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [6]:
# read dataset
Fake_News = pd.read_csv('/content/Fake.csv')[['text']]
True_News = pd.read_csv('/content/True.csv')[['text']]
Fake_News['target'] = 0
True_News['target'] = 1
# concat Fake_News and True News
data = pd.concat([Fake_News, True_News])

In [7]:
data.head()

Unnamed: 0,text,target
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [8]:
# shuffle dataset
data = data.sample(frac=1).reset_index(drop=True)

In [9]:
data.head()

Unnamed: 0,text,target
0,WARNING: Disturbing image of a man being shot....,0
1,Sean Hannity is definitely going to blow a gas...,0
2,JERUSALEM (Reuters) - U.S. President Donald Tr...,1
3,CARACAS (Reuters) - Former Argentine soccer gr...,1
4,"21st Century Wire says Incredibly, on the same...",0


In [10]:
print(f"Dataset Shape: {data.shape}")

Dataset Shape: (44898, 2)


In [11]:
# figure out if the class if balance or not and replace 0 with Fake_News and 1 with True_News
data['target'].value_counts()

target
0    23481
1    21417
Name: count, dtype: int64

**preprocessing Text using Spacy**

In [12]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [13]:
# Convert Column "text" into vector
data['text'] = data['text'].apply(lambda x: nlp(x).vector)

In [16]:
data.head()

Unnamed: 0,text,target
0,"[-2.160595, 0.9243787, -2.2421672, 0.99040425,...",0
1,"[-1.4992006, 1.388146, -2.5538058, -0.02530843...",0
2,"[-1.9733213, 1.2886957, -1.3022742, 1.3254069,...",1
3,"[-1.7985013, 0.26322412, -1.6491295, 0.5489365...",1
4,"[-1.6356444, 0.29413742, -1.5767752, 0.5951295...",0


In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['text'].values, data['target'],
                                                    test_size=0.2,stratify=data['target'],random_state=42)

In [19]:
# Convert X_train, X_test into 2D array :)
x_train_Stack = np.stack(X_train)
x_test_Stack = np.stack(X_test)

**Train Naive Bayes Model**

MultinomialNB model doesn't deal with negative values so u have to scale train & test dataset :)

**Attempt 1**

In [21]:
# MultinomialNB model doesn't deal with negative values so u have to scale train & test dataset :):
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train_Stack = scaler.fit_transform(x_train_Stack)
x_test_Stack = scaler.transform(x_test_Stack)

In [22]:
# Train Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train_Stack, y_train)

In [23]:
# predict X_test
y_pred = model.predict(x_test_Stack)

In [24]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.85      0.87      4696
           1       0.84      0.89      0.87      4284

    accuracy                           0.87      8980
   macro avg       0.87      0.87      0.87      8980
weighted avg       0.87      0.87      0.87      8980



**Attempt 2**

In [27]:
# train KNN model
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train_Stack, y_train)

In [28]:
# predict
y_pred = model.predict(x_test_Stack)

In [29]:
# classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      4696
           1       0.97      0.97      0.97      4284

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980

