<a href="https://colab.research.google.com/github/Neha-X-code/Code.Care/blob/main/code_care.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads in later cells
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# printing the stopwords in English
# (the same word list is used by stemming() to decide which tokens to drop)
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# loading the dataset to a pandas DataFrame
# DATA_DIR is the single place to edit when the CSV files live somewhere else;
# it defaults to the directory that was originally hard-coded in both paths.
DATA_DIR = '/content'
true_dataset = pd.read_csv(DATA_DIR + '/true.csv')
fake_dataset = pd.read_csv(DATA_DIR + '/fake.csv')

In [None]:
# Combine title and body into a single text field per article.
# In pandas, NaN + str is NaN, so a row missing only its title (or only its
# body) would lose ALL of its text. Filling each column with '' first keeps
# whatever part is present; strip() removes the stray separator space that
# remains when one side is empty.
true_content = (true_dataset['Title'].fillna('') + ' ' + true_dataset['Content'].fillna('')).str.strip()
fake_content = (fake_dataset['Title'].fillna('') + ' ' + fake_dataset['Content'].fillna('')).str.strip()

In [None]:
# Creating the merged dataset: articles from the true set (label 1) come first,
# followed by articles from the fake set (label 0), with a fresh 0..n-1 index.
merged_content = pd.concat([true_content, fake_content], ignore_index=True)
merged_labels = [1] * len(true_content) + [0] * len(fake_content)
news_dataset = pd.DataFrame({'content': merged_content, 'label': merged_labels})

In [None]:
# (number of rows, number of columns) of the merged dataset
news_dataset.shape

(10254, 2)

In [None]:
# show the first 5 rows of the dataframe (columns: content, label)
news_dataset.head()

Unnamed: 0,content,label
0,Geriatrician and author louise aronson you can...,1
1,White house hopes to sign reg-reform before st...,1
2,Coronavirus hasn't stopped groups from gatheri...,1
3,Cantor seeking deal on dod cuts. . . report: d...,1
4,Mamata to hold review meeting on healthcare ev...,1


In [None]:
# counting the number of missing values in the dataset, per column
news_dataset.isnull().sum()

content    75
label       0
dtype: int64

In [None]:
# replacing the null values with empty string
# (rows whose text was missing now hold '' instead of NaN)
news_dataset = news_dataset.fillna('')

In [None]:
# Target: the 'label' column. Features: every remaining column of the frame.
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [None]:
# Inspect the feature frame and the label series side by side
print(X)
print(Y)

                                                 content
0      Geriatrician and author louise aronson you can...
1      White house hopes to sign reg-reform before st...
2      Coronavirus hasn't stopped groups from gatheri...
3      Cantor seeking deal on dod cuts. . . report: d...
4      Mamata to hold review meeting on healthcare ev...
...                                                  ...
10249  Jewish cnn claims covid-19 will surge among un...
10250  Covid news: pfizer and biontech are developing...
10251  End-life crisis marked by extravagant spending...
10252  More people are registering republican than de...
10253  Excited patient points out organ he wants from...

[10254 rows x 1 columns]
0        1
1        1
2        1
3        1
4        1
        ..
10249    0
10250    0
10251    0
10252    0
10253    0
Name: label, Length: 10254, dtype: int64


In [None]:
# Single Porter stemmer instance, reused by stemming() for every word
port_stem = PorterStemmer()

In [None]:
# Build the stop-word lookup once. stopwords.words('english') returns a list,
# so calling it inside the per-word filter rebuilt that list and scanned it
# linearly for every single token; a frozenset gives constant-time membership.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise raw text into a space-joined string of stemmed tokens.

    The text is processed in this order: every character that is not an
    ASCII letter is replaced by a space, the result is lower-cased and split
    on whitespace, English stop words are dropped, and each remaining word
    is reduced with the shared Porter stemmer.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed words joined by single spaces; '' if no word survives.
    """
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS
    )

In [None]:
# Preview the stemmer on the first 500 articles only. The result is displayed,
# not assigned, so news_dataset['content'] still holds the raw text afterwards.
news_dataset['content'].head(500).apply(stemming)

0      geriatrician author louis aronson heal import ...
1      white hous hope sign reg reform state union en...
2      coronaviru stop group gather led grim consequ ...
3      cantor seek deal dod cut report dod review sen...
4      mamata hold review meet healthcar everi day we...
                             ...                        
495    everyon need mediterranean dish eid al fitr ce...
496    india may rais vaccin spend billion fiscal yea...
497    india hurri teacher vaccin physic class resum ...
498    cash shortag threaten white hous global vaccin...
499    search consult associ hire new chief archambea...
Name: content, Length: 500, dtype: object

In [None]:
# Show the 'content' column as it currently stands in the dataset
print(news_dataset['content'])

0        Geriatrician and author louise aronson you can...
1        White house hopes to sign reg-reform before st...
2        Coronavirus hasn't stopped groups from gatheri...
3        Cantor seeking deal on dod cuts. . . report: d...
4        Mamata to hold review meeting on healthcare ev...
                               ...                        
10249    Jewish cnn claims covid-19 will surge among un...
10250    Covid news: pfizer and biontech are developing...
10251    End-life crisis marked by extravagant spending...
10252    More people are registering republican than de...
10253    Excited patient points out organ he wants from...
Name: content, Length: 10254, dtype: object


In [None]:
# Pull the text and the labels out of the frame as plain value arrays;
# from here on X is the article text and Y the matching 0/1 label.
content_column = news_dataset['content']
label_column = news_dataset['label']
X = content_column.values
Y = label_column.values

In [None]:
# Inspect the array of article texts
print(X)

['Geriatrician and author louise aronson you can’t heal, you can’t have an important therapeutic relationship in 10 minutes. So they are setting up patients for bad care and clinicians to burn out. Most people enter medicine to take good care of patients. The level of moral distress is overwhelming if you can’t do that. I have done this crazy thing where i have a less than full time job that allows me to schedule extra time so i can pay attention to my patients in the office and later write a good note. Otherwise i would have had to stop seeing patients. For a long time i wrote two notes in one  one for billing, one with important clinical information. It was insanity. Now it’s a bit better. I do only one note, but it still has a lot of irrelevant information and it takes a lot of time. I can see another provider’s note, which is great … sometimes. Yesterday i saw a neurosurgical consult from a patient with a new brain lesion. In the free text, it listed a variety of signs and importan

In [None]:
# Inspect the array of labels
print(Y)

[1 1 1 ... 0 0 0]


In [None]:
# Number of labels (one per article)
Y.shape

(10254,)

In [None]:
# converting the textual data to numerical data
# fit_transform learns the vocabulary and IDF weights and vectorises the corpus
# in one pass, instead of tokenising every article twice with fit() followed
# by transform().
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so its statistics include the held-out articles; consider
# fitting on the training portion only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [None]:
# Inspect the vectorised features: (row, feature index) followed by the weight
print(X)

  (0, 94999)	0.0805817031146481
  (0, 94970)	0.1748788168393942
  (0, 94838)	0.014941473035052772
  (0, 94828)	0.026176302485790358
  (0, 94785)	0.02315727966470167
  (0, 94717)	0.012980548653858916
  (0, 94687)	0.022113448143825872
  (0, 94332)	0.017466124393296388
  (0, 94315)	0.02537037475197131
  (0, 94299)	0.02472452954815491
  (0, 94200)	0.02276422508388292
  (0, 94197)	0.02938746658037994
  (0, 94155)	0.021983939859587954
  (0, 94054)	0.01922979444916671
  (0, 94036)	0.012691656241142307
  (0, 93910)	0.02159916637306125
  (0, 93755)	0.016775933784176352
  (0, 93729)	0.04243474006096005
  (0, 93284)	0.015133643508909554
  (0, 93235)	0.042475306231581245
  (0, 93099)	0.03764772412033096
  (0, 93082)	0.04878130998807524
  (0, 93078)	0.0498076691481274
  (0, 93036)	0.020343998385065808
  (0, 92893)	0.019962832843945188
  :	:
  (10253, 30633)	0.10064147166957964
  (10253, 30631)	0.08173256561084306
  (10253, 30601)	0.07404929809593104
  (10253, 30174)	0.06378982884297757
  (10253, 26

In [None]:
# Hold out 20% of the rows for testing; stratifying on Y keeps the label
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Logistic regression classifier created with the library's default settings
model = LogisticRegression()

In [None]:
# Train the classifier on the training portion only
model.fit(X_train, Y_train)

In [None]:
# accuracy score on the training data
# accuracy_score's signature is (y_true, y_pred); the arguments are passed by
# keyword so ground truth and predictions cannot be swapped by accident.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)

In [None]:
# Report the accuracy measured on the rows the model was trained on
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9414848226258686


In [None]:
# accuracy score on the test data
# accuracy_score's signature is (y_true, y_pred); the arguments are passed by
# keyword so ground truth and predictions cannot be swapped by accident.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)

In [None]:
# Report the accuracy measured on the held-out rows
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9000487567040468


In [None]:
# Classify one held-out article and report the verdict in words
X_new = X_test[3]

prediction = model.predict(X_new)
print(prediction)

# The merged dataset labels articles from the true set as 1 and articles from
# the fake set as 0, so a prediction of 0 means "fake" (the messages were
# previously the wrong way round).
if (prediction[0]==0):
  print('The article is Fake')
else:
  print('The article is Real')

[1]
The article is Fake


In [None]:
# True label of the same held-out article, for comparison with the prediction
print(Y_test[3])

1


In [None]:
# Selecting a few more test data points
sample_indices = [10, 20, 30]

# Making predictions on the selected data points.
# Each X_test[idx] is already a single-row matrix, so it can be passed to
# predict() directly without reshaping.
sample_predictions = [model.predict(X_test[idx])[0] for idx in sample_indices]

# Printing the predictions.
# The merged dataset labels articles from the true set as 1 and articles from
# the fake set as 0, so a prediction of 0 means "fake" (the messages were
# previously the wrong way round).
for article_number, predicted_label in enumerate(sample_predictions, start=1):
    verdict = 'Fake' if predicted_label == 0 else 'Real'
    print('The article {} is {}'.format(article_number, verdict))

# Printing the true labels for comparison
for idx in sample_indices:
    print(Y_test[idx])

The article 1 is Fake
The article 2 is Real
The article 3 is Fake
1
0
1


In [None]:
# User-defined test case
test_headline = "Geriatrician and author louise aronson"

# Preprocess the test headline.
# The vectorizer was fitted on the raw article text (the stemmed preview was
# never written back into the dataset), so the headline is vectorised raw as
# well. Stemming it first can yield word forms that do not match the fitted
# vocabulary, which silently zeroes out those features.
test_input = test_headline
test_input_transformed = vectorizer.transform([test_input])

# Make prediction
prediction = model.predict(test_input_transformed)

# Labels: 0 = fake, 1 = real
if prediction[0] == 0:
    print("The article is Fake")
else:
    print("The article is Real")


The article is Fake
