In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
import os
from pathlib import Path

# Folder holding True.csv and Fake.csv. Set the NEWS_DATA_DIR environment
# variable to point somewhere else; the default is the original location so
# existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get('NEWS_DATA_DIR', '/Users/payalpamnani/Desktop/News Detection'))

# One frame per source file: genuine articles and fabricated articles.
True_news = pd.read_csv(DATA_DIR / 'True.csv')
Fake_news = pd.read_csv(DATA_DIR / 'Fake.csv')

In [11]:
# Target encoding: 0 marks a genuine article, 1 marks a fake one.
True_news['label'], Fake_news['label'] = 0, 1

In [12]:
# Keep only the article body and its label from each source frame.
dataset1 = True_news.loc[:, ['text', 'label']]
dataset2 = Fake_news.loc[:, ['text', 'label']]

In [13]:
# Stack the genuine and fake articles into one frame; each row keeps the
# index label it had in its source frame.
dataset = pd.concat(objs=[dataset1, dataset2], axis=0)

# Per-column count of missing values (expected to be zero everywhere).
dataset.isna().sum()

text     0
label    0
dtype: int64

In [14]:
# Class balance of the combined frame (label 1 = fake, label 0 = true).
dataset['label'].value_counts()

1    23481
0    21417
Name: label, dtype: int64

In [15]:
# Shuffle all rows so the two classes are interleaved. The seed is fixed
# because later cells keep only the first 35,000 shuffled rows: without it the
# selected subset, the train/test split contents and every reported metric
# would change on each run.
dataset = dataset.sample(frac=1, random_state=0)

In [16]:
import nltk
# Fetch NLTK's 'stopwords' package into the local nltk_data directory.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/payalpamnani/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [19]:
# Shared WordNet lemmatizer instance. cleaning_data looks it up under the
# module-level name `ps`, so the name must stay as it is.
ps = WordNetLemmatizer()

In [20]:
# English stop words, materialised once as a set so the per-token membership
# test in cleaning_data is a hash lookup rather than a list scan.
# The corpus is reached through nltk.corpus instead of the imported
# `stopwords` name: this cell rebinds that name, so reading from it here would
# raise AttributeError the second time the cell is executed.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [21]:
# Fetch NLTK's 'wordnet' package into the local nltk_data directory
# (presumably the data behind the WordNetLemmatizer created earlier).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/payalpamnani/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [22]:
def cleaning_data(row):
    """Normalise one article into a space-separated string of lemmas.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a blank, and the result is split on whitespace. Tokens found
    in the module-level ``stopwords`` collection are dropped; the remaining
    ones are passed through ``ps.lemmatize`` and joined with single spaces.

    Parameters
    ----------
    row : str
        Raw article text.

    Returns
    -------
    str
        The cleaned text; empty when no token survives.
    """
    # Digits, punctuation and any non-ASCII characters all turn into blanks.
    letters_only = re.sub('[^a-zA-Z]', ' ', row.lower())

    kept = []
    for word in letters_only.split():
        if word in stopwords:
            continue
        kept.append(ps.lemmatize(word))

    return ' '.join(kept)

In [23]:
# Replace each raw article with its cleaned form; cleaning_data is passed
# directly since it already takes exactly one value.
dataset['text'] = dataset['text'].apply(cleaning_data)

# Confirm the cleaning step introduced no missing values.
dataset.isna().sum()

text     0
label    0
dtype: int64

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# TF-IDF over unigrams and bigrams, limited to 50,000 features. Lower-casing
# is switched off because cleaning_data has already lower-cased the text.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    lowercase=False,
)

In [27]:
# Work with the first 35,000 shuffled rows only: cleaned text as the feature,
# label as the target (columns picked by name rather than by position).
X = dataset['text'].iloc[:35000]
y = dataset['label'].iloc[:35000]

In [28]:
# Preview the first few cleaned articles.
X.head()

4097     former kkk grand wizard david duke officially ...
8216     los angeles county school board caused uproar ...
9250     important protecting dignity customer shop loc...
19407    schlotsky assistant manager seguin texas gave ...
1188     former fbi director james comey fired position...
Name: text, dtype: object

In [29]:
# Preview the labels that belong to the rows shown for X.
y.head()

4097     1
8216     1
9250     1
19407    1
1188     1
Name: label, dtype: int64

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
train_data, test_data, train_label, test_label = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [31]:
# Learn the vocabulary and IDF weights from the training text only, then
# expand the result into a dense array (rows = articles, columns = features).
vec_train_data = vectorizer.fit_transform(train_data).toarray()

In [32]:
# Project the held-out text onto the vocabulary fitted on the training data
# (transform only, no refit), then expand it to a dense array.
vec_test_data = vectorizer.transform(test_data)
vec_test_data = vec_test_data.toarray()

In [33]:
# (rows, features) of the dense train and test matrices.
vec_train_data.shape , vec_test_data.shape

((28000, 50000), (7000, 50000))

In [34]:
# Class counts in the training split (label 1 = fake, label 0 = true).
train_label.value_counts() # balanced partition

1    14687
0    13313
Name: label, dtype: int64

In [35]:
# Class counts in the test split (label 1 = fake, label 0 = true).
test_label.value_counts() # balanced partition

1    3667
0    3333
Name: label, dtype: int64

In [36]:
# Column names for the feature frames. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() when
# the installed version has it and fall back to the old method otherwise.
# The names are fetched once and shared by both frames.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense TF-IDF matrices wrapped as frames with one column per vocabulary term.
training_data = pd.DataFrame(vec_train_data, columns=feature_names)
testing_data = pd.DataFrame(vec_test_data, columns=feature_names)

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report 
# Multinomial Naive Bayes classifier with default settings.
clf = MultinomialNB()

In [39]:
# Train on the training frame, then label the held-out frame. fit() returns
# the estimator itself, which is what allows the call to be chained.
y_pred = clf.fit(training_data, train_label).predict(testing_data)

In [40]:
# How many test articles were predicted as each class.
pd.Series(y_pred).value_counts()

1    3713
0    3287
dtype: int64

In [41]:
# Actual class counts in the test split, for comparison with the predictions.
test_label.value_counts()

1    3667
0    3333
Name: label, dtype: int64

In [42]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(test_label , y_pred))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      3333
           1       0.95      0.96      0.95      3667

    accuracy                           0.95      7000
   macro avg       0.95      0.95      0.95      7000
weighted avg       0.95      0.95      0.95      7000



In [43]:
# Score the model on the data it was trained on, to compare against the test
# report and see how far apart the two are.
y_pred_train = clf.predict(training_data)
print(classification_report(y_true=train_label, y_pred=y_pred_train))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96     13313
           1       0.96      0.96      0.96     14687

    accuracy                           0.96     28000
   macro avg       0.96      0.96      0.96     28000
weighted avg       0.96      0.96      0.96     28000



In [44]:
# Overall accuracy on the training split.
accuracy_score(train_label , y_pred_train)

0.95875

In [45]:
# Overall accuracy on the held-out test split.
accuracy_score(test_label , y_pred)

0.9522857142857143

In [46]:
# Sample article for a one-off prediction, cleaned the same way as the
# training text. The text is kept verbatim, including its mis-encoded quote
# characters and fused words, because it is runtime data. It is written as two
# adjacent string literals (joined by Python at compile time) because a single
# double-quoted literal cannot span two physical lines. The str() wrapper
# around the literal was a no-op and is dropped.
news = cleaning_data(
    "CHICAGO (Reuters) - An Illinois lawmaker on Monday introduced a bill to ban the forcible removal of travelers from flights by state or local government employees after a United Airlines passenger was dragged from an aircraft last week. The Airline Passenger Protection Act, sponsored by Republican state Representative Peter Breen, came after Dr. David Dao, 69, was pulled from a United flight at Chicagoâ€TMs Oâ€TMHare International Airport to make space for four crew members. The treatment of Dao sparked international outrage, as well as multiple apologies from the carrier, and raised questions about the overbooking policies of airlines. Under Breenâ€TMs measure, passengers could not be removed from flights unless they were presenting a danger to themselves or others, an emergency was taking place or the passenger had caused a serious disturbance, according to a copy of the bill introduced in the state capital, Springfield. â€œA commercial airline that removes validly seated customers without serious cause breaches the sacred trust between passengers and their airlines,â€• the bill said. The legislation would also bar the state of Illinois from making travel arrangements, doing business with or having investments in any commercial airline that maintained a policy of removing paying passengers to make room for employees traveling on non-revenue tickets. Dao, who was traveling to Louisville, Kentucky, on April 9, suffered a broken nose, a concussion and lost two teeth when he was pulled from his seat by officers from the Chicago Department of Aviation to make room for four employees on the overbooked flight. The three officers, who have not been named, were put on paid leave last week, the department said. â€œThe treatment of the passenger in last weekâ€TMs incident at Oâ€TMHare is inexcusable and must be stopped,â€• Breen said in a statement. "
    "â€œIt reflected badly on the airline, the City of Chicago, and the State of Illinois.â€• United Chief Executive Oscar Munoz on Monday again apologized for the incident. [nL1N1HP1HM] United said on Friday itwas changing its policy on booking its flight crews onto its own planes. Thecompany did not immediately respond to a request for comment on the Illinoisbill. Lawyers for Dao have moved to preserve evidence from the flight, filing amotion to keep surveillance videos and other materials related to United Flight3411 in preparation for a possible lawsuit.  The city and United agreed topreserve the evidence, Daoâ€TMs attorney said on Saturday. "
)

In [47]:
# Vectorise the cleaned article with the already-fitted vectorizer and wrap it
# in a frame that carries the training column names. The classifier was fitted
# on a DataFrame, so passing a bare ndarray here makes scikit-learn >= 1.0
# warn that the input has no valid feature names.
news_features = pd.DataFrame(
    vectorizer.transform([news]).toarray(),
    columns=training_data.columns,
)

# Predicted label for the sample article: 0 = true news, 1 = fake news.
single_prediction = clf.predict(news_features)
single_prediction

array([0])