In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# READ DATASETS
# Both CSV files are looked up relative to the current working directory.
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

In [None]:
fake.shape

In [None]:
true.shape

In [None]:
# DATA CLEANING AND PREPARATION
# Tag every row of each frame with a constant 'target' label so fake and
# real articles can still be told apart once the frames are combined.
for frame, label in ((fake, 'fake'), (true, 'true')):
    frame['target'] = label

In [None]:
fake.head()

In [None]:
true.head()

In [None]:
# CONCATENATE DATAFRAMES
# ignore_index=True renumbers the rows 0..n-1 as part of the concat itself,
# so no separate reset_index step is needed.
data = pd.concat([fake, true], ignore_index = True)
data.shape

In [None]:
data.head(5)

In [None]:
data.tail(5)

In [None]:
# SHUFFLE THE DATA
from sklearn.utils import shuffle
# Seed the shuffle: without it the row order changes on every run, so the rows
# picked by the later (seeded) train/test split, the reported accuracy and the
# saved model all change too. 42 matches the seed used for the split and the tree.
data = shuffle(data, random_state = 42)
# Renumber rows 0..n-1 so the index reflects the shuffled order.
data = data.reset_index(drop = True)

In [None]:
# CHECK THE DATA
data.head()

In [None]:
data.info()

In [None]:
# REMOVING THE TITLE
# Rebind `data` to a frame without the "title" column rather than mutating in place.
data = data.drop(columns = ["title"])
data.head()

In [None]:
# CONVERT TO LOWERCASE
# Use the vectorised string accessor instead of a per-row Python lambda.
# Unlike the lambda, .str.lower() leaves missing values (NaN) untouched
# instead of raising AttributeError on them.
data['text'] = data['text'].str.lower()
data.head()

In [None]:
# REMOVE PUNCTUATION (clean up all of the news text content)
import string
# Translation table that deletes every character in string.punctuation.
# Built once here so it is not rebuilt for every article.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def punctuation_removal(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters stripped are exactly those in ``string.punctuation``;
    letters, digits, whitespace and non-ASCII symbols are kept as they are.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without punctuation characters.
    """
    # str.translate deletes the characters in one pass instead of building
    # an intermediate Python list one character at a time.
    return text.translate(_PUNCTUATION_TABLE)
data['text'] = data['text'].apply(punctuation_removal)

In [None]:
# CHECK
data.head()

In [None]:
# REMOVING STOPWORDS
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# The corpus file id is 'english' (lower case). 'English' only resolves on
# case-insensitive filesystems and fails to load on case-sensitive ones.
stop = stopwords.words('english')
# Every token of every article is tested for membership, so use a set
# (constant-time lookup) instead of scanning the list each time.
stop_set = set(stop)

data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))

In [None]:
data.head()

In [None]:
# BASIC DATA EXPLORATION (number of news records in each category)
# how many articles per subject?
# Aggregate once and reuse the result for both the printout and the bar chart.
articles_per_subject = data.groupby(['subject'])['text'].count()
print(articles_per_subject)
articles_per_subject.plot(kind = "bar")
plt.show()

In [None]:
# how many fake and real articles?
# Aggregate once and reuse the result for both the printout and the bar chart.
articles_per_target = data.groupby(['target'])['text'].count()
print(articles_per_target)
articles_per_target.plot(kind = "bar")
plt.show()

In [None]:
# Most frequent word counter
from nltk import tokenize
token_space = tokenize.WhitespaceTokenizer()

def counter(text, column_text, quantity):
    """Plot the most frequent whitespace-separated tokens of one text column.

    Parameters
    ----------
    text : table indexable by column name (the notebook passes a DataFrame)
        Source rows; ``text[column_text]`` must yield strings.
    column_text : str
        Name of the column whose strings are counted.
    quantity : int
        How many of the most frequent tokens to show.

    The chart is displayed with ``plt.show()``; nothing is returned.
    """
    # Join every row into one long string and split it on whitespace.
    corpus = ' '.join(text[column_text])
    tokens = token_space.tokenize(corpus)
    freq_dist = nltk.FreqDist(tokens)

    # Tabulate token -> count and keep only the `quantity` largest counts.
    top_words = pd.DataFrame(
        {"Word": list(freq_dist.keys()), "frequency": list(freq_dist.values())}
    ).nlargest(n = quantity, columns = "frequency")

    plt.figure(figsize=(12,8))
    axes = sns.barplot(data = top_words, x = "Word", y = "frequency", color = "blue")
    axes.set(ylabel = "Count")
    plt.xticks(rotation = 'vertical')
    plt.show()

In [None]:
counter(data[data["target"] == "fake"],"text",20);

In [None]:
counter(data[data["target"] == "true"],"text",20);

In [None]:
# Modeling
# function to plot the confusion matrix
from sklearn import metrics
import itertools

def plot_confusion_matrix(cm, classes,normalize = False, title = 'confusion matrix', cmap = plt.cm.Blues):
    """Draw a confusion matrix as an annotated image on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; cell ``[i, j]`` is drawn at row ``i``, column ``j``.
    classes : sequence
        Tick labels, used for both axes in the order given.
    normalize : bool, default False
        If True, each row is divided by its row sum before plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib figure.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the image, the colour bar and the cell
    # annotations all show the same values.
    if normalize:
        cm = cm.astype('float')/cm.sum(axis = 1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is, fractions with two decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than the midpoint get white text so the number stays legible.
    thres = cm.max()/2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment = "center",  # "centre" is rejected by matplotlib
                 color = "white" if cm[i, j] > thres else "black")

    # Layout and axis labels are set once, for both the raw and normalised modes.
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('predicted label')

In [None]:
# Split Data
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data['text'],
    data['target'],
    test_size = 0.2,
    random_state = 42,
)

In [None]:
x_train.head()

In [None]:
y_train.head()

In [None]:
# Decision Tree Classifier -- per the author's note, chosen because it
# separates the two classes almost perfectly on this dataset.

from sklearn.tree import DecisionTreeClassifier

# Vectorizing and applying TF-IDF, then classifying with an entropy-based tree
tree_classifier = DecisionTreeClassifier(criterion='entropy', max_depth=20, splitter='best', random_state=42)
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', tree_classifier),
])

# Fitting the model
model = pipe.fit(x_train, y_train)

# Accuracy on the held-out test split, as a percentage rounded to two decimals
prediction = model.predict(x_test)
accuracy_pct = round(accuracy_score(y_test, prediction)*100, 2)
print("accuracy:{}%".format(accuracy_pct))

In [None]:
cm = metrics.confusion_matrix(y_test, prediction)

# plot_confusion_matrix(cm, classes=['fake', 'Real'] )

In [None]:
cm

In [None]:
import seaborn as sns

In [None]:
sns.heatmap(cm, annot = True, fmt = 'd')

In [None]:
x_test.shape

In [None]:
t = '''The advent of the World Wide Web and the rapid adoption of social media platforms (such as Facebook and Twitter) paved the way for information dissemination that has never been witnessed in the human history before. Besides other use cases, news outlets benefitted from the widespread use of social media platforms by providing updated news in near real time to its subscribers. The news media evolved from newsp became easier for consumers to acquire the latest news at their fingertips. Facebook referrals account for 70% of traffic to news websites [2]. These social media platforms in their current state are extremely powerful and useful for their ability to allow users to discuss and share ideas and debate over issues such as democracy, education, and health. However, such platforms are also used with a negative perspective by certain entities commonly for monetary gain [3, 4] and in other cases for creating biased opinions, manipulating mindsets, and spreading satire or absurdity. The phenomenon is commonly known as fake news.'''

In [None]:
model.predict(np.array([t]))

In [None]:
np.array([t])

In [51]:
import pickle

In [56]:
# Persist the fitted pipeline with pickle.
# The open / dump / close steps used to live in three separate cells; a failed
# dump or a skipped cell left the file handle open. The context manager closes
# the file even if pickling raises.
# NOTE: despite the '.h5' extension this file is a pickle, not HDF5 -- read it
# back with pickle.load (and only from a trusted source).
with open('model.h5','wb') as file:
    pickle.dump(model, file)

In [59]:
import os

In [60]:
os.getcwd()

'C:\\Users\\Leesha\\PycharmProjects\\MachineLearning'