In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled news articles; the rest of the notebook uses the
# 'text' and 'label' columns of this frame.
df = pd.read_csv("news.csv")

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and therefore every metric reported below) reproducible under
# Restart Kernel -> Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[['text']], df['label'], test_size=0.2, random_state=42
)
X_train.head()

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK's English stop-word list, extended with a few punctuation tokens that
# should never reach the tagger.
eng_stopwords = set(stopwords.words("english"))
addToSW = {"'", ":", ";"}
# set.union() returns a new set and leaves the receiver untouched, so the
# extra tokens were silently discarded; update() mutates in place.
eng_stopwords.update(addToSW)

# Shared lemmatizer and TF-IDF vectorizer used by the preprocessing helpers.
# `cv` is fitted on the training data and reused as-is for test/prediction.
lemma = WordNetLemmatizer()
cv = TfidfVectorizer()

Defining Tag Map

In [None]:
# Maps Penn Treebank POS tags (as produced by nltk.pos_tag) to the WordNet
# part of speech handed to the lemmatizer. A value of None marks a tag whose
# tokens are not lemmatized and are left out of the processed text.
tag_map = {
    'CC': None,       # coordinating conjunction (and, but, or)
    'CD': wn.NOUN,    # cardinal number (one, two)
    'DT': None,       # determiner (a, the)
    'EX': wn.ADV,     # existential 'there' (there)
    'FW': None,       # foreign word (mea culpa)
    'IN': wn.ADV,     # preposition / subordinating conjunction (of, in, by)
    'JJ': wn.ADJ,     # adjective (yellow)
    'JJR': wn.ADJ,    # adjective, comparative (bigger)
    'JJS': wn.ADJ,    # adjective, superlative (wildest)
    'LS': None,       # list item marker (1, 2, One)
    'MD': None,       # modal (can, should)
    'NN': wn.NOUN,    # noun, singular or mass (llama)
    'NNS': wn.NOUN,   # noun, plural (llamas)
    'NNP': wn.NOUN,   # proper noun, singular (IBM)
    'NNPS': wn.NOUN,  # proper noun, plural (Carolinas)
    'PDT': wn.ADJ,    # predeterminer (all, both)
    'POS': None,      # possessive ending ('s)
    'PRP': None,      # personal pronoun (I, you, he)
    'PRP$': None,     # possessive pronoun (your, one's)
    'RB': wn.ADV,     # adverb (quickly, never)
    'RBR': wn.ADV,    # adverb, comparative (faster)
    'RBS': wn.ADV,    # adverb, superlative (fastest)
    # NOTE(review): this entry used to be the list [wn.ADJ, wn.ADJ_SAT]. A list
    # is not a valid single POS for the lemmatizer, so particles appear to have
    # been dropped by the lemmatization step's error handling. None keeps that
    # outcome explicit; switch to a single wn.* constant if particles should
    # be kept.
    'RP': None,       # particle (up, off)
    'SYM': None,      # symbol (+, %, &)
    'TO': None,       # "to" (to)
    'UH': None,       # interjection (ah, oops)
    'VB': wn.VERB,    # verb, base form (eat)
    'VBD': wn.VERB,   # verb, past tense (ate)
    'VBG': wn.VERB,   # verb, gerund (eating)
    'VBN': wn.VERB,   # verb, past participle (eaten)
    'VBP': wn.VERB,   # verb, non-3sg present (eat)
    'VBZ': wn.VERB,   # verb, 3sg present (eats)
    'WDT': None,      # wh-determiner (which, that)
    'WP': None,       # wh-pronoun (what, who)
    'WP$': None,      # possessive wh- (whose)
    'WRB': None,      # wh-adverb (how, where)
    '$': None,        # dollar sign ($)
    '#': None,        # pound sign (#)
    # The Penn Treebank tags for quotes are two backticks and two apostrophes.
    # The previous version used the key '"' twice, so the second entry just
    # overwrote the first and neither real tag was covered.
    '``': None,       # left (opening) quote
    "''": None,       # right (closing) quote
    '(': None,        # left parenthesis ([, (, {, <)
    ')': None,        # right parenthesis (], ), }, >)
    ',': None,        # comma (,)
    '.': None,        # sentence-final punctuation (. ! ?)
    ':': None,        # mid-sentence punctuation (: ; ... - --)
}

Helper Functions and Main Preprocessing Pipeline

In [None]:
def labelPreprocess(txt):
    """Encode a label as an integer: 0 for exactly "FAKE", 1 for anything else."""
    return 0 if txt == "FAKE" else 1

def tokenTagingPreprocess(row):
    """Casefold tokens, drop stop words, and POS-tag what remains.

    Parameters
    ----------
    row : iterable of str
        Tokens of one document.

    Returns
    -------
    The result of ``pos_tag`` on the casefolded tokens that are not in the
    module-level ``eng_stopwords`` set.
    """
    folded_tokens = (token.casefold() for token in row)
    kept = [token for token in folded_tokens if token not in eng_stopwords]
    return pos_tag(kept)

def lemmaPreprocess(row):
    """Lemmatize POS-tagged tokens, keeping only tags with a usable WordNet POS.

    Parameters
    ----------
    row : iterable of (token, tag) pairs
        Output of ``tokenTagingPreprocess`` for one document.

    Returns
    -------
    list of str
        Lemmas of the tokens whose tag maps to a single WordNet POS string in
        ``tag_map``. Tokens whose tag is missing from ``tag_map`` or maps to a
        non-string value (e.g. None) are left out.

    Notes
    -----
    The previous version wrapped the lemmatizer call in a bare ``except`` and
    relied on the resulting errors to drop unwanted tags. That also hid genuine
    failures (for example a missing WordNet corpus), yielding empty documents
    without any warning. The drop rule is now explicit and real errors
    propagate to the caller.
    """
    rtRow = []
    for word, tag in row:
        pos = tag_map.get(tag)
        # Unknown tag, or a tag deliberately mapped to "no lemma": skip it.
        if not isinstance(pos, str):
            continue
        rtRow.append(lemma.lemmatize(word, pos=pos))
    return rtRow

def makeSTR(row):
    """Concatenate the strings in ``row`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(row)

def Prerocess(X, Y, is_test):
    """Turn raw articles and labels into a TF-IDF matrix and a 0/1 label array.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a 'text' column with one document per row.
    Y : pandas.Series
        Labels, in the same row order as ``X``.
    is_test : bool
        False fits the module-level vectorizer ``cv`` on these documents and
        transforms them; True only transforms with the already fitted ``cv``.

    Returns
    -------
    list
        ``[tfidf_matrix, labels]``. Documents that have no tokens left after
        stop-word removal and lemmatization are removed together with their
        labels, so both elements always have the same number of rows.
    """
    df = pd.DataFrame({'label': Y.apply(labelPreprocess)})

    # 1- Tokenization (the list is assigned positionally, so X and Y must be
    #    in the same row order)
    df['tokens'] = [word_tokenize(row) for row in X['text']]

    # 2- Remove stop words and get POS tags: ('Ahmed', 'NNP')
    df['tagged'] = df['tokens'].apply(tokenTagingPreprocess)

    # 3- Lemmatization with POS tag: is -> be
    df['textWork'] = df['tagged'].apply(lemmaPreprocess)

    # 4- Remove rows that ended up empty (only stop words / dropped tags).
    #    The previous expression, df.loc(len(df['textWork']) > 0).obj, tested
    #    a single boolean about the frame length and handed back the original
    #    frame, so nothing was ever filtered.
    df = df[df['textWork'].map(len) > 0]

    # 5- Token lists -> space-separated strings for the vectorizer
    documents = df['textWork'].apply(makeSTR)

    # 6- Generate TF-IDF
    if is_test:
        textBOW = cv.transform(documents.array)
    else:
        textBOW = cv.fit_transform(documents.array)

    return [textBOW, df['label'].array]

def getTheTruth(title):
    """Classify one piece of text as "Real" or "Fake".

    The text goes through the same tokenize -> tag -> lemmatize -> join steps
    as the training data, is vectorized with the module-level ``cv`` and is
    scored by the module-level classifier ``pac``; both must already be
    fitted when this function is called.
    """
    tagged = tokenTagingPreprocess(word_tokenize(title))
    document = makeSTR(lemmaPreprocess(tagged))
    features = cv.transform([document])
    prediction = pac.predict(features)
    # Label 1 is the "not FAKE" class produced by labelPreprocess.
    return "Real" if prediction == 1 else "Fake"

Preprocess Model Training Data

In [None]:
# is_test=False: this call fits the shared TF-IDF vectorizer on the training text.
trainText, trainLabel = TrainReadyData = Prerocess(X_train, Y_train, False)

Preprocess Model Testing Data

In [None]:
# is_test=True: reuse the vectorizer fitted on the training data (transform only).
testText, testLabel = TestReadyData = Prerocess(X_test, Y_test, True)

Model Training & Testing


In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
# A fixed random_state makes the classifier's data shuffling, and therefore
# the reported metrics, reproducible across runs.
pac = PassiveAggressiveClassifier(max_iter=100, random_state=42)
pac.fit(trainText, trainLabel)

# Predict on the held-out test set
y_pred = pac.predict(testText)

Data Visualization

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics

# Confusion matrix for the current predictions (label 0 = Fake, 1 = Real).
confusion_matrix = metrics.confusion_matrix(testLabel, y_pred)

cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=['Fake', 'Real'],
)
cm_display.plot(cmap='Purples')
plt.show()

# Summary scores for the same predictions.
print(
    "Accuracy:\t", metrics.accuracy_score(testLabel, y_pred),
    "\nPrecision:\t", metrics.precision_score(testLabel, y_pred),
    "\nrecall:\t\t", metrics.recall_score(testLabel, y_pred),
    "\nF1_score:\t", metrics.f1_score(testLabel, y_pred),
)

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM trained on the same TF-IDF features, for comparison.
svm_model = SVC(kernel='linear').fit(trainText, trainLabel)

# Predict on the held-out test set (overwrites the earlier y_pred)
y_pred = svm_model.predict(testText)

# Confusion matrix for the SVM predictions (label 0 = Fake, 1 = Real).
confusion_matrix = metrics.confusion_matrix(testLabel, y_pred)

cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=['Fake', 'Real'],
)
cm_display.plot(cmap='Purples')
plt.show()

# Summary scores for the SVM predictions.
print(
    "Accuracy:\t", metrics.accuracy_score(testLabel, y_pred),
    "\nPrecision:\t", metrics.precision_score(testLabel, y_pred),
    "\nrecall:\t\t", metrics.recall_score(testLabel, y_pred),
    "\nF1_score:\t", metrics.f1_score(testLabel, y_pred),
)

Predict data

In [None]:
# Sample text to classify; the literal is split across lines for readability
# only, and the resulting string is unchanged.
theySaid = (
    '"Washington (CNN) The faction of the GOP that is unhappy with Donald Trump '
    "as the party's presumptive nominee has one last plan to stop the mogul: "
    'staging an all-out delegate revolt at the Republican National Convention.'
)
print("It is: ", getTheTruth(theySaid))