In [32]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split

df = pd.read_csv("news.csv")
X_train, X_test, Y_train, Y_test = train_test_split(df[['title', 'text']], df['label'], test_size=0.2, random_state=42)
X_train

Unnamed: 0,title,text
1142,Alabama Sen. Sessions Backs Trump’s Immigratio...,Donald Trump received a key endorsement for hi...
2654,"As of 6:00 AM NOVEMBER 6th, Trump is leading i...","Nina November 6, 2016 @ 2:39 pm \nPolish gover..."
5395,Time: Investigating Hillary is an Attack on Al...,Time: Investigating Hillary is an Attack on Al...
1170,Women Should Vote With Their Husbands,"Taki's Magazine October 28, 2016 \nThis electi..."
4371,Pakistan police detain dozens of Imran Khan's ...,Pakistan Pakistan's cricketer turned politicia...
...,...,...
3772,The inane spectacle of the GOP debate: Cruz th...,What happened was less a debate among contende...
5191,"Clinton, FBIGate and the true depth of the Oba...","Clinton, FBIGate and the true depth of the Oba..."
5226,"Fearing Election Day Trouble, Some US Schools ...","Fearing Election Day Trouble, Some US Schools ..."
5390,Obama gets away with some whoppers on guns at ...,President Obama’s appearance at a town hall me...


In [33]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

eng_stopwords = set(stopwords.words("english"))
addToSW = set(['\'',':',';','\''])
eng_stopwords.union(addToSW)
lemma = WordNetLemmatizer()
cv= TfidfVectorizer()

In [34]:
tag_map = {
        'CC':None, # coordin. conjunction (and, but, or)  
        'CD':wn.NOUN, # cardinal number (one, two)             
        'DT':None, # determiner (a, the)                    
        'EX':wn.ADV, # existential ‘there’ (there)           
        'FW':None, # foreign word (mea culpa)             
        'IN':wn.ADV, # preposition/sub-conj (of, in, by)   
        'JJ':wn.ADJ, # adjective (yellow)                  
        'JJR':wn.ADJ,  # adj., comparative (bigger)          
        'JJS':wn.ADJ,  # adj., superlative (wildest)           
        'LS':None, # list item marker (1, 2, One)          
        'MD':None, # modal (can, should)                    
        'NN':wn.NOUN, # noun, sing. or mass (llama)          
        'NNS':wn.NOUN, # noun, plural (llamas)                  
        'NNP':wn.NOUN, # proper noun, sing. (IBM)              
        'NNPS':wn.NOUN, # proper noun, plural (Carolinas)
        'PDT':wn.ADJ, # predeterminer (all, both)            
        'POS':None, # possessive ending (’s )               
        'PRP':None, # personal pronoun (I, you, he)     
        'PRP$':None, # possessive pronoun (your, one’s)    
        'RB':wn.ADV, # adverb (quickly, never)            
        'RBR':wn.ADV, # adverb, comparative (faster)        
        'RBS':wn.ADV, # adverb, superlative (fastest)     
        'RP':[wn.ADJ, wn.ADJ_SAT], # particle (up, off)
        'SYM':None, # symbol (+,%, &)
        'TO':None, # “to” (to)
        'UH':None, # interjection (ah, oops)
        'VB':wn.VERB, # verb base form (eat)
        'VBD':wn.VERB, # verb past tense (ate)
        'VBG':wn.VERB, # verb gerund (eating)
        'VBN':wn.VERB, # verb past participle (eaten)
        'VBP':wn.VERB, # verb non-3sg pres (eat)
        'VBZ':wn.VERB, # verb 3sg pres (eats)
        'WDT':None, # wh-determiner (which, that)
        'WP':None, # wh-pronoun (what, who)
        'WP$':None, # possessive (wh- whose)
        'WRB':None, # wh-adverb (how, where)
        '$':None, #  dollar sign ($)
        '#':None, # pound sign (#)
        '“':None, # left quote (‘ or “)
        '”':None, # right quote (’ or ”)
        '(':None, # left parenthesis ([, (, {, <)
        ')':None, # right parenthesis (], ), }, >)
        ',':None, # comma (,)
        '.':None, # sentence-final punc (. ! ?)
        ':':None # mid-sentence punc (: ; ... – -)
    }

In [35]:
###################### Helper Functions ######################

def labelPreprocess(txt):
    if txt == "FAKE":
        return 0;
    return 1

def tokenTagingPreprocess(row):
    rtRow = []
    for token in row:
        if token.casefold() not in eng_stopwords:
            rtRow.append(token.casefold())
    return pos_tag(rtRow);

def lemmaPreprocess(row):
    rtRow = []
    for tag in row:
        try:
            rtRow.append(lemma.lemmatize(tag[0], pos=tag_map[tag[1]]));
        except:
            "err";
    return rtRow;

def fittingDT(row):
    Rstr = ' '.join(row)
    cv.fit_transform([Rstr])
    
def makeSTR(row):
    return ' '.join(row)

In [41]:
def Prerocess(X, Y, is_test):
    # container Dataframe
    df = pd.DataFrame(columns=['title', 'text', 'label'])
    
    # y-test
    df['label'] = Y.apply(labelPreprocess)
    
    # x-test
    # 1- Tokenization:
    df['title'] = [word_tokenize(row) for row in X['title']]
    df['text'] = [word_tokenize(row) for row in X['text']]
    
    # 2- Get Pos-Tags:
    df['title'] = df['title'].apply(tokenTagingPreprocess)
    df['text'] = df['text'].apply(tokenTagingPreprocess)
    
    # 3- Lemmatiziation:
    df['titleWork'] = df['title'].apply(lemmaPreprocess)
    df['textWork'] = df['text'].apply(lemmaPreprocess)
    
    # 4- Remove Empty-Rows (that only contains Stop Words):
    df.drop(axis=1,columns=['title','text'],inplace=True)
    df = df.loc(len(df['titleWork']) > 0 & len(df['textWork']) > 0).obj
    
    # 5- Rows Data type array -> string
    df['title'] = df['titleWork'].apply(makeSTR)
    df['text'] = df['textWork'].apply(makeSTR)
    
    # 6- Generate TF-IDF
    if is_test == False:
        textBOW = cv.fit_transform(df['text'].array)
        titleBOW = cv.fit_transform(df['title'].array)
    else:
        textBOW = cv.transform(df['text'].array)
        titleBOW = cv.transform(df['title'].array)
    
    return [titleBOW, textBOW, df['label'].array];

In [37]:
TrainReadyData = Prerocess(X_train, Y_train, False)
trainTitle, trainText, trainLabel = TrainReadyData

In [42]:
TestReadyData = Prerocess(X_test, Y_test, True)
testTitle, testText, testLabel = TestReadyData

In [43]:
# Modeling with title
from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(max_iter=200)
clf.fit(trainText, trainLabel)

In [45]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(testText.)
acc = accuracy_score(testLabel, y_pred)
print("Accuracy:", acc)

AttributeError: 'TfidfVectorizer' object has no attribute 'reshape'

In [None]:
def getTheTruth(title):
    tokens = word_tokenize(title)
    tags = tokenTagingPreprocess(tokens)
    lemmas = lemmaPreprocess(tags)
    corpus = [makeSTR(lemmas)]
    TfId = cv.transform(corpus)
    if clf.predict(TfId) == 1:
        return "Real"
    return "Fake"

In [None]:
print("It is: ", getTheTruth('8476,You Can Smell Hell'))