# Feature Extracting 

In [1]:
#importing libraries
import pandas as pd

In [2]:
#assigning data path
df = pd.read_csv('../2.data_preprocessing/pre.csv')

In [4]:
#checking end of dataframe
df.tail()

Unnamed: 0,News,Title,Label
246,As he traveled the country on his thank you to...,Donald Trumps Cabinet richest in US history hi...,Real
247,A federal judge in New York has unsealed the s...,Judge unseals warrant for search of Anthony We...,Real
248,On Sunday evening Donald Trump invited reporte...,Donald Trump invites press to offtherecord ses...,Real
249,WASHINGTON Allegations of retaliation against...,NSA watchdog put on leave in whistleblower case,Real
250,A group of millennial activists from across th...,Trump protesters plan to open movement house i...,Real


# Syntax Based

In [None]:
#char count
#word count
#title word count
#stop word count
#upper case word count

In [5]:
#char count
df['char_count'] = df['News'].str.len()

In [6]:
#word count
df['word_count'] = df['News'].str.split().str.len()

In [7]:
#title word count
df['title_word_count'] = df['Title'].str.split().str.len()

In [8]:
#stopword count
from nltk.corpus import stopwords    
stop_words = set(stopwords.words('english'))

df['stopword_count'] = df['News'].str.split().apply(lambda x: len(set(x) & stop_words))

In [9]:
#upper case word count
df['upper_case'] = df['News'].str.count(r'[A-Z]')

In [10]:
df.head()

Unnamed: 0,News,Title,Label,char_count,word_count,title_word_count,stopword_count,upper_case
0,The Internet is buzzing today after white supr...,Million Uncounted Sanders Ballots Found On Cl...,Fake,1169,223,9,38,37
1,TRUMP TOWER — With his poll numbers among blac...,African Billionaire Will Give Million To Anyo...,Fake,2291,376,18,57,146
2,Donald Trump is looking to assemble a strong t...,BREAKING Another Clinton Associate Set To Test...,Fake,1381,247,11,34,68
3,“SHE’S SEXY SMART SOPHISTICATED AND SHE’S INTO...,Breaking Fraudulent Clinton Votes Discovered B...,Fake,503,93,9,23,144
4,PHOENIX AZ AP — For months now rumors have cir...,BREAKING Official Set to Testify Against Hilla...,Fake,7381,1317,9,86,221


# Sentiment Based

In [11]:
#polarity 
#subjectivity

In [12]:
from textblob import TextBlob

def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity
  
def getPolarity(text):
    return TextBlob(text).sentiment.polarity
  
#Create two new columns ‘Subjectivity’ & ‘Polarity’
df['subjectivity'] = df['News'].apply(getSubjectivity)
df['polarity'] = df['News'].apply(getPolarity)


In [13]:
df.head()

Unnamed: 0,News,Title,Label,char_count,word_count,title_word_count,stopword_count,upper_case,subjectivity,polarity
0,The Internet is buzzing today after white supr...,Million Uncounted Sanders Ballots Found On Cl...,Fake,1169,223,9,38,37,0.2625,-0.025
1,TRUMP TOWER — With his poll numbers among blac...,African Billionaire Will Give Million To Anyo...,Fake,2291,376,18,57,146,0.462568,0.1108
2,Donald Trump is looking to assemble a strong t...,BREAKING Another Clinton Associate Set To Test...,Fake,1381,247,11,34,68,0.558531,0.111726
3,“SHE’S SEXY SMART SOPHISTICATED AND SHE’S INTO...,Breaking Fraudulent Clinton Votes Discovered B...,Fake,503,93,9,23,144,0.47619,0.283036
4,PHOENIX AZ AP — For months now rumors have cir...,BREAKING Official Set to Testify Against Hilla...,Fake,7381,1317,9,86,221,0.434617,0.12671


# Grammar Based

In [None]:
# count noun
# count verb
# count adjective
# count pronoun
# count adverb

In [14]:
#importing libraries
from nltk import word_tokenize, pos_tag, pos_tag_sents

In [15]:
#convering news column to list
texts = df['News'].tolist()

In [16]:
len(texts)

251

In [17]:
#applying pos tagger
tagged_texts = pos_tag_sents(map(word_tokenize, texts))

In [18]:
#saving in a column
df['POS'] = tagged_texts

In [19]:
def NounCounter(x):
    nouns = []
    for (word, pos) in x:
        if pos.startswith("NN"):
            nouns.append(word)
    return nouns

def CounjuctionCounter(x):
    counjuctions = []
    for (word, pos) in x:
        if pos.startswith("IN"):
            counjuctions.append(word)
    return counjuctions

def AdjectiveCounter(x):
    adjectives = []
    for (word, pos) in x:
        if pos.startswith("JJ"):
            adjectives.append(word)
    return adjectives

def ProNounCounter(x):
    pronouns = []
    for (word, pos) in x:
        if pos.startswith("PRP"):
            pronouns.append(word)
    return pronouns

def AdverbCounter(x):
    adverb = []
    for (word, pos) in x:
        if pos.startswith("RB"):
            adverb.append(word)
    return adverb

def PrepositionCounter(x):
    prep = []
    for (word, pos) in x:
        if pos.startswith("TO"):
            prep.append(word)
    return prep

def InterjectionCounter(x):
    inter = []
    for (word, pos) in x:
        if pos.startswith("UH"):
            inter.append(word)
    return inter

def VerbCounter(x):
    verbs = []
    for (word, pos) in x:
        if pos.startswith("VB"):
            verbs.append(word)
    return verbs

In [20]:
#finding nouns and noun count
df["nouns"] = df["POS"].apply(NounCounter)
df["noun_count"] = df["nouns"].str.len()

In [21]:
#finding conjunction and conjunction count
df["conjuctions"] = df["POS"].apply(CounjuctionCounter)
df["conjuction_count"] = df["conjuctions"].str.len()

In [22]:
#finding adjective and adjective count
df["adjectives"] = df["POS"].apply(AdjectiveCounter)
df["adjective_count"] = df["adjectives"].str.len()

In [23]:
#finding pronouns and pronoun count
df["pronouns"] = df["POS"].apply(ProNounCounter)
df["pronoun_count"] = df["pronouns"].str.len()

In [24]:
#finding adverbs and adverb count
df["adverbs"] = df["POS"].apply(AdverbCounter)
df["adverb_count"] = df["adverbs"].str.len()

In [25]:
#finding prepositions and preposition count
df["prepositions"] = df["POS"].apply(PrepositionCounter)
df["prepositions_count"] = df["prepositions"].str.len()

In [26]:
#finding interjections and interjection count
df["interjections"] = df["POS"].apply(InterjectionCounter)
df["interjection_count"] = df["interjections"].str.len()

In [27]:
#finding verbs and verbs count
df["verbs"] = df["POS"].apply(VerbCounter)
df["verb_count"] = df["verbs"].str.len()

In [28]:
df.head()

Unnamed: 0,News,Title,Label,char_count,word_count,title_word_count,stopword_count,upper_case,subjectivity,polarity,...,pronouns,pronoun_count,adverbs,adverb_count,prepositions,prepositions_count,interjections,interjection_count,verbs,verb_count
0,The Internet is buzzing today after white supr...,Million Uncounted Sanders Ballots Found On Cl...,Fake,1169,223,9,38,37,0.2625,-0.025,...,"[his, it, I, I, I, it, it, his, he, I, him, he...",32,"[maybe, back, not, not, So, now]",6,"[to, to, na, to, to, to, to, to, to]",9,[],0,"[is, buzzing, was, caught, snorting, brought, ...",54
1,TRUMP TOWER — With his poll numbers among blac...,African Billionaire Will Give Million To Anyo...,Fake,2291,376,18,57,146,0.462568,0.1108,...,"[his, his, She, she, you, her, She, we, her, W...",37,"[around, reportedly, ll, recently, now, more, ...",22,"[to, to, to, to, to, to, To, to, to]",9,[],0,"[hovering, tapped, connects, said, ’, be, see,...",91
2,Donald Trump is looking to assemble a strong t...,BREAKING Another Clinton Associate Set To Test...,Fake,1381,247,11,34,68,0.558531,0.111726,...,"[he, he, his, His, His, it, he, I, We, I, he, ...",32,"[earlier, not, thus, far, already]",5,"[to, to, to, to, to]",5,[],0,"[is, looking, assemble, enters, expected, cons...",62
3,“SHE’S SEXY SMART SOPHISTICATED AND SHE’S INTO...,Breaking Fraudulent Clinton Votes Discovered B...,Fake,503,93,9,23,144,0.47619,0.283036,...,"[his, her, you, it]",4,[not],1,[],0,[],0,"[ARE, choosing, running, did, vote, comes, was...",11
4,PHOENIX AZ AP — For months now rumors have cir...,BREAKING Official Set to Testify Against Hilla...,Fake,7381,1317,9,86,221,0.434617,0.12671,...,"[he, I, I, I, them, me, my, I, I, I, they, me,...",143,"[now, rally, back, forward, rally, “, mostly, ...",56,"[to, to, to, to, to, to, to, to, to, to, to, t...",35,[],0,"[have, circulated, were, being, paid, protest,...",299


In [29]:
df.columns

Index(['News', 'Title', 'Label', 'char_count', 'word_count',
       'title_word_count', 'stopword_count', 'upper_case', 'subjectivity',
       'polarity', 'POS', 'nouns', 'noun_count', 'conjuctions',
       'conjuction_count', 'adjectives', 'adjective_count', 'pronouns',
       'pronoun_count', 'adverbs', 'adverb_count', 'prepositions',
       'prepositions_count', 'interjections', 'interjection_count', 'verbs',
       'verb_count'],
      dtype='object')

In [30]:
#saving dataframe to csv file
df.to_csv('feature_extractor.csv', encoding='utf-8', index=False)