# Feature Extracting 

In [1]:
import pandas as pd

In [18]:
df = pd.read_csv('../2.data_preprocessing/pre.csv')

In [19]:
df.tail()

Unnamed: 0,News,Title,Label
246,As he traveled the country on his thank you to...,Donald Trumps Cabinet richest in US history hi...,Real
247,A federal judge in New York has unsealed the s...,Judge unseals warrant for search of Anthony We...,Real
248,On Sunday evening Donald Trump invited reporte...,Donald Trump invites press to offtherecord ses...,Real
249,WASHINGTON Allegations of retaliation against...,NSA watchdog put on leave in whistleblower case,Real
250,A group of millennial activists from across th...,Trump protesters plan to open movement house i...,Real


# Syntax Based

In [5]:
#char count
#word count
#title word count
#stop word count
#upper case word count

In [8]:
#char count
df['char_count'] = df['News'].str.len()

In [9]:
#word count
df['word_count'] = df['News'].str.split().str.len()

In [10]:
#title word count
df['title_word_count'] = df['Title'].str.split().str.len()

In [11]:
#stopword count
from nltk.corpus import stopwords    
stop_words = set(stopwords.words('english'))

df['stopword_count'] = df['News'].str.split().apply(lambda x: len(set(x) & stop_words))

In [12]:
#upper case word count
df['upper_case'] = df['News'].str.count(r'[A-Z]')

In [13]:
df.head()

Unnamed: 0,News,Title,Label,char_count,word_count,title_word_count,stopword_count,upper_case
0,The Internet is buzzing today after white supr...,Million Uncounted Sanders Ballots Found On Cl...,Fake,1169,223,9,38,37
1,TRUMP TOWER — With his poll numbers among blac...,African Billionaire Will Give Million To Anyo...,Fake,2291,376,18,57,146
2,Donald Trump is looking to assemble a strong t...,BREAKING Another Clinton Associate Set To Test...,Fake,1381,247,11,34,68
3,“SHE’S SEXY SMART SOPHISTICATED AND SHE’S INTO...,Breaking Fraudulent Clinton Votes Discovered B...,Fake,503,93,9,23,144
4,PHOENIX AZ AP — For months now rumors have cir...,BREAKING Official Set to Testify Against Hilla...,Fake,7381,1317,9,86,221


# Sentiment Based

In [20]:
#polarity 
#subjectivity

In [21]:
from textblob import TextBlob

def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity
  
def getPolarity(text):
    return TextBlob(text).sentiment.polarity
  
#Create two new columns ‘Subjectivity’ & ‘Polarity’
df['subjectivity_title'] = df['Title'].apply(getSubjectivity)
df['polarity_title'] = df['Title'].apply(getPolarity)


In [22]:
df.head()

Unnamed: 0,News,Title,Label,subjectivity_title,polarity_title
0,The Internet is buzzing today after white supr...,Million Uncounted Sanders Ballots Found On Cl...,Fake,0.0,0.0
1,TRUMP TOWER — With his poll numbers among blac...,African Billionaire Will Give Million To Anyo...,Fake,0.05,0.1
2,Donald Trump is looking to assemble a strong t...,BREAKING Another Clinton Associate Set To Test...,Fake,0.4,-0.2
3,“SHE’S SEXY SMART SOPHISTICATED AND SHE’S INTO...,Breaking Fraudulent Clinton Votes Discovered B...,Fake,0.0,0.0
4,PHOENIX AZ AP — For months now rumors have cir...,BREAKING Official Set to Testify Against Hilla...,Fake,0.4,-0.2


# Grammar Based

In [23]:
# count noun
# count verb
# count adjective
# count pronoun
# count adverb

In [24]:
from nltk import word_tokenize, pos_tag, pos_tag_sents

In [25]:
texts = df['Title'].tolist()

In [26]:
len(texts)

251

In [27]:
tagged_texts = pos_tag_sents(map(word_tokenize, texts))

In [28]:
df['POS'] = tagged_texts

In [29]:
def NounCounter(x):
    nouns = []
    for (word, pos) in x:
        if pos.startswith("NN"):
            nouns.append(word)
    return nouns

def CounjuctionCounter(x):
    counjuctions = []
    for (word, pos) in x:
        if pos.startswith("IN"):
            counjuctions.append(word)
    return counjuctions

def AdjectiveCounter(x):
    adjectives = []
    for (word, pos) in x:
        if pos.startswith("JJ"):
            adjectives.append(word)
    return adjectives

def ProNounCounter(x):
    pronouns = []
    for (word, pos) in x:
        if pos.startswith("PRP"):
            pronouns.append(word)
    return pronouns

def AdverbCounter(x):
    adverb = []
    for (word, pos) in x:
        if pos.startswith("RB"):
            adverb.append(word)
    return adverb

def PrepositionCounter(x):
    prep = []
    for (word, pos) in x:
        if pos.startswith("TO"):
            prep.append(word)
    return prep

def InterjectionCounter(x):
    inter = []
    for (word, pos) in x:
        if pos.startswith("UH"):
            inter.append(word)
    return inter

def VerbCounter(x):
    verbs = []
    for (word, pos) in x:
        if pos.startswith("VB"):
            verbs.append(word)
    return verbs

In [30]:
df["nouns_title"] = df["POS"].apply(NounCounter)
df["noun_count_title"] = df["nouns_title"].str.len()

In [31]:
df["conjuctions_title"] = df["POS"].apply(CounjuctionCounter)
df["conjuction_count_title"] = df["conjuctions_title"].str.len()

In [32]:
df["adjectives_title"] = df["POS"].apply(AdjectiveCounter)
df["adjective_count_title"] = df["adjectives_title"].str.len()

In [33]:
df["pronouns_title"] = df["POS"].apply(ProNounCounter)
df["pronoun_count_title"] = df["pronouns_title"].str.len()

In [34]:
df["adverbs_title"] = df["POS"].apply(AdverbCounter)
df["adverb_count_title"] = df["adverbs_title"].str.len()

In [35]:
df["prepositions_title"] = df["POS"].apply(PrepositionCounter)
df["prepositions_count_title"] = df["prepositions_title"].str.len()

In [36]:
df["interjections_title"] = df["POS"].apply(InterjectionCounter)
df["interjection_count_title"] = df["interjections_title"].str.len()

In [37]:
df["verbs_title"] = df["POS"].apply(VerbCounter)
df["verb_count_title"] = df["verbs_title"].str.len()

In [38]:
df.head()

Unnamed: 0,News,Title,Label,subjectivity_title,polarity_title,POS,nouns_title,noun_count_title,conjuctions_title,conjuction_count_title,...,pronouns_title,pronoun_count_title,adverbs_title,adverb_count_title,prepositions_title,prepositions_count_title,interjections_title,interjection_count_title,verbs_title,verb_count_title
0,The Internet is buzzing today after white supr...,Million Uncounted Sanders Ballots Found On Cl...,Fake,0.0,0.0,"[(Million, NNP), (Uncounted, VBD), (Sanders, N...","[Million, Sanders, Ballots, Found, Clintons, E...",7,[On],1,...,[],0,[],0,[],0,[],0,[Uncounted],1
1,TRUMP TOWER — With his poll numbers among blac...,African Billionaire Will Give Million To Anyo...,Fake,0.05,0.1,"[(African, JJ), (Billionaire, NNP), (Will, NNP...","[Billionaire, Will, Give, Million, Anyone, Ame...",9,[if],1,...,[],0,[],0,"[To, To]",2,[],0,"[Wants, Leave, is, Elected]",4
2,Donald Trump is looking to assemble a strong t...,BREAKING Another Clinton Associate Set To Test...,Fake,0.4,-0.2,"[(BREAKING, NN), (Another, DT), (Clinton, NNP)...","[BREAKING, Clinton, Associate, Set, Against, H...",8,[],0,...,[],0,[],0,[To],1,[],0,[Testify],1
3,“SHE’S SEXY SMART SOPHISTICATED AND SHE’S INTO...,Breaking Fraudulent Clinton Votes Discovered B...,Fake,0.0,0.0,"[(Breaking, VBG), (Fraudulent, NNP), (Clinton,...","[Fraudulent, Clinton, Votes, Discovered, Thous...",5,"[By, Of]",2,...,[],0,[],0,[],0,[],0,[Breaking],1
4,PHOENIX AZ AP — For months now rumors have cir...,BREAKING Official Set to Testify Against Hilla...,Fake,0.4,-0.2,"[(BREAKING, NNP), (Official, NNP), (Set, NNP),...","[BREAKING, Official, Set, Against, Hillary, Fo...",7,[],0,...,[],0,[],0,[to],1,[],0,[Testify],1


In [39]:
df.columns

Index(['News', 'Title', 'Label', 'subjectivity_title', 'polarity_title', 'POS',
       'nouns_title', 'noun_count_title', 'conjuctions_title',
       'conjuction_count_title', 'adjectives_title', 'adjective_count_title',
       'pronouns_title', 'pronoun_count_title', 'adverbs_title',
       'adverb_count_title', 'prepositions_title', 'prepositions_count_title',
       'interjections_title', 'interjection_count_title', 'verbs_title',
       'verb_count_title'],
      dtype='object')

In [40]:
df.to_csv('feature_extractor_title.csv', encoding='utf-8', index=False)