In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
vader = SentimentIntensityAnalyzer()
from nltk.tokenize import word_tokenize
import os

[nltk_data] Downloading package vader_lexicon to C:\Users\Brigitta
[nltk_data]     Bartsch\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Folder that holds the raw Fake.csv / True.csv files. Set the
# FAKE_NEWS_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset, the original local folder is used.
DATA_DIR = os.environ.get(
    "FAKE_NEWS_DATA_DIR",
    "C:\\Users\\Brigitta Bartsch\\code\\PROJECT_Fake_News_Detection\\fake_news_buster\\fake_news_buster\\data",
)
df_fake = pd.read_csv(os.path.join(DATA_DIR, "Fake.csv"))
df_true = pd.read_csv(os.path.join(DATA_DIR, "True.csv"))

In [3]:
# Fail fast if either raw frame lacks a column that the later cells read
# ('date' in the cleaning step, 'title' and 'text' in the feature steps).
REQUIRED_COLUMNS = {"title", "text", "date"}
for frame_name, frame in (("df_fake", df_fake), ("df_true", df_true)):
    missing_columns = REQUIRED_COLUMNS - set(frame.columns)
    assert not missing_columns, f"{frame_name} is missing columns: {sorted(missing_columns)}"

## data cleaning

In [4]:
def import_merge_df(df_fake,df_true):
    '''Label the fake and true news frames and stack them into one DataFrame.

    Despite the name, nothing is read from disk: both arguments are
    already-loaded DataFrames.

    Parameters
    ----------
    df_fake : pandas.DataFrame
        Fake-news articles. Labelled with "true/fake" = 1 and
        "true/fake_description" = "fake".
    df_true : pandas.DataFrame
        True-news articles. Labelled with "true/fake" = 0 and
        "true/fake_description" = "true".

    Returns
    -------
    pandas.DataFrame
        Fake rows followed by true rows, with a fresh 0..n-1 index.
        The input frames are left unmodified.
    '''
    # assign() returns new frames, so the label columns are not written into
    # the caller's DataFrames (re-running the cell stays side-effect free).
    data_fake = df_fake.assign(**{"true/fake": 1, "true/fake_description": "fake"})
    data_true = df_true.assign(**{"true/fake": 0, "true/fake_description": "true"})
    return pd.concat([data_fake, data_true]).reset_index(drop=True)

In [5]:
def try_parsing_date(text):
    '''Parse a date string in one of the known formats.

    Supported layouts (surrounding whitespace is ignored):
    '%d-%b-%y' (e.g. "19-Feb-18"), '%B %d, %Y' (e.g. "December 31, 2017")
    and '%b %d, %Y' (e.g. "Dec 31, 2017").

    Parameters
    ----------
    text : str
        Raw date value. Non-string values (e.g. NaN or None for missing
        dates) are accepted and treated as unparseable.

    Returns
    -------
    datetime.datetime or float
        The parsed datetime, or ``np.nan`` when no format matches.
    '''
    # Missing values are not strings; strptime would raise TypeError on them.
    if not isinstance(text, str):
        return np.nan
    # Stripping up front replaces the separate trailing-space format variants.
    cleaned = text.strip()
    for fmt in ('%d-%b-%y', '%B %d, %Y', '%b %d, %Y'):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    return np.nan

In [6]:
def Data_Cleaning(df_1_path_fake, df_2_path_true):
    '''Merge the two news frames, drop rows with a URL in "date", parse dates.

    Parameters
    ----------
    df_1_path_fake : pandas.DataFrame
        Fake-news articles. Despite the parameter name this is a DataFrame,
        not a path; it is passed straight to ``import_merge_df``.
    df_2_path_true : pandas.DataFrame
        True-news articles (also a DataFrame, not a path).

    Returns
    -------
    pandas.DataFrame
        The merged frame without the rows whose "date" value contains
        "https", and with "date" converted by ``try_parsing_date``.
        Surviving rows keep their index labels from the merged frame, so the
        index may have gaps.
    '''
    # Call merge/import function
    merged = import_merge_df(df_1_path_fake, df_2_path_true)
    # Filter out wrong "https"-values. na=False keeps the mask strictly
    # boolean, so rows with a missing date are kept instead of breaking the
    # filter.
    has_url_in_date = merged["date"].str.contains("https", regex=False, na=False)
    data = merged.loc[~has_url_in_date].copy()
    # Convert date to datetime objects; missing dates are passed through as-is.
    data["date"] = data["date"].map(try_parsing_date, na_action="ignore")
    return data

## sentiment analysis

In [7]:
def get_polarity(x):
    '''Return the first element of the TextBlob sentiment of ``x``.

    ``x`` is the text to analyse. TextBlob documents the first sentiment
    field as the polarity score.
    '''
    return TextBlob(x).sentiment[0]

In [8]:
def get_subjectivity(x):
    '''Return the second element of the TextBlob sentiment of ``x``.

    ``x`` is the text to analyse. TextBlob documents the second sentiment
    field as the subjectivity score.
    '''
    return TextBlob(x).sentiment[1]

In [9]:
def feature_polarity_subjectivity(df, column):
    '''Add TextBlob polarity and subjectivity columns for a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    column : str
        Name of the column holding the text to score.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with two new columns,
        ``{column}_TextBlob_polarity_score`` and
        ``{column}_TextBlob_subjectivity_score``.
    '''
    # Analyse every text once and reuse the result for both columns, instead
    # of building a second TextBlob per row for the subjectivity score.
    sentiments = [TextBlob(text).sentiment for text in df[column]]
    df[f'{column}_TextBlob_polarity_score'] = [sentiment[0] for sentiment in sentiments]
    df[f'{column}_TextBlob_subjectivity_score'] = [sentiment[1] for sentiment in sentiments]
    return df

### VADER sentiment scores

In [10]:
def feature_vader_polarity_scores(df, column):
    '''Add the four VADER polarity scores of a text column as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    column : str
        Name of the column holding the text to score.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns
        ``{column}_Vader_negative_score``, ``{column}_Vader_neutral_score``,
        ``{column}_Vader_positive_score`` and
        ``{column}_Vader_compound_score`` appended in that order.
    '''
    # Score every text once and reuse the result dict for all four columns,
    # instead of re-running the analyzer once per score.
    scores = [vader.polarity_scores(text) for text in df[column]]
    score_labels = (
        ("neg", "negative"),
        ("neu", "neutral"),
        ("pos", "positive"),
        ("compound", "compound"),
    )
    for score_key, label in score_labels:
        df[f'{column}_Vader_{label}_score'] = [row_scores[score_key] for row_scores in scores]
    return df

## length of articles

In [11]:
def no_chracters(text):
    '''Return the number of characters in ``text``.

    An empty string yields 0.
    '''
    # len() already gives the answer; no loop over the characters is needed.
    return len(text)

In [12]:
def no_characters_df(df, column):
    '''Append a ``{column}_no_characters`` column with each text's length.

    The length is computed per row with ``no_chracters``. ``df`` is modified
    in place and returned.
    '''
    lengths = df[column].apply(no_chracters)
    df[f'{column}_no_characters'] = lengths
    return df

## Quotation-mark ratio, upper-case letter ratio, digit ratio

In [13]:
def character_ratiorizer(text):
    '''Return the share of characters in ``text`` that are double quotes.

    Only the straight double quote (") is counted.
    NOTE(review): typographic quotes are not counted - confirm this is
    intended.

    Returns 0 for empty text instead of raising ZeroDivisionError.
    '''
    if not text:
        return 0
    return text.count('"') / len(text)

In [14]:
def is_upperizer(text):
    '''Return the share of characters in ``text`` that are upper case.

    The text is iterated character by character, so every character
    (including spaces and punctuation) counts towards the denominator.
    Returns 0 for empty text instead of raising ZeroDivisionError.
    '''
    if not text:
        return 0
    upper_no = sum(1 for character in text if character.isupper())
    return upper_no / len(text)

In [15]:
def is_digiter(text):
    '''Return the share of characters in ``text`` that are digits.

    The text is iterated character by character, so every character
    (including spaces and punctuation) counts towards the denominator.
    Returns 0 for empty text instead of raising ZeroDivisionError.
    '''
    if not text:
        return 0
    digit_no = sum(1 for character in text if character.isdigit())
    return digit_no / len(text)

## Vocabulary richness

In [16]:
def vocab_richnesser(text):
    '''Return the ratio of distinct tokens to total tokens in ``text``.

    Tokens come from ``word_tokenize`` and are compared exactly as returned
    (no case folding). Text that yields no tokens gets a ratio of 0.
    '''
    words = word_tokenize(text)
    # Guard clause: nothing to divide by when the text has no tokens.
    if len(words) == 0:
        return 0
    return len(set(words)) / len(words)

## typos

In [17]:
# Placeholder cell: no typo-related feature is implemented in this notebook.
pass

## final call of functions 

In [18]:
'''data cleaning'''
# Merge the two raw frames, drop the rows whose date holds a URL, and parse
# the remaining dates. Both arguments are DataFrames despite the "path" in
# the parameter names.
df = Data_Cleaning(df_1_path_fake=df_fake, df_2_path_true=df_true)

In [19]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,title,text,subject,date,true/fake,true/fake_description
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,1,fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,1,fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,1,fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,1,fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,1,fake
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,2017-12-25,1,fake
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,2017-12-23,1,fake
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,2017-12-23,1,fake
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,2017-12-22,1,fake
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,2017-12-21,1,fake


In [20]:
'''sentiment analysis'''
# Score the article body first and the headline second, so the new columns
# are appended as text_* (TextBlob, then VADER) followed by title_*.
for sentiment_column in ('text', 'title'):
    df = feature_polarity_subjectivity(df, sentiment_column)
    df = feature_vader_polarity_scores(df, sentiment_column)

In [21]:
'''lenght of articles'''
# Character counts of the article body and of the headline, in that order.
for length_column in ('text', 'title'):
    df = no_characters_df(df, length_column)

In [22]:
'''punctuation ratio, Upper case letter ratio, numbers ratio'''
# All three ratios are computed on the headline only.
# NOTE(review): the column names carry no "title_" prefix, unlike the other
# feature columns - confirm before renaming, since the saved CSV uses them.
title_ratio_features = {
    'character_ratio': character_ratiorizer,
    'upper_case_ratio': is_upperizer,
    'numbers_ratio': is_digiter,
}
for feature_name, ratio_function in title_ratio_features.items():
    df[feature_name] = df['title'].apply(ratio_function)

In [23]:
'''richness of vocab'''
# Distinct-token ratio of the article body and of the headline, in that order.
for richness_column in ('text', 'title'):
    df[f'vocab_richness_{richness_column}'] = df[richness_column].apply(vocab_richnesser)

In [25]:
# Preview the first two rows, now including the engineered feature columns.
df.head(2)

Unnamed: 0,title,text,subject,date,true/fake,true/fake_description,text_TextBlob_polarity_score,text_TextBlob_subjectivity_score,text_Vader_negative_score,text_Vader_neutral_score,...,title_Vader_neutral_score,title_Vader_positive_score,title_Vader_compound_score,text_no_characters,title_no_characters,character_ratio,upper_case_ratio,numbers_ratio,vocab_richness_text,vocab_richness_title
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,1,fake,0.082132,0.599895,0.143,0.705,...,0.629,0.0,-0.7096,2893,79,0.0,0.139241,0.0,0.435726,1.0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,1,fake,-0.005004,0.334098,0.089,0.834,...,0.745,0.0,-0.34,1898,69,0.0,0.115942,0.0,0.595166,1.0


In [24]:
# Save the feature table as dataframe_compiled.csv. The target folder can be
# overridden with the FAKE_NEWS_DATA_DIR environment variable; when it is
# unset, the original local data folder is used.
# NOTE(review): the index is written as an unnamed first column; pass
# index=False if downstream readers do not rely on it - confirm first.
output_dir = os.environ.get(
    "FAKE_NEWS_DATA_DIR",
    "C:\\Users\\Brigitta Bartsch\\code\\PROJECT_Fake_News_Detection\\fake_news_buster\\fake_news_buster\\data",
)
df.to_csv(os.path.join(output_dir, "dataframe_compiled.csv"))