In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
vader = SentimentIntensityAnalyzer()
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder
from spellchecker import SpellChecker
import string
from sklearn.preprocessing import MinMaxScaler

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/isa/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
df_fake = pd.read_csv("../raw_data/Fake.csv")
df_true = pd.read_csv("../raw_data/True.csv")

In [3]:
df_fake = df_fake.head(20)
df_true = df_true.head(20)

## data cleaning

In [4]:
def import_merge_df(df_fake,df_true):
    '''Import DataFrames and merge them, adding true/false encodings'''
    data_fake = df_fake
    data_true = df_true
    # data_fake = pd.read_csv(df_1)
    # data_true = pd.read_csv(df_2)
    data_fake["true/false"] = 1
    data_fake["true/false_description"] = "fake"
    data_true["true/false"] = 0
    data_true["true/false_description"] = "true"
    data_concat = pd.concat([data_fake, data_true])
    data_concat_reset_index = data_concat.reset_index(drop=True)
    return data_concat_reset_index

In [5]:
def try_parsing_date(text):
    for fmt in ('%d-%b-%y', '%B %d, %Y', '%b %d, %Y','%b %d, %Y ','%B %d, %Y '):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            pass
    return np.nan

In [6]:
def Data_Cleaning(df_1_path_fake, df_2_path_true):
    '''Delete useless rows (with https..in every column)
        and adjust datetime object'''
    #Call merge/import function
    data_concat_reset_index = import_merge_df(df_1_path_fake,df_2_path_true)
    #Filter out wrong "https"-values
    list_indexes_to_drop = data_concat_reset_index.query('date.str.contains("https")').index
    data = data_concat_reset_index.drop(data_concat_reset_index.index[list_indexes_to_drop])
    #Convert date to datetimeobjects
    data["date"] = data["date"].map(try_parsing_date)
    return data

## Weekday

In [7]:
def weekday(day):
    weekday = day.dt.day_name()
    return weekday

In [8]:
def ohe_weekday(column):
    ohe = OneHotEncoder(sparse = False)
    ohe.fit(df[['weekday']])
    hair_length_oh = ohe.transform(df[['weekday']])
    df["day_friday"],df["day_monday"],df['day_saturday'],df['day_sunday'],df['day_thursday'],df['day_tuesday'],df['day_wednesday'] = hair_length_oh.T

## sentiment analysis

In [9]:
def get_polarity(x):
    x = TextBlob(x)
    return x.sentiment[0]

In [10]:
def get_subjectivity(x):
    x = TextBlob(x)
    return x.sentiment[1]

In [11]:
def feature_polarity_subjectivity(df, column):
    new_column_name_polarity = f'{column}_TextBlob_polarity_score'
    df[new_column_name_polarity] = df[column].apply(get_polarity)
    new_column_name_subjectivity = f'{column}_TextBlob_subjectivity_score'
    df[new_column_name_subjectivity] = df[column].apply(get_subjectivity)
    return df 

### darth vader score

In [12]:
def feature_vader_polarity_scores(df, column):
    new_column_name = f'{column}_Vader_negative_score'
    df[new_column_name] = df[column].apply(lambda x: vader.polarity_scores(x)["neg"])
    new_column_name = f'{column}_Vader_neutral_score'
    df[new_column_name] = df[column].apply(lambda x: vader.polarity_scores(x)["neu"])
    new_column_name = f'{column}_Vader_positive_score'
    df[new_column_name] = df[column].apply(lambda x: vader.polarity_scores(x)["pos"])
    new_column_name = f'{column}_Vader_compound_score'
    df[new_column_name] = df[column].apply(lambda x: vader.polarity_scores(x)["compound"])
    return df

## length of articles

In [13]:
def no_chracters(text):
    for i in text:
          fake_charac = len(text)
    return fake_charac

In [14]:
def no_characters_df(df, column):
    new_column_name = f'{column}_no_characters'
    df[new_column_name] = df[column].apply(no_chracters)
    return df

## punctuation ratio, Upper case letters ratio, numbers ratio

In [15]:
def character_ratiorizer(text):
    quotes = ['\"', '\"']
    quote_no = 0
    for symbol in text:
        if symbol in quotes:
            quote_no += 1
    return quote_no/len(text)

In [16]:
def is_upperizer(text):
    upper_no = 0
    for word in text:
        if word.isupper():
            upper_no += 1
    return upper_no/len(text)

In [17]:
def is_digiter(text):
    digit_no = 0
    for word in text:
        if word.isdigit():
            digit_no += 1
    return digit_no/len(text)

## richness

In [18]:
def vocab_richnesser(text):
    tokens = word_tokenize(text)
    total_length = len(tokens)
    unique_words = set(tokens)
    unique_word_length = len(unique_words)
    try:
        return unique_word_length/total_length
    except ZeroDivisionError:
        return 0

## typos count ratio

In [19]:
def preprocess_typos(text):
    text = text.replace(" t ", "'t ")
    text = text.replace(" t.", "'t.")
    text = text.replace(" t,", "'t,")
    text = text.replace(" t!", "'t!")
    text = text.replace(" t?", "'t?")
    text = text.replace(" s ", "'s ")
    text = text.replace(" s.", "'s.")
    text = text.replace(" s,", "'s,")
    text = text.replace(" s!", "'s!")
    text = text.replace(" s?", "'s?")
    text.split()
    for x in string.punctuation.replace("'", ""):
        text = text.replace(x, '')
    text = ''.join(word for word in text if not word.isdigit())
    return text

In [20]:
def typo_ratiorizer(text):
    spell = SpellChecker()
    misspells = spell.unknown(text)
    return len(misspells)/len(text)

## scaler

In [34]:
def scaler(df): 
    scaler = MinMaxScaler()
    scaler.fit(df.drop(['title', 'text', 'date', 'true/false', 'true/false_description'], axis=1))
    scaled_vals = scaler.transform(df.drop(['title', 'text','date', 'true/false', 'true/false_description'], axis=1))
    df[['day_friday',
        'day_monday',
        'day_saturday',
        'day_sunday',
        'day_thursday',
        'day_tuesday',
        'day_wednesday',
        'text_TextBlob_polarity_score',
        'text_TextBlob_subjectivity_score',
        'text_Vader_negative_score',
        'text_Vader_neutral_score',
        'text_Vader_positive_score',
        'text_Vader_compound_score',
        'title_TextBlob_polarity_score',
        'title_TextBlob_subjectivity_score',
        'title_Vader_negative_score',
        'title_Vader_neutral_score',
        'title_Vader_positive_score',
        'title_Vader_compound_score',
        'text_no_characters',
        'title_no_characters',
        'character_ratio',
        'upper_case_ratio',
        'numbers_ratio',
        'vocab_richness_text',
        'vocab_richness_title',
        'typo_ratio_text',
        'typo_ratio_title']] = scaled_vals


## final call of functions 

In [22]:
'''data cleaning'''
df = Data_Cleaning(df_fake, df_true)

In [23]:
'''weekday'''
df['weekday'] = df[['date']].apply(weekday)
ohe_weekday(df['weekday'])

In [24]:
'''sentiment analysis'''
df = feature_polarity_subjectivity(df, 'text')
df = feature_vader_polarity_scores(df, 'text')

df = feature_polarity_subjectivity(df, 'title')
df = feature_vader_polarity_scores(df, 'title')

In [25]:
'''lenght of articles'''
df = no_characters_df(df, 'text')

df = no_characters_df(df, 'title')

In [26]:
'''punctuation ratio, Upper case letter ratio, numbers ratio'''
df['character_ratio'] = df['title'].apply(character_ratiorizer)
df['upper_case_ratio'] = df['title'].apply(is_upperizer)
df['numbers_ratio'] = df['title'].apply(is_digiter)

In [27]:
'''richness of vocab'''
df['vocab_richness_text'] = df['text'].apply(vocab_richnesser)

df['vocab_richness_title'] = df['title'].apply(vocab_richnesser)

In [28]:
'''typos count'''
df['preprocess_typo_text'] = df['text'].apply(preprocess_typos)
df['preprocess_typo_title'] = df['title'].apply(preprocess_typos)

df['typo_ratio_text'] = df['text'].apply(typo_ratiorizer)
df['typo_ratio_title'] = df['title'].apply(typo_ratiorizer)

In [29]:
'''drop colums'''
df = df.drop(columns=['weekday', 'preprocess_typo_text', 'preprocess_typo_title', 'subject'])

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 0 to 39
Data columns (total 33 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   title                              40 non-null     object        
 1   text                               40 non-null     object        
 2   date                               40 non-null     datetime64[ns]
 3   true/false                         40 non-null     int64         
 4   true/false_description             40 non-null     object        
 5   day_friday                         40 non-null     float64       
 6   day_monday                         40 non-null     float64       
 7   day_saturday                       40 non-null     float64       
 8   day_sunday                         40 non-null     float64       
 9   day_thursday                       40 non-null     float64       
 10  day_tuesday                        40 no

In [31]:
'''scale'''
scaler(df)

In [None]:
df.info()

In [35]:
df.head()

Unnamed: 0,title,text,date,true/false,true/false_description,day_friday,day_monday,day_saturday,day_sunday,day_thursday,...,title_Vader_compound_score,text_no_characters,title_no_characters,character_ratio,upper_case_ratio,numbers_ratio,vocab_richness_text,vocab_richness_title,typo_ratio_text,typo_ratio_title
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,2017-12-31,1,fake,0.0,0.0,0.0,1.0,0.0,...,0.114975,0.51814,0.616667,0.0,0.451722,0.0,0.05179,1.0,0.014302,0.605696
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,2017-12-31,1,fake,0.0,0.0,0.0,1.0,0.0,...,0.367468,0.310676,0.45,0.0,0.368472,0.0,0.463838,1.0,0.05261,0.140097
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",2017-12-30,1,fake,0.0,0.0,1.0,0.0,0.0,...,0.397527,0.664929,0.8,0.0,0.549721,0.0,0.167259,1.0,0.0,0.950556
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",2017-12-29,1,fake,1.0,0.0,0.0,0.0,0.0,...,0.430045,0.493328,0.6,0.0,0.824581,0.0,0.225365,1.0,0.017437,0.619658
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,2017-12-25,1,fake,0.0,1.0,0.0,0.0,0.0,...,0.59974,0.404087,0.466667,0.0,0.51569,0.0,0.229181,1.0,0.03134,0.13119
