
# DATA CLEANING  - Merge - Datetime


- Merge Dataframes
- Reset Index
- Convert date column to datetime objects
- Drop specific words (like Trump)


**I. Import modules**


In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from datetime import datetime

**II. Open data sets, prepare true/false column and merge files**


In [2]:
# Load the two raw article tables (Fake.csv and True.csv).
# The paths are relative, so they resolve against the notebook's working directory.
data_fake = pd.read_csv("../fake_news_buster/data/Fake.csv")
data_true = pd.read_csv("../fake_news_buster/data/True.csv")

In [3]:
# Label every row of the fake-news table.
# Note the encoding: 1 in "true/false" marks a FAKE article (the true table gets 0),
# and "true/false_description" carries the matching readable label.
data_fake["true/false"] = 1
data_fake["true/false_description"] = "false"

In [4]:
# Preview the first rows to confirm the two label columns were added.
data_fake.head()

Unnamed: 0,title,text,subject,date,true/false,true/false_description
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,False
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,False
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,False
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,False
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,False


In [5]:
# Label every row of the true-news table.
# Note the encoding: 0 in "true/false" marks a TRUE article (the fake table gets 1).
data_true["true/false"] = 0
data_true["true/false_description"] = "true"

In [6]:
# Preview the first rows to confirm the two label columns were added.
data_true.head()

Unnamed: 0,title,text,subject,date,true/false,true/false_description
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0,True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0,True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0,True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0,True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0,True


**III. Merge true and false dataset and reset Index**



In [7]:
# Stack the fake and true tables; concat keeps each table's own row labels,
# so the labels repeat in the combined frame.
data_old = pd.concat([data_fake, data_true])

# Replace the repeated labels with a fresh 0..n-1 index (the old index is discarded).
data_2 = data_old.reset_index(drop=True)


**IV. Filter out wrong values in "date" column (containing "https..")**

In [8]:
# Show the rows whose "date" value contains a URL; in these rows the title, text and
# date fields all hold a web address instead of real content.
data_2.query('date.str.contains("https")')

Unnamed: 0,title,text,subject,date,true/false,true/false_description
9358,https://100percentfedup.com/served-roy-moore-v...,https://100percentfedup.com/served-roy-moore-v...,politics,https://100percentfedup.com/served-roy-moore-v...,1,False
15507,https://100percentfedup.com/video-hillary-aske...,https://100percentfedup.com/video-hillary-aske...,politics,https://100percentfedup.com/video-hillary-aske...,1,False
15508,https://100percentfedup.com/12-yr-old-black-co...,https://100percentfedup.com/12-yr-old-black-co...,politics,https://100percentfedup.com/12-yr-old-black-co...,1,False
15839,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,politics,https://fedup.wpengine.com/wp-content/uploads/...,1,False
15840,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,politics,https://fedup.wpengine.com/wp-content/uploads/...,1,False
17432,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,Government News,https://fedup.wpengine.com/wp-content/uploads/...,1,False
17433,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,Government News,https://fedup.wpengine.com/wp-content/uploads/...,1,False
21869,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,left-news,https://fedup.wpengine.com/wp-content/uploads/...,1,False
21870,https://fedup.wpengine.com/wp-content/uploads/...,https://fedup.wpengine.com/wp-content/uploads/...,left-news,https://fedup.wpengine.com/wp-content/uploads/...,1,False


In [9]:
# Index labels of the rows whose "date" value contains a URL.
data_2.query('date.str.contains("https")').index

Int64Index([9358, 15507, 15508, 15839, 15840, 17432, 17433, 21869, 21870], dtype='int64')

In [10]:
# Drop the rows that contain only web addresses instead of a real title, text and date.

# Index labels of the malformed rows (their "date" value contains a URL).
list_indexes_to_drop = data_2.query('date.str.contains("https")').index

# Drop by label. The previous form, data_2.index[labels], used the labels as *positions*,
# which only gave the right rows because the index had just been reset to 0..n-1.
data = data_2.drop(index=list_indexes_to_drop)

In [11]:
# Row/column counts before and after removing the malformed rows, one per line,
# followed by a look at the last rows of the cleaned frame.
print(data_2.shape, data.shape, sep="\n")
data.tail()

(44898, 6)
(44889, 6)


Unnamed: 0,title,text,subject,date,true/false,true/false_description
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0,True
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0,True
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0,True
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0,True
44897,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",0,True


**V. Parse through "date" column and convert to datetime object**

In [12]:
def try_parsing_date(text):
    """Parse a raw "date" value written in one of the known formats.

    Supported layouts are ``19-Feb-18``, ``December 31, 2017`` and
    ``Dec 31, 2017``. Leading and trailing whitespace is ignored.

    Parameters
    ----------
    text : str or datetime
        Raw value to convert. A value that is already a ``datetime`` is
        returned unchanged, so the conversion can be applied twice without
        failing.

    Returns
    -------
    datetime or float
        The parsed ``datetime``, or ``np.nan`` when the value is not a string
        or matches none of the supported formats.
    """
    if isinstance(text, datetime):
        # Already converted (e.g. the cell was re-run): keep the value as is.
        return text
    if not isinstance(text, str):
        # strptime raises TypeError (not ValueError) for non-strings such as a
        # float NaN, so treat them as unparseable up front.
        return np.nan

    # Stripping replaces the former duplicate formats that differed only by a
    # trailing space.
    cleaned = text.strip()
    for fmt in ('%d-%b-%y', '%B %d, %Y', '%b %d, %Y'):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue

    return np.nan

In [13]:
# Convert every "date" value with try_parsing_date; values that match none of its
# formats become NaN.
data["date"] = data["date"].map(try_parsing_date)

In [14]:
# Count missing values per column. The one missing "date" is presumably a value that
# matched none of the parser's formats - TODO confirm which row it is.
data.isnull().sum()

title                     0
text                      0
subject                   0
date                      1
true/false                0
true/false_description    0
dtype: int64

In [15]:
# Row/column count after the date conversion.
data.shape

(44889, 6)

**VI. Final Dataframe: 1) merged 2) deleted 9 rows with invalid values in the date column 3) converted dates to datetime objects**

*Final Dataframe = data*

In [16]:
#data[~data["date"].str.startswith(("Feb", "Dec", "Jan", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "19", "18", "17", "16", "15"))]

**VII. Drop Words function**


- Helper function: loops through a word list and removes each word from a string; intended to be used with `apply`
- Main function: calls the helper function through a lambda, which passes each cell value as `x` together with the word list (because `apply` only passes a single argument, `x`, to the function)


In [18]:
# Count the articles whose text mentions "Trump" or "TRUMP".
# The second positional argument of str.contains is `case`, not another pattern, so
# passing "TRUMP" there never matched it; a regex alternation covers both spellings.
data[data["text"].str.contains("Trump|TRUMP")].count()

title                     22154
text                      22154
subject                   22154
date                      22154
true/false                22154
true/false_description    22154
dtype: int64

In [21]:
# Number of times "Trump" occurs in the text of the article with index label 0.
data.loc[0, "text"].count("Trump")

11

In [99]:
def Drop_words_helper_fct(x, word_list):
    """Return ``x`` with every occurrence of each word in ``word_list`` removed.

    The words are removed one after another, in list order, by plain
    case-sensitive substring replacement; surrounding whitespace is left as is.
    """
    remaining = x
    for unwanted in word_list:
        remaining = remaining.replace(unwanted, "")
    return remaining


In [100]:
def Drop_words(df, column_name, word_list):
    """Add a ``cleaned_<column_name>`` column to ``df`` with the given words removed.

    Each value of ``df[column_name]`` is passed through
    ``Drop_words_helper_fct`` together with ``word_list``. ``df`` is modified
    in place and nothing is returned.
    """
    def _strip_words(value):
        # apply passes only the cell value, so the word list is captured here.
        return Drop_words_helper_fct(value, word_list)

    cleaned = df[column_name].apply(_strip_words)
    df[f'cleaned_{column_name}'] = cleaned
    
    


In [101]:
# Two-row sample for trying out Drop_words. Take an explicit copy: Drop_words adds a
# column in place, and doing that on a bare slice of `data` raises
# SettingWithCopyWarning.
df_3 = data.loc[[0, 1]].copy()
df_3

Unnamed: 0,title,text,subject,date,true/false,true/false_description
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,1,False
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,1,False


In [108]:
# Remove "Trump" and "Donald" from the titles; this adds a "cleaned_title" column to
# df_3 in place.
Drop_words(df_3, "title", ["Trump", "Donald"])

In [109]:
# Show the result.
# NOTE(review): the saved output also contains a "cleaned_text" column that no visible
# cell creates - presumably left over from an earlier run; re-run from a fresh kernel
# to confirm.
df_3

Unnamed: 0,title,text,subject,date,true/false,true/false_description,cleaned_text,cleaned_title
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,1,False,Donald couldn t all Americans a Happy New Y...,Sends Out Embarrassing New Year’s Eve Messa...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,1,False,House Intelligence Committee Chairman Devin Nu...,Drunk Bragging Staffer Started Russian Collu...
