In [1]:
# import packages
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load the first news data set from its CSV file
path = "Data/news.csv"
news = pd.read_csv(filepath_or_buffer=path)

In [3]:
# preview the first five rows of the raw data
news.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# create length column
def createLengthCol(data, data_column, new_col_name):
    """Add a word-count column for a text column of ``data``.

    Each value of ``data[data_column]`` is split on whitespace and the number
    of resulting tokens is stored in ``data[new_col_name]``. Missing values
    count as 0 words instead of raising a ``TypeError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    data_column : str
        Name of the text column to measure.
    new_col_name : str
        Name of the integer column to create (or overwrite).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the new column added.
    """
    # split on whitespace; a missing value stays missing at this point
    tokens = data[data_column].str.split()
    # length of each token list; missing text is treated as zero words
    data[new_col_name] = tokens.str.len().fillna(0).astype(int)
    return data

In [5]:
# create dummy variables and rename the columns appropriately
def createDummy(data, dummy_col, old_name, new_name, drop_cols):
    """One-hot encode a column and keep one indicator under a new name.

    Indicator columns are built from ``data[dummy_col]`` (one per distinct
    value), joined onto ``data``, the columns in ``drop_cols`` are removed and
    the indicator called ``old_name`` is renamed to ``new_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    dummy_col : str
        Column to turn into indicator columns.
    old_name : str
        Name of the indicator column to keep (a value found in ``dummy_col``).
    new_name : str
        New name for that indicator column.
    drop_cols : str or list of str
        Column(s) to drop after the indicators have been joined.

    Returns
    -------
    pandas.DataFrame
        A new frame with integer 0/1 indicators.
    """
    # dtype=int forces 0/1 indicators; pandas >= 2.0 would otherwise return booleans
    indicators = pd.get_dummies(data[dummy_col], dtype=int)
    encoded = data.join(indicators).drop(columns=drop_cols)
    # rename the kept indicator to new_name
    return encoded.rename(columns={old_name: new_name})

In [6]:
# creates title length and body length columns
def create_lengths(data):
    """Rename the text columns and add word-count columns for title and body.

    Drops the "Unnamed: 0" column when present, renames "title" to
    "Article_title" and "text" to "Article_body", then adds "Title_length"
    and "Body_length" via ``createLengthCol``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing "title" and "text" columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns and the two length columns.
    """
    if "Unnamed: 0" in data.columns:
        # drop from the frame that was passed in, not from the global `news`
        data = data.drop(columns=["Unnamed: 0"])
    # rename columns
    data = data.rename(columns={"title": "Article_title", "text": "Article_body"})
    # createLengthCol adds each column to `data` in place
    createLengthCol(data, "Article_title", "Title_length")
    createLengthCol(data, "Article_body", "Body_length")
    return data

In [7]:
# preview the first five rows before any cleaning is applied
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [8]:
# add the title/body length columns
news = create_lengths(data=news)
# one-hot encode the label and keep the FAKE indicator as Target (real = 0, fake = 1)
news = createDummy(
    data=news,
    dummy_col="label",
    old_name="FAKE",
    new_name="Target",
    drop_cols=["label", "REAL"],
)

Article_title
Article_body


In [9]:
# replace every newline character with a single space
news = news.replace(to_replace=r'\n', value=' ', regex=True)
news.shape

(6335, 5)

In [10]:
# drop rows whose article body has a length of 0
news = news.loc[news["Body_length"].ne(0)]
news.shape

(6299, 5)

In [11]:
# keep only rows whose title length is at least 3
news = news.loc[news["Title_length"].ge(3)]
news.shape

(6270, 5)

In [12]:
# when several rows share an article title, keep only the first one
news = news.drop_duplicates(subset=["Article_title"], keep="first")
news.shape

(6191, 5)

In [13]:
# when several rows share an article body, keep only the first one
news = news.drop_duplicates(subset=["Article_body"], keep="first")
news.shape

(5983, 5)

In [14]:
# renumber the rows 0..n-1 now that filtering has left gaps in the index
news = news.reset_index(drop=True)

In [15]:
# store in excel spreadsheet as turning into csv creates unwanted characters and tableau can read it right
# `encoding` is not passed: to_excel deprecated it in pandas 1.5 and removed it in 2.0, where it raises TypeError
news.to_excel("Data/news_clean.xlsx", na_rep="NAN")

In [16]:
# # store in csv in case its needed
# news.to_csv("Data/news_clean.csv", na_rep = "NAN", encoding = "utf-8-sig")

In [17]:
# load the second data set, which comes as separate real and fake files
news2real = pd.read_csv(filepath_or_buffer="Data/news2real.csv")
news2fake = pd.read_csv(filepath_or_buffer="Data/news2fake.csv")

In [18]:
# add labels: real = 0, fake = 1
# integers (not the strings "0"/"1") so Target has the same numeric type as in the first data set
news2real["Target"] = 0
news2fake["Target"] = 1

In [19]:
# stack the real and fake frames on top of each other
news2 = pd.concat(objs=[news2real, news2fake], axis=0)

In [20]:
# renumber the rows 0..n-1; the concatenated frame still carries each source file's own index
news2 = news2.reset_index(drop=True)

In [21]:
# drop the date and subject columns
news2 = news2.drop(columns=["date", "subject"])

In [22]:
# rename the text columns and add the title/body length columns
news2 = create_lengths(data=news2)

Article_title
Article_body


In [23]:
# (rows, columns) of the second data set before cleaning
news2.shape

(44898, 5)

In [24]:
# replace every newline character with a single space
news2 = news2.replace(to_replace=r'\n', value=' ', regex=True)
news2.shape

(44898, 5)

In [25]:
# drop rows whose article body has a length of 0
news2 = news2.loc[news2["Body_length"].ne(0)]
news2.shape

(44267, 5)

In [26]:
# keep only rows whose body length is at least 50
news2 = news2.loc[news2["Body_length"].ge(50)]
news2.shape

(42677, 5)

In [27]:
# keep only rows whose title length is at least 3
news2 = news2.loc[news2["Title_length"].ge(3)]
news2.shape

(42676, 5)

In [28]:
# drop rows that are exact duplicates across every column
news2 = news2.drop_duplicates()
news2.shape

(37440, 5)

In [29]:
# when several rows share an article body (even under slightly different titles), keep only the first
news2 = news2.drop_duplicates(subset=["Article_body"], keep="first")
news2.shape

(37429, 5)

In [30]:
# when several rows share an article title (even with different bodies), keep only the first
news2 = news2.drop_duplicates(subset=["Article_title"], keep="first")
news2.shape

(37057, 5)

In [31]:
# preview the first five rows of the cleaned second data set
news2.head()

Unnamed: 0,Article_title,Article_body,Target,Title_length,Body_length
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,0,10,749
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,0,9,624
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,0,10,457
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,0,9,376
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,0,11,852


In [32]:
# renumber the rows 0..n-1 now that filtering has left gaps in the index
news2 = news2.reset_index(drop=True)

In [34]:
# # store in csv in case its needed
# news2.to_csv("Data/news2.csv",encoding = "utf-8-sig", na_rep = "NAN")

In [35]:
# stack both cleaned data sets; columns are matched by name
final_news = pd.concat(objs=[news, news2], axis=0)

In [36]:
# renumber the rows 0..n-1; after the concat both sources' index values repeat
final_news = final_news.reset_index(drop=True)

In [37]:
# (rows, columns) of the combined data set
final_news.shape

(43040, 5)

In [40]:
# preview the first five rows of the combined data set
final_news.head()

Unnamed: 0,Article_title,Article_body,Title_length,Body_length,Target
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",5,1296,1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,14,446,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,9,431,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",15,404,1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,9,317,0


In [44]:
# look at amount of quoted tweets as fake news articles may quote them more
# count emotive words/ swear words/ non professional words (fake news may use these more)
# see what categories fake news tends to cover the most
# create word list to look for (politicians names)
# when user enters title/body make sure we remove stop words/ punctuation and make everything lower case

In [112]:
# adding new columns for analysis
# Number_of_tweets is the number of '@' characters in each article body
final_news = final_news.assign(
    Number_of_tweets=final_news["Article_body"].str.count('@')
)

In [113]:
# preview the first five rows including the new Number_of_tweets column
final_news.head()

Unnamed: 0,Article_title,Article_body,Title_length,Body_length,Target,Number_of_tweets
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",5,1296,1,0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,14,446,1,3
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,9,431,0,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",15,404,1,6
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,9,317,0,0


In [None]:
# store in excel spreadsheet as turning into csv creates unwanted characters and tableau can read it right
# `encoding` is not passed: to_excel deprecated it in pandas 1.5 and removed it in 2.0, where it raises TypeError
final_news.to_excel("Data/final_news.xlsx", sheet_name="Fake_news_data_set", na_rep="NAN")

In [None]:
# # store in csv in case its needed
# final_news.to_csv("Data/final_news.csv",encoding = "utf-8-sig", na_rep = "NAN")

In [None]:
# create testing data: a seeded 30% random sample, renumbered 0..n-1
testing_data = final_news.sample(frac=0.3, random_state=42).reset_index(drop=True)
testing_data.head(5)

In [None]:
# # store in csv in case its needed
# testing_data.to_csv("Data/testing_data.csv",encoding = "utf-8-sig", na_rep = "NAN")

In [None]:
# store in excel spreadsheet as turning into csv creates unwanted characters and tableau can read it right
# `encoding` is not passed: to_excel deprecated it in pandas 1.5 and removed it in 2.0, where it raises TypeError
testing_data.to_excel("Data/testing_data.xlsx", sheet_name="Fake_news_data_set", na_rep="NAN")