In [1]:
# import packages
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# import news data set 
path = "Data/news.csv"
news = pd.read_csv(path)

In [3]:
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# create length column
def createLengthCol(data, data_column, new_col_name):
    """Add a word-count column for a text column of ``data``.

    Words are the pieces produced by splitting each value on whitespace.
    Missing values count as zero words instead of raising.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to annotate. It is modified in place.
    data_column : str
        Name of the text column to measure.
    new_col_name : str
        Name of the column that receives the word counts.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``new_col_name`` added.
    """
    split_col = data[data_column].str.split()
    # echo the column being processed (appears in the cell output)
    print(data_column)
    # .str.len() yields NaN for missing text where apply(len) would raise a
    # TypeError; those rows are then counted as 0 words
    col_length = split_col.str.len().fillna(0).astype("int64")
    # add to the dataframe that was passed in
    data[new_col_name] = col_length
    return data

In [5]:
# create dummy variables and rename the columns appropriately
def createDummy(data, dummy_col, old_name, new_name, drop_cols):
    """One-hot encode ``dummy_col`` and keep one indicator under a new name.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    dummy_col : str
        Column whose distinct values become 0/1 indicator columns.
    old_name : str
        Indicator column (a value of ``dummy_col``) to keep.
    new_name : str
        Name given to the kept indicator column.
    drop_cols : str or list of str
        Columns removed after the indicators are joined (can be a list).

    Returns
    -------
    pandas.DataFrame
        ``data`` plus the indicator columns, minus ``drop_cols``, with
        ``old_name`` renamed to ``new_name``.
    """
    # dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would
    # otherwise return True/False
    indicators = pd.get_dummies(data[dummy_col], dtype=int)
    data = data.join(indicators).drop(drop_cols, axis=1)
    # rename the kept indicator column to new_name
    data = data.rename(columns={old_name: new_name})
    return data

In [6]:
# creates title length and body length columns
def create_lengths(data):
    """Standardise column names and add title/body word-count columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``title`` and ``text`` columns and, optionally, a leftover
        ``"Unnamed: 0"`` column. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without ``"Unnamed: 0"``, with ``title``/``text`` renamed to
        ``Article_title``/``Article_body`` and with ``Title_length`` and
        ``Body_length`` columns added.
    """
    # remove the unwanted column from the frame that was passed in, never from
    # a notebook-level global; errors="ignore" makes this a no-op when absent
    data = data.drop(columns=["Unnamed: 0"], errors="ignore")
    # rename columns
    data = data.rename(columns={"title": "Article_title", "text": "Article_body"})
    # create title length
    data = createLengthCol(data, "Article_title", "Title_length")
    # create body length
    data = createLengthCol(data, "Article_body", "Body_length")
    return data

In [7]:
news.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [8]:
# create length variables
news = create_lengths(news)
# create dummy values, real = 0,fake  = 1
news = createDummy(news,"label","FAKE","Target",["label","REAL"])

Article_title
Article_body


In [9]:
# remove new line characters
news = news.replace(r'\n',' ', regex=True)
news.shape

(6335, 5)

In [10]:
# replace curly apostrophes and en dashes in the titles with plain ASCII equivalents
news["Article_title"] = news["Article_title"].replace(
    {u"\u2019": "'", u"\u2018": "'", '\u2013': '-'},
    regex=True,
)

In [11]:
# replace curly apostrophes and en dashes in the bodies with plain ASCII equivalents
news["Article_body"] = news["Article_body"].replace(
    {u"\u2019": "'", u"\u2018": "'", '\u2013': '-'},
    regex=True,
)

In [12]:
# drop rows (not columns) whose article body contains zero words
news = news[news["Body_length"] != 0 ]
news.shape

(6299, 5)

In [13]:
# remove any rows that have a title length of less than 3
news = news[news["Title_length"]>=3]
news.shape

(6270, 5)

In [14]:
# remove any rows that have the same article name
news = news.drop_duplicates(subset = "Article_title")
news.shape

(6190, 5)

In [15]:
# remove any rows that have the same article body
news = news.drop_duplicates(subset = "Article_body")
news.shape

(5983, 5)

In [16]:
news.reset_index(drop=True,inplace=True)

In [17]:
# store as an Excel workbook: exporting to csv created unwanted characters, and Tableau reads the workbook correctly
# `encoding` is not passed: to_excel never used it and pandas >= 2.0 rejects the keyword
news.to_excel("Data/news_clean.xlsx", na_rep = "NAN")

In [18]:
# # store in csv in case its needed
# news.to_csv("Data/news_clean.csv", na_rep = "NAN", encoding = "utf-8-sig")

In [19]:
# read in second data set
news2real = pd.read_csv("Data/news2real.csv")
news2fake = pd.read_csv("Data/news2fake.csv")

In [20]:
# add labels as integers (0 = real, 1 = fake) so Target has the same numeric
# type as the first dataset once the frames are concatenated
news2real["Target"] = 0
news2fake["Target"] = 1

In [21]:
# join real and fake datasets
news2 = pd.concat([news2real,news2fake])

In [22]:
news2.shape

(44898, 5)

In [23]:
# drop rows whose title is identical to the article text
news2 = news2[news2["title"]!= news2["text"]]

In [24]:
news2.shape

(44889, 5)

In [25]:
news2 = news2.reset_index(drop =True)

In [26]:
# remove unwanted columns
news2.drop(["date","subject"],axis =1,inplace = True)

In [27]:
# create title length and body length columns
news2 = create_lengths(news2)

Article_title
Article_body


In [28]:
news2.shape

(44889, 5)

In [29]:
# remove new line characters
news2 = news2.replace(r'\n',' ', regex=True)

In [30]:
# replace curly apostrophes and en dashes in the titles with plain ASCII equivalents
news2["Article_title"] = news2["Article_title"].replace(
    {u"\u2019": "'", u"\u2018": "'", '\u2013': '-'},
    regex=True,
)

In [31]:
# replace curly apostrophes and en dashes in the bodies with plain ASCII equivalents
news2["Article_body"] = news2["Article_body"].replace(
    {u"\u2019": "'", u"\u2018": "'", '\u2013': '-'},
    regex=True,
)

In [32]:
news2.shape

(44889, 5)

In [33]:
news2.head(5)

Unnamed: 0,Article_title,Article_body,Target,Title_length,Body_length
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,0,10,749
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,0,9,624
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,0,10,457
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,0,9,376
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,0,11,852


In [34]:
# drop rows (not columns) whose article body contains zero words
news2 = news2[news2["Body_length"] != 0 ]
news2.shape

(44258, 5)

In [35]:
# drop rows (not columns) whose article body has fewer than 50 words
news2 = news2[news2["Body_length"] >= 50 ]
news2.shape

(42677, 5)

In [36]:
# remove any rows that have a title length of less than 3
news2 = news2[news2["Title_length"]>=3]
news2.shape

(42676, 5)

In [37]:
# duplicated articles were found: drop rows that are identical in every column
news2.drop_duplicates(inplace=True)
news2.shape

(37440, 5)

In [38]:
# drop rows that repeat an already-seen article body, even when the titles differ slightly
news2.drop_duplicates(subset = "Article_body",inplace=True)
news2.shape

(37429, 5)

In [39]:
# remove any rows that have the same article title but different body
news2.drop_duplicates(subset = "Article_title",inplace=True)
news2.shape

(37057, 5)

In [40]:
news2.head(5)

Unnamed: 0,Article_title,Article_body,Target,Title_length,Body_length
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,0,10,749
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,0,9,624
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,0,10,457
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,0,9,376
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,0,11,852


In [41]:
news2.reset_index(drop=True,inplace=True)

In [42]:
# # store in csv in case its needed
# news2.to_csv("Data/news2.csv",encoding = "utf-8-sig", na_rep = "NAN")

In [43]:
# join datasets together
final_news = pd.concat([news,news2])
final_news.dropna(inplace=True)

In [44]:
final_news.reset_index(drop=True,inplace=True)

In [45]:
final_news.shape

(43040, 5)

In [46]:
final_news.head(5)

Unnamed: 0,Article_title,Article_body,Title_length,Body_length,Target
0,You Can Smell Hillary's Fear,"Daniel Greenfield, a Shillman Journalism Fello...",5,1296,1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,14,446,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,9,431,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",15,404,1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,9,317,0


In [47]:
# add a column for analysis: Number_of_tweets is the number of '@' characters in the
# article body (every '@' is counted, so e-mail addresses are included as well)
final_news["Number_of_tweets"] = final_news["Article_body"].str.count('@')

In [48]:
final_news["corpus"] = final_news["Article_title"] +" " +final_news["Article_body"]

In [49]:
# keep only rows whose corpus is pure ASCII: a row containing any non-ASCII
# character is dropped entirely (the characters are not stripped from the text)
# NOTE(review): confirm that dropping whole rows, rather than cleaning them, is intended
final_news = final_news[final_news["corpus"].apply(str.isascii)]

In [50]:
# check with an example
final_news["corpus"].str.contains("کدآمایی").sum()

0

In [51]:
# remove tweet names
def remove_tweets(column):
    """Return the text with every whitespace-separated token containing '@' removed.

    Parameters
    ----------
    column : str
        A single text value (the function is applied element-wise to a column).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # build a filtered list instead of removing from the list being iterated,
    # which skipped the token following each removed one
    kept_words = [word for word in column.split() if "@" not in word]
    return " ".join(kept_words)

In [52]:
final_news["corpus"] = final_news["corpus"].apply(remove_tweets)

In [53]:
final_news["Number_of_tweets"].value_counts()

0     20880
1      1489
2       702
3       346
4       199
      ...  
49        1
50        1
60        1
58        1
78        1
Name: Number_of_tweets, Length: 61, dtype: int64

In [54]:
# keep only rows with fewer than 20 '@' mentions (rows with 20 or more are dropped)
final_news = final_news[final_news["Number_of_tweets"]<20]

In [55]:
final_news.head(5)

Unnamed: 0,Article_title,Article_body,Title_length,Body_length,Target,Number_of_tweets,corpus
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,9,317,0,0,The Battle of New York: Why This Primary Matte...
13,"Trump takes on Cruz, but lightly","Killing Obama administration rules, dismantlin...",6,17,0,0,"Trump takes on Cruz, but lightly Killing Obama..."
24,Anti-Trump forces seek last-ditch delegate revolt,Washington (CNN) The faction of the GOP that i...,6,1234,0,0,Anti-Trump forces seek last-ditch delegate rev...
29,GOP insiders: Carly crushed it,"On this day in 1973, J. Fred Buzhardt, a lawye...",5,28,0,0,GOP insiders: Carly crushed it On this day in ...
36,Donald Groped Hillary in 2005! Trump and Weine...,"Topics: anthony weiner , presidential politics...",11,259,1,0,Donald Groped Hillary in 2005! Trump and Weine...


In [56]:
# store as an Excel workbook: exporting to csv created unwanted characters, and Tableau reads the workbook correctly
# `encoding` is not passed: to_excel never used it and pandas >= 2.0 rejects the keyword
final_news.to_excel("Data/final_news.xlsx", sheet_name="Fake_news_data_set", na_rep = "NAN")

In [57]:
# # store in csv in case its needed
# final_news.to_csv("Data/final_news.csv",encoding = "utf-8-sig", na_rep = "NAN")

In [58]:
# create testing data: a reproducible (random_state=42) random sample of 30% of the
# rows of final_news, re-indexed from 0; the sampled rows are not removed from final_news
testing_data = final_news.sample(frac = 0.3,random_state= 42,ignore_index =True)
testing_data.head()

Unnamed: 0,Article_title,Article_body,Title_length,Body_length,Target,Number_of_tweets,corpus
0,Clinton leads Trump by 12 points ahead of Repu...,NEW YORK (Reuters) - Republican presidential c...,12,353,0,0,Clinton leads Trump by 12 points ahead of Repu...
1,NEWT GINGRICH Gives Best Ever Solution to Stop...,Newt Gingrich: You start putting a few people...,11,339,1,2,NEWT GINGRICH Gives Best Ever Solution to Stop...
2,Breaking: President Trump Pardons Sheriff Joe ...,Reuters is reporting: U.S. President Donald T...,8,119,1,1,Breaking: President Trump Pardons Sheriff Joe ...
3,Barely a quarter of Catalans want to pursue sp...,MADRID (Reuters) - Barely a quarter of Catalan...,12,337,0,0,Barely a quarter of Catalans want to pursue sp...
4,Watch Morning Joe's Mika DESTROY Paul Ryan Fo...,Republican House Speaker Paul Ryan has been ge...,14,381,1,0,Watch Morning Joe's Mika DESTROY Paul Ryan For...


In [59]:
# # store in csv in case its needed
# testing_data.to_csv("Data/testing_data.csv",encoding = "utf-8-sig", na_rep = "NAN")

In [60]:
# store as an Excel workbook: exporting to csv created unwanted characters, and Tableau reads the workbook correctly
# `encoding` is not passed: to_excel never used it and pandas >= 2.0 rejects the keyword
testing_data.to_excel("Data/testing_data.xlsx", sheet_name="Fake_news_data_set", na_rep = "NAN")