In [101]:
import pandas as pd
import numpy as np
import pickle
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('wordnet')
nltk.download('words')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package wordnet to /Users/kosta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /Users/kosta/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kosta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [102]:
words = list(nltk.corpus.words.words())
words = [word.lower() for word in words]
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

In [103]:
cd ..

/Users/kosta/github-tests/fake_news


In [104]:
cd Data

/Users/kosta/github-tests/fake_news/Data


In [105]:
ls

Fake.csv         True.csv         dtm.pickle.gzde  [34mpickled_data[m[m/


In [106]:
df_real = pd.read_csv("True.csv")
df_fake = pd.read_csv("Fake.csv")

### Getting familiar with the data

In [107]:
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [108]:
df_fake.dtypes

title      object
text       object
subject    object
date       object
dtype: object

In [109]:
df_fake['date'] = pd.to_datetime(df_fake['date'], errors='coerce')
df_fake = df_fake.dropna(subset=['date'])

In [110]:
df_fake.shape

(23471, 4)

In [111]:
df_fake['subject'].unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [112]:
df_fake.nunique()

title      17897
text       17449
subject        6
date        1010
dtype: int64

In [113]:
df_real.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [114]:
df_real.dtypes

title      object
text       object
subject    object
date       object
dtype: object

In [115]:
df_real['date'] = pd.to_datetime(df_real['date'], errors='coerce')
df_real = df_real.dropna(subset=['date'])

In [116]:
df_real.shape

(21417, 4)

In [117]:
df_real.nunique()

title      20826
text       21192
subject        2
date         716
dtype: int64

In [118]:
df_real['subject'].unique()

array(['politicsNews', 'worldnews'], dtype=object)

Data cleaning:

0. Remove empty values
1. Remove duplicates
2. Merge tables
3. Make text all lower case
4. Remove punctuation, numerical values, and non-sensical text
5. Remove stop words, and Tokenize text

Future ideas:
6. Stemming / lemmatization
7. Deal with typos

### 0. Remove empty values

In [119]:
df_real.isna().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [120]:
df_real = df_real.replace(" ", np.nan)

In [121]:
df_real = df_real.dropna()

In [122]:
df_fake.isna().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [123]:
df_fake = df_fake.replace(" ", np.nan)

In [124]:
df_fake = df_fake.dropna()

### 1. Remove duplicates

In [125]:
df_fake['title'].duplicated().value_counts()

False    17453
True      5392
Name: title, dtype: int64

In [126]:
df_fake['text'].duplicated().value_counts()

False    17448
True      5397
Name: text, dtype: int64

In [127]:
df_fake = df_fake.drop_duplicates(subset=['text'])
df_fake = df_fake.drop_duplicates(subset=['title'])

In [128]:
df_real['text'].duplicated().value_counts()

False    21191
True       225
Name: text, dtype: int64

In [129]:
df_real['title'].duplicated().value_counts()

False    20825
True       591
Name: title, dtype: int64

In [130]:
df_real = df_real.drop_duplicates(subset=['text'])
df_real = df_real.drop_duplicates(subset=['title'])

### 2. Merge tables

In [131]:
df_real['status'] = 1

In [132]:
df_fake['status'] = 0

In [133]:
to_merge = [df_real, df_fake]

In [134]:
df = pd.concat(to_merge)

In [135]:
df.head()

Unnamed: 0,title,text,subject,date,status
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,2017-12-31,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,2017-12-29,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,2017-12-31,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,2017-12-30,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,2017-12-29,1


In [136]:
df.shape

(38263, 5)

### 3. Make text all lower case

In [137]:
df['text'] = df['text'].str.lower()

In [138]:
df['title'] = df['title'].str.lower()

### 4. Remove punctuation, numerical values, and non-sensical text

In [139]:
test = df.iloc[15001,1]

In [140]:
test

'aden (reuters) - the saudi-led military coalition fighting the houthi movement in yemen has allowed work to resume at the southern port of aden, two days after ordering a nationwide stoppage, an official there said on wednesday.  the coalition said on monday it would close all air, land and sea ports in yemen to stem the flow of arms to the houthis from iran.  we were officially notified by the coalition this afternoon that the closure will be lifted and work will resume as normal,  the official, who declined to be named, said. he did not give any reasons, though aden is largely controlled by the coalition. '

In [141]:
def clean_df_2(test):
    # Stop words 1
    test_list = test.split() # Convert to list
    test_list = [w for w in test_list if not w in stop_words] # Remove stop words before symbols are removed
    test = " ".join(str(i) for i in test_list) # Convert to string
    # General cleaning
    test = test.lower() # Turns to lowercase
    test = test.split('(reuters) - ')[-1] # Removes the "seattle/washington (reuters) - " in the begining
    test = re.sub('-', ' ', test) # Replaces dashes with spaces
    test = re.sub('/', ' ', test) # Replaces slashes with spaces
    test = test.replace(u'\xa0', u' ') # Removing "\xa0" in the text
    test = re.sub('\n', '', test) # Removes '\n' sighs
    test = re.sub('[%s]' % re.escape(string.punctuation), '', test) # Removes punctuation
    test = re.sub('[‘’“”…]', '', test) # Removes '[‘’“”…]' symbols
    test = re.sub('\w*\d\w*', '', test) # Removes numbers and words containing numbers
    test = re.sub("theyre" , "", test) # removes "they are" stop word
    test = re.sub("\s\s+" , " ", test) # removes multiple spaces
    test = test.rstrip(' ') # removes spaces at the end of a string
    # Non-English words
    test = " ".join(w for w in nltk.wordpunct_tokenize(test) if w in words or not w.isalpha()) # Remove non-English words
    # Stop words 2
    test_list = test.split() # Convert to list
    test_list = [w for w in test_list if not w in stop_words] # Remove stop words
    test = " ".join(str(i) for i in test_list) # Convert to string
    
    return test

In [142]:
round2 = lambda x: clean_df_2(x)

In [145]:
import time
start = time.time()
df['text'] = pd.DataFrame(df.text.apply(round2))
end = time.time()
print(end - start)

14440.182802915573


In [149]:
df_ps = df.copy()

In [151]:
df_ps.to_pickle("df_pre_steem.pkl")

In [163]:
df_real_ps = df_ps.loc[df_ps["status"] == 1]
df_fake_ps = df_ps.loc[df_ps["status"] == 0]

In [164]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
data_cv = cv.fit_transform(df_ps.text)
data_ps_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_ps_dtm.index = df_ps.index
data_ps_dtm

data_cv = cv.fit_transform(df_real_ps.text)
data_real_ps_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_real_ps_dtm.index = df_real_ps.index

data_cv = cv.fit_transform(df_fake_ps.text)
data_fake_ps_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_fake_ps_dtm.index = df_fake_ps.index

In [170]:
#data_ps_dtm.to_pickle("data_ps_dtm.pkl")

In [169]:
data_real_ps_dtm

Unnamed: 0,aa,aardvark,aaron,ab,aba,aback,abacus,abandon,abandoned,abandonment,...,zippy,zircon,zloty,zoa,zombie,zone,zoning,zoo,zoom,zulu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21411,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21413,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21414,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
21415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [166]:
data_fake_ps_dtm.max().sort_values(ascending=False).head(30)

sentence        122
county          119
cocaine          98
distribute       98
intent           80
target           78
group            76
president        74
first            74
source           72
state            70
trump            70
would            69
rate             68
shooting         68
clinton          66
imprisonment     66
offense          65
organization     65
release          64
district         63
ramsey           63
las              63
expire           61
commutation      61
grant            61
prison           61
us               60
july             56
story            56
dtype: int64

In [154]:
data_real_ps_dtm.max().sort_values(ascending=False).head(30)

sentence        122
county          119
distribute       98
cocaine          98
intent           80
target           78
group            76
president        74
first            74
source           72
state            70
trump            70
would            69
shooting         68
rate             68
imprisonment     66
clinton          66
offense          65
organization     65
release          64
las              63
ramsey           63
district         63
prison           61
commutation      61
grant            61
expire           61
us               60
story            56
july             56
dtype: int64

In [88]:
#def clean_df(test):
#    # Stop words 1
#    test_list = test.split() # Convert to list
#    test_list = [w for w in test_list if not w in stop_words] # Remove stop words before symbols are removed
#    test = " ".join(str(i) for i in test_list) # Convert to string
#    # General cleaning
#    test = test.lower() # Turns to lowercase
#    test = test.split('(reuters) - ')[-1] # Removes the "seattle/washington (reuters) - " in the begining
#    test = re.sub('-', ' ', test) # Replaces dashes with spaces
#    test = re.sub('/', ' ', test) # Replaces slashes with spaces
#    test = test.replace(u'\xa0', u' ') # Removing "\xa0" in the text
#    test = re.sub('\n', '', test) # Removes '\n' sighs
#    test = re.sub('[%s]' % re.escape(string.punctuation), '', test) # Removes punctuation
#    test = re.sub('[‘’“”…]', '', test) # Removes '[‘’“”…]' symbols
#    test = re.sub('\w*\d\w*', '', test) # Removes numbers and words containing numbers
#    test = re.sub("theyre" , "", test) # removes "they are" stop word
#    test = re.sub("\s\s+" , " ", test) # removes multiple spaces
#    test = test.rstrip(' ') # removes spaces at the end of a string
#    # Non-English words
#    test = " ".join(w for w in nltk.wordpunct_tokenize(test) if w in words or not w.isalpha()) # Remove non-English words
#    # Stop words 2
#    test_list = test.split() # Convert to list
#    test_list = [w for w in test_list if not w in stop_words] # Remove stop words
#    test = " ".join(str(i) for i in test_list) # Convert to string
#    # Stemming
#    test_list = test.split() # Convert to list
#    test_list = [porter.stem(word) for word in test_list] # Stemming the data
#    test = " ".join(str(i) for i in test_list) # Convert to string
#    
#    return test

In [89]:
#round1 = lambda x: clean_df(x)

In [90]:
#df['text'] = pd.DataFrame(df.text.apply(round1))

In [93]:
#df.to_pickle("pickled_data/df_clean.pkl")

In [94]:
df.iloc[15001,1]

'militari coalit fight movement yemen work resum southern port two day nationwid stoppag offici said wednesday coalit said monday would close air land sea yemen stem flow arm iran offici notifi coalit afternoon closur work resum normal offici declin said give though larg coalit'

### Pickling the data:

In [53]:
#df.to_pickle("pickled_data/df.pkl")

In [54]:
#df_real.to_pickle("pickled_data/df_real.pkl")

In [55]:
#df_fake.to_pickle("pickled_data/df_fake.pkl")

In [56]:
#data_dtm.to_pickle('dtm.pickle.gzde', compression='gzip')

### Final cleaning touches

In [95]:
df["length_of_text"] = [len(text.split()) for text in df["text"]]

In [96]:
df = df.reset_index(drop=True)

In [97]:
df = df.loc[df["length_of_text"] > 50]

In [99]:
df = df.reset_index(drop=True)

In [1]:
df

NameError: name 'df' is not defined