In [None]:
from sklearn import model_selection, feature_extraction
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pickle
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier

# Input and output folders (currently the same directory).
baseread = r'./data/lemmatized'
basepath = r'./data/lemmatized'

# Load the documents and the label file; both are parsed with the same
# set of strings treated as missing values.
fakerealnews = pd.read_csv(
    baseread + r'/fakereal_dropped36487.csv',
    sep=',', header=0, na_values=['?', ' ', '', '.', 'NA', 'NAN'])
# NOTE(review): fakereallabel is loaded but not used by the code in this cell;
# the labels below come from fakerealnews['label'].
fakereallabel = pd.read_csv(
    baseread + r'/labels_dropped36487.csv',
    sep=',', header=0, na_values=['?', ' ', '', '.', 'NA', 'NAN'])

# Absolute hold-out sizes (row counts, not fractions).
# NOTE(review): 44266 is presumably the number of rows in fakerealnews -- confirm.
test_size = round(44266 * .25)          # 11066 test rows
valid_size = round(44266 * .75 * .005)  # 166 prototype rows

# First split: shuffled with a fixed seed so the test set is reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    fakerealnews['document'], fakerealnews['label'],
    test_size=test_size, random_state=42, shuffle=True)

# Second split: no shuffling, so the last `valid_size` rows of the training
# portion become the prototype set and the row order is left untouched.
X_train, X_protype, y_train, y_protype = model_selection.train_test_split(
    X_train, y_train, test_size=valid_size, random_state=None, shuffle=False)

# Persist every split together with its original row index.
for _csv_name, _split in (
    (r'/xtraincleanrealfake.csv', X_train),
    (r'/ytraincleanrealfakelabel.csv', y_train),
    (r'/xtestcleanrealfake.csv', X_test),
    (r'/ytestcleanrealfakelabel.csv', y_test),
    (r'/xprotypecleanrealfake.csv', X_protype),
    (r'/yprotypecleanrealfakelabel.csv', y_protype),
):
    _split.to_csv(basepath + _csv_name, index=True, index_label='index')
del _csv_name, _split  # keep the notebook namespace free of loop temporaries

In [None]:
## Shared stop-word list used by the TF-IDF and lemmatization helpers below:
## NLTK's English stop words plus a handful of extra words, archaic forms and
## contraction fragments (presumably the pieces word_tokenize produces for
## contractions such as "'ll" or "n't" -- confirm).
stop_words = list(stopwords.words('english'))
stop_words.extend([
    'since', 'said', 'yet',
    'thou', 'thee', 'art', 'thy', 'thine', 'ye', 'hast', 'hath', 'upon', 'unto',
    "'d", "'ll", "'re", "'s", "'ve",
    'could', 'might', 'must', "n't", 'need', 'sha', 'wo', 'would',
])

def extract_features(train, test, protype, ngram_range=(1,1), min_df=1, max_features=10000):
    '''Convert three document collections into TF-IDF matrices.

    The vocabulary and the IDF weights are learned from ``train`` only;
    ``test`` and ``protype`` are projected with the already fitted objects.

    train, test, protype: iterables of text documents
    ngram_range: (min_n, max_n) n-gram sizes, e.g. (2, 2) for bigrams only
    min_df: terms whose document frequency is below this value are ignored
            (the default 1 keeps every term that occurs at all)
    max_features: keep only the top ``max_features`` terms by term frequency
    return: (train_tfidf, test_tfidf, protype_tfidf)
    '''
    # Bag-of-words counter: lowercases, tokenizes with NLTK and removes the
    # module-level stop_words.
    bag_of_words = feature_extraction.text.CountVectorizer(
        lowercase=True,
        tokenizer=word_tokenize,
        stop_words=stop_words,
        ngram_range=ngram_range,
        min_df=min_df,
        max_features=max_features,
    )
    weighter = feature_extraction.text.TfidfTransformer()

    def to_tfidf(documents):
        # Project unseen documents with the vocabulary / IDF fitted on train.
        return weighter.transform(bag_of_words.transform(documents))

    train_counts = bag_of_words.fit_transform(train)
    train_tfidf = weighter.fit_transform(train_counts)
    return train_tfidf, to_tfidf(test), to_tfidf(protype)

def lemmatizer(corpus):
    '''Lemmatize every document in ``corpus`` and remove stop words.

    corpus:  iterable of documents (strings). Missing values (None / NaN,
             e.g. cells that pandas mapped to NaN via ``na_values``) are
             treated as empty documents instead of crashing the tokenizer.
    return:  (lemmatized, ndxlist)
             lemmatized -- list of documents, each a single string of
                           space-separated lemmas without stop words
             ndxlist    -- 0-based positions (in iteration order) of the
                           documents that had zero length after processing

    Documents listed in ``ndxlist`` are NOT present in ``lemmatized``, so the
    result can be shorter than ``corpus``; use ``ndxlist`` to drop the matching
    label rows. Stop-word matching is case-sensitive: tokens are compared
    exactly as the tokenizer produced them.
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    # Build the lookup once: set membership is O(1), a list scan is not.
    stop_set = set(stop_words)
    lemmatized = []
    ndxlist = []
    for position, doc in enumerate(corpus):
        # `doc != doc` is only true for NaN; such a document has no content.
        if doc is None or (isinstance(doc, float) and doc != doc):
            ndxlist.append(position)
            continue
        lemmas = [
            wordnet_lemmatizer.lemmatize(token)
            for token in word_tokenize(doc)
            if token not in stop_set
        ]
        if lemmas:
            lemmatized.append(" ".join(lemmas).strip())
        else:
            ndxlist.append(position)
    return lemmatized, ndxlist

In [None]:
# Reload the three saved document splits (train, test, prototype) from disk,
# in that order, using the same parsing options for each file.
xtraincleanrealfake, xtestcleanrealfake, xprotypecleanrealfake = (
    pd.read_csv(basepath + csv_name, sep=',',
                header=0, na_values=['?',' ','','.','NA','NAN'])
    for csv_name in (
        r'/xtraincleanrealfake.csv',
        r'/xtestcleanrealfake.csv',
        r'/xprotypecleanrealfake.csv',
    )
)

In [None]:
%%time
### Lemmatize the documents of each split (DataFrame -> list of strings).
# lemmatizer() leaves out documents that end up empty and returns their
# positions. Keep one index list per split so the matching label rows can be
# dropped as well; reusing a single name would discard the train/test lists.
xtraincleanrealfake, train_ndxlist = lemmatizer(xtraincleanrealfake['document'])
xtestcleanrealfake, test_ndxlist = lemmatizer(xtestcleanrealfake['document'])
xprotypecleanrealfake, protype_ndxlist = lemmatizer(xprotypecleanrealfake['document'])
# Backward compatibility: `ndxlist` previously ended up holding the prototype
# split's indices, so keep that binding for any later cell that reads it.
ndxlist = protype_ndxlist

In [None]:
%%time
# Vectorize the three document splits as TF-IDF (fitted on train only).
# The label datasets are left as they are -- they do not need TF-IDF.
xtraincleanrealfake_tfidf, xtestcleanrealfake_tfidf, xprotypecleanrealfake_tfidf = extract_features(
    train=xtraincleanrealfake,
    test=xtestcleanrealfake,
    protype=xprotypecleanrealfake,
)

In [None]:
## Persist the three TF-IDF matrices as pickle files next to the CSV splits.
import pickle

for _pkl_name, _matrix in (
    (r'/xtraincleanrealfake_tfidf.pkl', xtraincleanrealfake_tfidf),
    (r'/xtestcleanrealfake_tfidf.pkl', xtestcleanrealfake_tfidf),
    (r'/xprotypecleanrealfake_tfidf.pkl', xprotypecleanrealfake_tfidf),
):
    with open(basepath + _pkl_name, 'wb') as f:
        pickle.dump(_matrix, f)
del _pkl_name, _matrix  # keep the notebook namespace free of loop temporaries

In [None]:
# Sanity check: show the shape of each TF-IDF matrix, one per line
# (train, test, prototype).
print(
    xtraincleanrealfake_tfidf.shape,
    xtestcleanrealfake_tfidf.shape,
    xprotypecleanrealfake_tfidf.shape,
    sep='\n',
)