In [1]:
#Data Container import
from class_DataContainer import DataContainer

#Data Preprocessing imports
import pandas as pd
import numpy as np
import string

#Classification imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

#Plotting imports
import matplotlib.pyplot as plt

In [2]:
p = {'true': './True.csv', 'fake': './Fake.csv'}
data = DataContainer(paths=p)
    
outlier = ['https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg',
'https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg',
'https://100percentfedup.com/12-yr-old-black-conservative-whose-video-to-obama-went-viral-do-you-really-love-america-receives-death-threats-from-left/',
'https://100percentfedup.com/video-hillary-asked-about-trump-i-just-want-to-eat-some-pie/',
'MSNBC HOST Rudely Assumes Steel Worker Would Never Let His Son Follow in His Footsteps…He Couldn’t Be More Wrong [Video]',
'https://100percentfedup.com/served-roy-moore-vietnamletter-veteran-sets-record-straight-honorable-decent-respectable-patriotic-commander-soldier/']

for date in data.data["date"]:
   if date in outlier:
       data.data.drop(data.data[data.data["date"] == date].index, inplace=True)

In [3]:
#Feature Engineering
# len str
# count upper
# count lower
# count punctuation
# subject encode

data.data['length'] = data.data['text'].apply(len)
data.data['length_title'] = data.data['title'].apply(len)

data.data['upper'] = data.data['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
data.data['upper_title'] = data.data['title'].apply(lambda x: len([x for x in x.split() if x.isupper()]))

data.data['lower'] = data.data['text'].apply(lambda x: len([x for x in x.split() if x.islower()]))
data.data['lower_title'] = data.data['title'].apply(lambda x: len([x for x in x.split() if x.islower()]))

data.data['punctuation'] = data.data['text'].apply(lambda x: len([x for x in x.split() if x in string.punctuation]))
data.data['punctuation_title'] = data.data['title'].apply(lambda x: len([x for x in x.split() if x in string.punctuation]))

data.data['exclamation'] = data.data['text'].apply(lambda x: len([x for x in x.split() if x == '!']))
data.data['exclamation_title'] = data.data['title'].apply(lambda x: len([x for x in x.split() if x == '!']))

data.data['question'] = data.data['text'].apply(lambda x: len([x for x in x.split() if x == '?']))
data.data['question_title'] = data.data['title'].apply(lambda x: len([x for x in x.split() if x == '?']))

data.data['numeric'] = data.data['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data.data['numeric_title'] = data.data['title'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

data.data['unique_words'] = data.data['text'].apply(lambda x: len(set(x.split())))
data.data['unique_words_title'] = data.data['title'].apply(lambda x: len(set(x.split())))

data.categorical('subject')
data.encode_categorical()

data.date('date') #Date data has varying formats, datetime conversion may mix day and month.

data.optimize_int()
data.optimize_float()


data.info

<class 'pandas.core.frame.DataFrame'>
Index: 44888 entries, 0 to 44897
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   title               44888 non-null  object        
 1   text                44888 non-null  object        
 2   subject             44888 non-null  int8          
 3   date                44888 non-null  datetime64[ns]
 4   label               44888 non-null  int8          
 5   length              44888 non-null  int32         
 6   length_title        44888 non-null  int16         
 7   upper               44888 non-null  int16         
 8   upper_title         44888 non-null  int8          
 9   lower               44888 non-null  int16         
 10  lower_title         44888 non-null  int8          
 11  punctuation         44888 non-null  int8          
 12  punctuation_title   44888 non-null  int8          
 13  exclamation         44888 non-null  int8          


In [4]:
data.feature_split(label=['label'], exclude=['date', 'text', 'title'])

X_train, X_test, y_train, y_test = train_test_split(data.feature, data.label, test_size=0.2, random_state=99)

model = RandomForestClassifier()
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy', 'log_loss'],
    'n_jobs' : [-1]
}
gs = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
gs.fit(X_train, y_train.values.ravel())
gs.best_params_

{'criterion': 'log_loss',
 'max_depth': 8,
 'max_features': 'log2',
 'n_estimators': 100,
 'n_jobs': -1}

In [5]:
model.set_params(**gs.best_params_)
model.fit(X_train, y_train.values.ravel())
model.score(X_test, y_test)

0.9997772332368011

In [8]:
pred = model.predict(X_test)
