## Training Data Cleaning

In [1]:
import os

import numpy as np
import pandas as pd

# Directory holding the dataset CSVs. Set the MISINFO_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original local folder is used, so existing runs are unaffected.
DATA_DIR = os.environ.get("MISINFO_DATA_DIR", r"C:\Users\vm658\Documents\SVSM\Research\Code")

# Source CSVs: one with the fake articles, one with the true articles.
fake_filepath = os.path.join(DATA_DIR, "DataSet_Misinfo_FAKE.csv")
true_filepath = os.path.join(DATA_DIR, "DataSet_Misinfo_TRUE.csv")

In [2]:
# Load the fake articles and tag every row with label 0, then clean in one chain.
fake_df = (
    pd.read_csv(fake_filepath)
    .assign(label=0)
    # remove first column
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    # drop rows that have any missing value
    .dropna(how='any')
    # keep a single row per distinct article text
    .drop_duplicates(subset=['text'])
)

In [3]:
# Load the true articles (file is read as latin-1) and tag every row with
# label 1, then clean in one chain.
true_df = (
    pd.read_csv(true_filepath, encoding='latin-1')
    .assign(label=1)
    # remove first column
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    # drop rows that have any missing value
    .dropna(how='any')
    # keep a single row per distinct article text
    .drop_duplicates(subset=['text'])
)

In [4]:
# Stack the true and fake articles into a single frame. ignore_index=True gives
# the result a fresh 0..n-1 index; without it the row labels carried over from
# the two source frames can repeat, which makes label-based lookups ambiguous.
df = pd.concat([true_df, fake_df], ignore_index=True)

In [5]:
import string

# Characters to strip: ASCII punctuation plus the curly single/double quotes.
punctuations = string.punctuation + "’‘“”"

# Deletion table built once from `punctuations`, so each call is a single pass
# over the text instead of a per-character search through the punctuation string.
# NOTE: the table is a snapshot; rebuild it if `punctuations` is changed later.
_PUNCT_TABLE = str.maketrans('', '', punctuations)


def remove_punctuation(text):
    """Return ``text`` with every character listed in ``punctuations`` removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without punctuation; all other characters
        (letters, digits, whitespace) are kept in their original order.
    """
    return text.translate(_PUNCT_TABLE)

In [6]:
# Normalize the article bodies: strip punctuation first, then lowercase.
text_without_punct = df['text'].apply(remove_punctuation)
df['text'] = text_without_punct.apply(str.lower)

In [7]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
import sklearn
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=1,
)


## Testing Data Cleaning

In [9]:
# Location of the external evaluation set. The folder can be overridden with
# the MISINFO_DATA_DIR environment variable; when it is unset the original
# local folder is used.
test_filepath = os.path.join(
    os.environ.get("MISINFO_DATA_DIR", r"C:\Users\vm658\Documents\SVSM\Research\Code"),
    "fake_or_real_news.csv",
)
test_df = pd.read_csv(test_filepath)

#relabel fake and true data as 0 and 1
test_df['label'] = test_df['label'].map({'FAKE': 0, 'REAL': 1})

# Drop the 'Unnamed: 0' column, rows with any missing value (this includes
# rows whose label was neither 'FAKE' nor 'REAL', which map() turns into NaN),
# and rows that repeat an earlier title.
test_df = (
    test_df
    .drop(columns=['Unnamed: 0'])
    .dropna(how='any')
    .drop_duplicates(subset=['title'])
)

# Titles: punctuation is removed but the casing is left as-is.
# NOTE(review): unlike 'text', titles are not lowercased here (the lowercasing
# step was disabled in the original notebook) - confirm this is intended.
test_df['title'] = test_df['title'].apply(remove_punctuation)

# Article bodies: remove punctuation, then make all text lowercase.
test_df['text'] = test_df['text'].apply(remove_punctuation).apply(str.lower)

In [10]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report
import pickle

import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# NLTK's English stop words followed by two hand-picked lists of extra tokens.
# Repeated entries (e.g. 'said') are kept exactly as in the original lists.
stop_words = (
    stopwords.words('english')
    + ['said', "'d", "'ll", "'re", "'s", "'ve", 'could', 'might', 'must', "n't", 'need', 'sha', 'wo', 'would']
    + ['is', 'it', 'to', 'be', 'said', 'he', 'one', 'that', 'also', 'in', 'this', 'are', 'an', 'you', 'they']
)

from nltk.tokenize import word_tokenize
# Fetch the 'punkt' resource (presumably required by word_tokenize - confirm).
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vm658\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vm658\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Colormap names that can be passed as the ``cmap`` argument of test(),
# presumably one per evaluation set (reference only; nothing reads this mapping):
#   'Train_data'  -> 'Greys'
#   'Test_titles' -> 'GnBu'
#   'Test_text'   -> 'Wistia'

def test(model, algorithm_name, x_test, y_test, cmap):
    """Evaluate a fitted classifier and display its accuracy, report and confusion matrix.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    algorithm_name : str
        Display name used in the printed output and the plot title.
    x_test
        Inputs passed to ``model.predict``.
    y_test
        True labels, encoded as 0 (fake) and 1 (true).
    cmap
        Colormap forwarded to ``plot_confusion_matrix``.

    Returns
    -------
    None
        Results are printed and plotted, not returned.
    """
    # Pin the label order so the report and the matrix always have a 'Fake'
    # row/column followed by a 'True' one, even if one class is absent.
    class_labels = [0, 1]

    #test the model
    prediction = model.predict(x_test)
    score = metrics.accuracy_score(y_test, prediction)

    # Pure f-string formatting: the previous f-string + '%' mix raised when
    # algorithm_name itself contained a '%' character.
    print(f"{algorithm_name} accuracy:   {score * 100:0.3f}")
    print(f'{algorithm_name}\n',
          classification_report(y_test, prediction,
                                labels=class_labels,
                                target_names=['Fake', 'True']))

    # Compute the matrix once and plot that same matrix.
    cm = metrics.confusion_matrix(y_test, prediction, labels=class_labels)

    fig, ax = plot_confusion_matrix(conf_mat=cm,
                                    show_absolute=True,
                                    show_normed=True,
                                    colorbar=True,
                                    cmap=cmap)
    ax.set_title(f'{algorithm_name} Confusion Matrix')
    plt.show()

## Model Training

### Multinomial Naive Bayes

In [40]:
#Multinomial NB
from sklearn.naive_bayes import MultinomialNB

# Pipeline: bigram counts (ngram_range=(2, 2)) -> TF-IDF weighting -> multinomial Naive Bayes.
pipe = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            tokenizer=word_tokenize,
            stop_words=stop_words,
            ngram_range=(2, 2),
        ),
    ),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit on the training split.
MNB_model = pipe.fit(x_train, y_train)



In [41]:
#save the model
filename = 'Multinomial_NB.sav'
# A context manager closes (and flushes) the file as soon as the dump is done;
# the previous bare open() left the handle open for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(MNB_model, model_file)

### Support Vector Machine

In [43]:
#SVM
from sklearn.svm import LinearSVC

# Pipeline: bigram counts (ngram_range=(2, 2)) -> TF-IDF weighting -> linear SVM.
# random_state is fixed so repeated fits on the same data give the same model;
# it was previously left unset, unlike the seeded Random Forest and XGBoost
# pipelines in this notebook.
pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2, 2), tokenizer=word_tokenize, stop_words=stop_words)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=0))
])

# Fit on the training split.
SVM_model = pipe.fit(x_train, y_train)



In [44]:
#save the model
filename = 'SVM.sav'
# A context manager closes (and flushes) the file as soon as the dump is done;
# the previous bare open() left the handle open for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(SVM_model, model_file)

### Logistic Regression

In [45]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

# Pipeline: bigram counts (ngram_range=(2, 2)) -> TF-IDF weighting -> logistic regression
# with default settings.
pipe = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            tokenizer=word_tokenize,
            stop_words=stop_words,
            ngram_range=(2, 2),
        ),
    ),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

# Fit on the training split.
LR_model = pipe.fit(x_train, y_train)



In [46]:
#save the model
filename = 'Logistic_Regression.sav'
# A context manager closes (and flushes) the file as soon as the dump is done;
# the previous bare open() left the handle open for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(LR_model, model_file)

### Passive Aggressive Classifier

In [47]:
#Passive Aggressive Classifier
from sklearn.linear_model import PassiveAggressiveClassifier

# Pipeline: bigram counts (ngram_range=(2, 2)) -> TF-IDF weighting -> passive-aggressive classifier.
# random_state is fixed so repeated fits on the same data give the same model;
# it was previously left unset, unlike the seeded Random Forest and XGBoost
# pipelines in this notebook.
pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2, 2), tokenizer=word_tokenize, stop_words=stop_words)),
    ('tfidf', TfidfTransformer()),
    ('clf', PassiveAggressiveClassifier(random_state=0))
])

# Fit on the training split.
PAC_model = pipe.fit(x_train, y_train)



In [48]:
#save the model
filename = 'PAC.sav'
# A context manager closes (and flushes) the file as soon as the dump is done;
# the previous bare open() left the handle open for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(PAC_model, model_file)

### Random Forest

In [51]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

# Pipeline: bigram counts (ngram_range=(2, 2)) -> TF-IDF weighting -> random forest
# with 200 trees and a fixed seed.
pipe = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            tokenizer=word_tokenize,
            stop_words=stop_words,
            ngram_range=(2, 2),
        ),
    ),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=0)),
])

# Fit on the training split.
RF_model = pipe.fit(x_train, y_train)




In [52]:
#save the model
filename = 'Random_Forest.sav'
# A context manager closes (and flushes) the file as soon as the dump is done;
# the previous bare open() left the handle open for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(RF_model, model_file)

### Extreme Gradient Boost

In [18]:
#XG Boost
import xgboost as xg

# Pipeline: bigram counts (ngram_range=(2, 2)) -> TF-IDF weighting -> XGBoost classifier
# (binary logistic objective, 200 estimators, fixed seed).
pipe = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            tokenizer=word_tokenize,
            stop_words=stop_words,
            ngram_range=(2, 2),
        ),
    ),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        xg.XGBClassifier(
            objective='binary:logistic',
            n_estimators=200,
            seed=123,
        ),
    ),
])

# Fit on the training split.
XGB_model = pipe.fit(x_train, y_train)



In [19]:

#save the model
filename = 'XG_Boost.sav'
# A context manager closes (and flushes) the file as soon as the dump is done;
# the previous bare open() left the handle open for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(XGB_model, model_file)