In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Binary classification using article titles
### (Real vs. Fake)
- Establish pipelines
    - Count Vectorizer —> Logistic Regression
    - TF-IDF Vectorizer —> Logistic Regression
- Grid Search to optimize hyperparameters
- Instantiate model for statistical inference (assessing feature coefficients)

In [2]:
# Reading in title DataFrame
title_df = pd.read_csv('./data/prediction_data/binary_title_df.csv')

# Renaming '0' column to 'title'
title_df.rename(columns={'0':'title', 'label':'label'}, inplace=True)

print(title_df.shape)
title_df.head()

(6000, 2)


Unnamed: 0,title,label
0,Taking a frigid plunge during a Siberian winter,0
1,Aleppo evacuations resume after brief delay,0
2,"Review: In ‘24: Legacy,’ Jack Bauer’s Gone. Th...",0
3,‘Let This Woman’s Brave Statement Peel the Bli...,0
4,"Trump’s new Treasury, Commerce nominees say no...",0


In [3]:
# Using titles for binary classification

# Setting up X and y
X = title_df['title']
y = title_df['label']

# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
# Establishing pipeline for Count Vectorizer
pipe_cvec = Pipeline([('cvec', CountVectorizer()),
                      ('lr', LogisticRegression())
                     ])
# Establishing pipeline for TF-IDF Vectorizer
pipe_tvec = Pipeline([('tvec', TfidfVectorizer()),
                      ('lr', LogisticRegression())
                     ])

### Grid Search using Count Vectorizer for binary classification (article title)

In [5]:
# Establishing Count Vectorizer parameter map for Grid Search
cvec_params = {
    'cvec__max_features': [350, 500, 650],
    'cvec__min_df': [1, 2, 3],
    'cvec__max_df': [.6, .7],
    'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'cvec__stop_words': [None, 'english'],
    'cvec__lowercase': [True, False]
}

# Conducting Grid Search
gs_cvec_titles = GridSearchCV(pipe_cvec, cvec_params, cv=3)
gs_cvec_titles.fit(X_train, y_train)



































GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [6]:
print(gs_cvec_titles.best_params_)
print(f"Training accuracy: {gs_cvec_titles.score(X_train, y_train)}")
print(f"Testing accuracy: {gs_cvec_titles.score(X_test, y_test)}")

{'cvec__lowercase': False, 'cvec__max_df': 0.6, 'cvec__max_features': 650, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training accuracy: 0.7753333333333333
Testing accuracy: 0.6986666666666667


### Grid Search using TF-IDF vectorizer for binary classification (article title)

In [7]:
# Establishing TF-IDF Vectorizer parameter map for Grid Search
tvec_params = {
    'tvec__max_features': [350, 500, 650],
    'tvec__min_df': [1, 2, 3],
    'tvec__max_df': [.6,.7],
    'tvec__ngram_range': [(1,1),(1,2),(1,3)],
    'tvec__stop_words': [None, 'english'],
    'tvec__lowercase': [True, False]
}

# Conducting Grid Search
gs_tvec_titles = GridSearchCV(pipe_tvec, tvec_params, cv=3)
gs_tvec_titles.fit(X_train, y_train)



































GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [8]:
print(gs_tvec_titles.best_params_)
print(f"Training accuracy: {gs_tvec_titles.score(X_train, y_train)}")
print(f"Testing accuracy: {gs_tvec_titles.score(X_test, y_test)}")

{'tvec__lowercase': False, 'tvec__max_df': 0.6, 'tvec__max_features': 650, 'tvec__min_df': 1, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': None}
Training accuracy: 0.7813333333333333
Testing accuracy: 0.722


### Statistical inference
Grid Search outputs optimal model configuration (TF-IDF vectorizer; gs_tvec.best_params_), but in order to examine the confusion matrix or assess feature coefficients, we must instantiate the model with configurations output from Grid Search.

#### Examining confusion matrix
Using confusion matrix to calculate accuracy, misclassification, sensitivity, and precision.
#### Assessing feature coefficients for multiclass classification using article titles
Comparing correlation of various features with spectrum of source bias.

In [9]:
tvec = TfidfVectorizer(stop_words=None, 
                       max_df=0.6,
                       min_df=1, 
                       ngram_range=(1,1),
                       lowercase=False,
                       max_features=650)

X_train_vec = tvec.fit_transform(X_train)
X_test_vec = tvec.transform(X_test)

lr = LogisticRegression()
model_titles = lr.fit(X_train_vec, y_train)



In [11]:
cm = pd.DataFrame(data=confusion_matrix(y_test, model_titles.predict(X_test_vec)),
                  index=['Actual Real News',
                         'Actual Fake News'],
                  columns=['Predicted Real News',
                           'Predicted Fake News'])

sensitivity = recall_score(y_test, model_titles.predict(X_test_vec), average='macro')
precision = precision_score(y_test, model_titles.predict(X_test_vec), average='macro')
specificity = cm['Predicted Real News']['Actual Real News'] / (cm['Predicted Real News']['Actual Real News'] + cm['Predicted Fake News']['Actual Real News'])

print(f'Accuracy score: {round(model_titles.score(X_test_vec, y_test),3)}')
print(f'Misclassification rate: {round(1-model_titles.score(X_test_vec, y_test), 3)}')
print(f'Sensitivity score: {round(sensitivity,3)}')
print(f'Specificity score: {round(specificity,3)}')
print(f'Precision score: {round(precision,3)}')
cm

Accuracy score: 0.722
Misclassification rate: 0.278
Sensitivity score: 0.722
Specificity score: 0.73
Precision score: 0.722


Unnamed: 0,Predicted Real News,Predicted Fake News
Actual Real News,561,208
Actual Fake News,209,522


In [12]:
coef_titles = pd.DataFrame(model_titles.coef_, columns=tvec.get_feature_names()).T
print('Terms highly and positively correlated with fake sources:')
print(coef_titles.sort_values(by=0, ascending=False).head(10))
print()
print('Terms highly and negatively correlated with real sources:')
print(coef_titles.sort_values(by=0, ascending=True).head(10))

Terms highly and positively correlated with fake sources:
                 0
Comment   3.655646
FBI       2.599875
Election  2.565177
BREAKING  2.465033
Hillary   2.430882
Re        2.355321
And       2.270423
Syria     1.982700
IN        1.953516
War       1.809067

Terms highly and negatively correlated with real sources:
                 0
GOP      -2.627008
Cruz     -2.500927
Atlantic -2.304479
review   -1.789447
Rubio    -1.780425
police   -1.710988
says     -1.710490
Attack   -1.695039
House    -1.663450
Sanders  -1.663070


# Binary classification using article bodies
### (Real vs. Fake)
- Grid Search to optimize hyperparameters
- Instantiate model for statistical inference

In [13]:
# Reading in title DataFrame
body_df = pd.read_csv('./data/prediction_data/binary_body_df.csv')

# Renaming '0' column to 'title'
body_df.rename(columns={'0':'body', 'label':'label'}, inplace=True)

print(body_df.shape)
body_df.head()

(600, 2)


Unnamed: 0,body,label
0,"People of all ages gather in Krasnoyarsk,...",0
1,The latest evacuations from eastern Aleppo re...,0
2,"Until the Trump presidency became a reality, t...",0
3,Here are the top 10 comments of the week on ou...,0
4,Donald Trump’s nominee to lead the ...,0


In [15]:
# Using bodies for binary classification

# Setting up X and y
X = body_df['body']
y = body_df['label']

# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Grid Search using Count Vectorizer for binary classification (article body)

In [17]:
# Establishing Count Vectorizer parameter map for Grid Search
cvec_params = {
    'cvec__max_features': [350, 500, 650],
    'cvec__min_df': [1, 2, 3],
    'cvec__max_df': [.6, .7],
    'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'cvec__stop_words': [None, 'english'],
    'cvec__lowercase': [True, False]
}

# Conducting Grid Search
gs_cvec_bodies = GridSearchCV(pipe_cvec, cvec_params, cv=3)
gs_cvec_bodies.fit(X_train, y_train)





































GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [18]:
print(gs_cvec_bodies.best_params_)
print(f"Training accuracy: {gs_cvec_bodies.score(X_train, y_train)}")
print(f"Testing accuracy: {gs_cvec_bodies.score(X_test, y_test)}")

{'cvec__lowercase': False, 'cvec__max_df': 0.6, 'cvec__max_features': 650, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Training accuracy: 0.9977777777777778
Testing accuracy: 0.78


### Grid Search using TF-IDF Vectorizer for binary classification (article body)

In [19]:
# Establishing TF-IDF Vectorizer parameter map for Grid Search
tvec_params = {
    'tvec__max_features': [350, 500, 650],
    'tvec__min_df': [1, 2, 3],
    'tvec__max_df': [.6,.7],
    'tvec__ngram_range': [(1,1),(1,2),(1,3)],
    'tvec__stop_words': [None, 'english'],
    'tvec__lowercase': [True, False]
}

# Conducting Grid Search
gs_tvec = GridSearchCV(pipe_tvec, tvec_params, cv=3)
gs_tvec.fit(X_train, y_train)





































GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [20]:
print(gs_tvec.best_params_)
print(f"Training accuracy: {gs_tvec.score(X_train, y_train)}")
print(f"Testing accuracy: {gs_tvec.score(X_test, y_test)}")

{'tvec__lowercase': True, 'tvec__max_df': 0.6, 'tvec__max_features': 650, 'tvec__min_df': 1, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': None}
Training accuracy: 0.8888888888888888
Testing accuracy: 0.8


### Statistical inference
Grid Search outputs optimal model configuration (TF-IDF vectorizer; gs_tvec.best_params_), but in order to examine the confusion matrix or assess feature coefficients, we must instantiate the model with configurations output from Grid Search.

#### Examining confusion matrix
Using confusion matrix to calculate accuracy, misclassification, sensitivity, and precision.
#### Assessing feature coefficients for multiclass classification using article titles
Comparing correlation of various features with spectrum of source bias.

In [21]:
tvec = TfidfVectorizer(stop_words=None, 
                       max_df=0.6,
                       min_df=1, 
                       ngram_range=(1,2),
                       lowercase=True,
                       max_features=650)

X_train_vec = tvec.fit_transform(X_train)
X_test_vec = tvec.transform(X_test)

lr = LogisticRegression()
model = lr.fit(X_train_vec, y_train)



In [30]:
cm = pd.DataFrame(data=confusion_matrix(y_test, model.predict(X_test_vec)),
                  index=['Actual Real News',
                         'Actual Fake News'],
                  columns=['Predicted Real News',
                           'Predicted Fake News'])

sensitivity = recall_score(y_test, model.predict(X_test_vec), average='macro')
precision = cm['Predicted Fake News']['Actual Fake News'] / (cm['Predicted Fake News']['Actual Fake News'] + cm['Predicted Fake News']['Actual Real News'])
specificity = cm['Predicted Real News']['Actual Real News'] / (cm['Predicted Real News']['Actual Real News'] + cm['Predicted Fake News']['Actual Real News'])

print(f'Accuracy score: {round(model.score(X_test_vec, y_test),3)}')
print(f'Misclassification rate: {round(1-model.score(X_test_vec, y_test), 3)}')
print(f'Sensitivity score: {round(sensitivity,3)}')
print(f'Specificity score: {round(specificity,3)}')
print(f'Precision score: {round(precision,3)}')
cm

Accuracy score: 0.8
Misclassification rate: 0.2
Sensitivity score: 0.803
Specificity score: 0.768
Precision score: 0.75


Unnamed: 0,Predicted Real News,Predicted Fake News
Actual Real News,63,19
Actual Fake News,11,57


In [32]:
coef = pd.DataFrame(model.coef_, columns=tvec.get_feature_names()).T
print('Terms highly and positively correlated with fake sources:')
print(coef.sort_values(by=0, ascending=False).head(10))
print()
print('Terms highly and positively correlated with real sources:')
print(coef.sort_values(by=0, ascending=True).head(10))

Terms highly and positively correlated with fake sources:
                 0
hillary   1.402551
russia    1.174578
war       1.049599
2016      0.996690
на        0.995922
pipeline  0.966120
october   0.931956
the us    0.764632
source    0.748704
russian   0.733536

Terms highly and positively correlated with real sources:
                  0
said      -2.392957
his       -1.500128
team      -1.279352
mr        -1.103506
last      -1.053808
more      -0.947087
trump     -0.946599
president -0.902754
speech    -0.872641
cruz      -0.862645
