In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Multiclass classification using article titles
### (Establishment vs. Rightwing vs. Leftwing vs. Fake)
- Establish pipelines
    - Count Vectorizer —> Logistic Regression
    - TF-IDF Vectorizer —> Logistic Regression
- Grid Search to optimize hyperparameters
- Instantiate model for statistical inference

In [2]:
# Load the article titles. The text column is stored in the CSV under the
# name '0', so it is renamed to 'title'. The rename is chained (no
# `inplace=True`) and the no-op 'label' -> 'label' mapping is dropped.
title_df = (
    pd.read_csv('./data/prediction_data/title_data.csv')
      .rename(columns={'0': 'title'})
)

print(title_df.shape)
title_df.head()

(6000, 2)


Unnamed: 0,title,label
0,Taking a frigid plunge during a Siberian winter,1
1,Aleppo evacuations resume after brief delay,1
2,"Review: In ‘24: Legacy,’ Jack Bauer’s Gone. Th...",1
3,‘Let This Woman’s Brave Statement Peel the Bli...,1
4,"Trump’s new Treasury, Commerce nominees say no...",1


In [3]:
# Multiclass classification on titles: the title text is the only predictor
# and `label` is the target.
X, y = title_df['title'], title_df['label']

# Hold out a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [4]:
# Two candidate pipelines that differ only in the text vectorizer.
# The step names ('cvec', 'tvec', 'lr') are the prefixes used by the
# grid-search parameter keys in later cells.

# Raw term counts -> Logistic Regression
pipe_cvec = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression()),
])

# TF-IDF weights -> Logistic Regression
pipe_tvec = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression()),
])

### Grid Search using Count Vectorizer for multiclass classification

In [5]:
# Hyperparameter grid for the Count Vectorizer step of `pipe_cvec`
# (keys follow the '<step name>__<parameter>' convention)
cvec_params = dict(
    cvec__max_features=[350, 500, 650],
    cvec__min_df=[1, 2, 3],
    cvec__max_df=[0.6, 0.7],
    cvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    cvec__stop_words=[None, 'english'],
    cvec__lowercase=[True, False],
)

# Search every combination with 3-fold cross-validation on the training titles
gs_cvec = GridSearchCV(estimator=pipe_cvec, param_grid=cvec_params, cv=3)
gs_cvec.fit(X_train, y_train)









































































GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [6]:
# Score the Count Vectorizer grid search on the train and test titles
cvec_train_acc = gs_cvec.score(X_train, y_train)
cvec_test_acc = gs_cvec.score(X_test, y_test)

# Winning hyperparameters, followed by both accuracies
print(gs_cvec.best_params_)
print(f"Training accuracy: {cvec_train_acc}")
print(f"Testing accuracy: {cvec_test_acc}")

{'cvec__lowercase': False, 'cvec__max_df': 0.6, 'cvec__max_features': 500, 'cvec__min_df': 1, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': None}
Training accuracy: 0.6015555555555555
Testing accuracy: 0.4726666666666667


### Grid Search using TF-IDF Vectorizer for multiclass classification

In [7]:
# Hyperparameter grid for the TF-IDF Vectorizer step of `pipe_tvec`
# (keys follow the '<step name>__<parameter>' convention)
tvec_params = dict(
    tvec__max_features=[350, 500, 650],
    tvec__min_df=[1, 2, 3],
    tvec__max_df=[0.6, 0.7],
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tvec__stop_words=[None, 'english'],
    tvec__lowercase=[True, False],
)

# Search every combination with 3-fold cross-validation on the training titles
gs_tvec = GridSearchCV(estimator=pipe_tvec, param_grid=tvec_params, cv=3)
gs_tvec.fit(X_train, y_train)









































































GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [8]:
# Score the TF-IDF grid search on the train and test titles
tvec_train_acc = gs_tvec.score(X_train, y_train)
tvec_test_acc = gs_tvec.score(X_test, y_test)

# Winning hyperparameters, followed by both accuracies
print(gs_tvec.best_params_)
print(f"Training accuracy: {tvec_train_acc}")
print(f"Testing accuracy: {tvec_test_acc}")

{'tvec__lowercase': False, 'tvec__max_df': 0.6, 'tvec__max_features': 500, 'tvec__min_df': 1, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': None}
Training accuracy: 0.5951111111111111
Testing accuracy: 0.49066666666666664


### Statistical inference
Grid Search outputs the optimal model configuration (TF-IDF vectorizer; gs_tvec.best_params_), but in order to examine the confusion matrix or assess feature coefficients, we need direct access to the fitted vectorizer and Logistic Regression model configured with the parameters selected by Grid Search.

#### Examining confusion matrix
Using confusion matrix to calculate accuracy, misclassification, sensitivity, and precision.
#### Assessing feature coefficients for multiclass classification using article titles
Comparing correlation of various features with spectrum of source bias.

In [9]:
# Reuse the pipeline that Grid Search already refit on the full training set
# with its best parameters (GridSearchCV's default refit=True), instead of
# copying gs_tvec.best_params_ by hand: hand-copied values silently go stale
# whenever the grid or the data change, and the refit is redundant work.
best_tvec_pipe = gs_tvec.best_estimator_

# Fitted vectorizer and classifier, exposed under the names later cells use
tvec = best_tvec_pipe.named_steps['tvec']
lr = best_tvec_pipe.named_steps['lr']
model = lr

# Document-term matrices for the train and test titles
X_train_vec = tvec.transform(X_train)
X_test_vec = tvec.transform(X_test)



In [10]:
# Source names used to label the confusion matrix.
# NOTE(review): assumes the sorted label values map to these sources in this
# order -- confirm against the step that created the labels.
class_names = ['Establishment', 'Rightwing', 'Leftwing', 'Fake']

# Predict and score once; every metric below reuses these results
y_pred = model.predict(X_test_vec)
accuracy = model.score(X_test_vec, y_test)

# labels=model.classes_ pins the matrix to one row/column per trained class,
# so its shape always matches the names above
cm = pd.DataFrame(data=confusion_matrix(y_test, y_pred, labels=model.classes_),
                  index=[f'Actual {name} News' for name in class_names],
                  columns=[f'Predicted {name} News' for name in class_names])

# Macro averaging: unweighted mean of the per-class scores
sensitivity = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')

print(f'Accuracy score: {round(accuracy,3)}')
print(f'Misclassification rate: {round(1-accuracy, 3)}')
print(f'Sensitivity score: {round(sensitivity,3)}')
print(f'Precision score: {round(precision,3)}')
cm

Accuracy score: 0.491
Misclassification rate: 0.509
Sensitivity score: 0.49
Precision score: 0.489


Unnamed: 0,Predicted Establishment News,Predicted Rightwing News,Predicted Leftwing News,Predicted Fake News
Actual Establishment News,205,82,73,43
Actual Rightwing News,63,192,51,60
Actual Leftwing News,82,82,138,64
Actual Fake News,47,61,56,201


In [12]:
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; prefer the new method and fall back to the old one
# so the cell runs on both old and new releases.
feature_names_fn = getattr(tvec, 'get_feature_names_out', None) or tvec.get_feature_names

# One row per term, one column per class: after the transpose, columns 0-3
# are the rows of model.coef_ (one per class, in model.classes_ order).
coef = pd.DataFrame(model.coef_, columns=feature_names_fn()).T

# (coefficient column, source name used in the printed heading)
# NOTE(review): assumes class order establishment, rightwing, leftwing, fake -- confirm.
sources = [(0, 'establishment'), (1, 'rightwing'), (2, 'leftwing'), (3, 'fake')]
# (word used in the heading, sort order that puts those terms first)
directions = [('positively', False), ('negatively', True)]

sections = [(column, source, direction, ascending)
            for column, source in sources
            for direction, ascending in directions]

for position, (column, source, direction, ascending) in enumerate(sections):
    if position:
        # Blank line between sections, none before the first or after the last
        print()
    print(f'Terms highly and {direction} correlated with {source} sources:')
    print(coef.sort_values(by=column, ascending=ascending).head(10))

Terms highly and positively correlated with establishment sources:
                 0         1         2         3
Briefing  2.456804 -0.847440 -0.999112 -1.088706
his       2.247511 -1.767361 -0.797735 -0.560777
Facts     1.949528 -0.576758 -0.962258 -0.766639
that      1.687387 -1.053627 -0.369434 -0.713466
new       1.641440 -0.917555 -0.715055 -0.307351
most      1.529170 -0.510981 -0.770117 -0.423122
were      1.484220 -0.498057 -0.337276 -0.770196
China     1.437676 -0.222550 -0.403622 -0.909795
won       1.427950 -0.446691 -0.431155 -0.763770
during    1.378004 -0.484009 -0.762769 -0.259852

Terms highly and negatively correlated with establishment sources:
                0         1         2         3
To      -2.684965 -2.281914  1.625270  1.376339
Of      -2.367571 -1.681868  0.456440  1.785355
Hillary -2.003110  0.198835 -2.225123  2.507255
Will    -1.776914  1.054170 -0.706598  1.008542
Not     -1.775873  1.448655 -0.013106 -0.143004
Comment -1.742990 -1.658121 -0.816506 

# Multiclass classification using article bodies
### (Establishment vs. Rightwing vs. Leftwing vs. Fake)
- Grid Search to optimize hyperparameters
- Instantiate model for statistical inference (assessing feature coefficients)

In [13]:
# Load the article bodies. The text column is stored in the CSV under the
# name '0', so it is renamed to 'body'. The rename is chained (no
# `inplace=True`) and the no-op 'label' -> 'label' mapping is dropped.
body_df = (
    pd.read_csv('./data/prediction_data/body_data.csv')
      .rename(columns={'0': 'body'})
)

print(body_df.shape)
body_df.head()

(600, 2)


Unnamed: 0,body,label
0,"People of all ages gather in Krasnoyarsk,...",1
1,The latest evacuations from eastern Aleppo re...,1
2,"Until the Trump presidency became a reality, t...",1
3,Here are the top 10 comments of the week on ou...,1
4,Donald Trump’s nominee to lead the ...,1


In [14]:
# Multiclass classification on bodies: the body text is the only predictor
# and `label` is the target. This rebinds X / y and the split variables
# that the title section used.
X, y = body_df['body'], body_df['label']

# Hold out a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [15]:
# Hyperparameter grid for the Count Vectorizer step of `pipe_cvec`
# (keys follow the '<step name>__<parameter>' convention)
cvec_params = dict(
    cvec__max_features=[350, 500, 650],
    cvec__min_df=[1, 2, 3],
    cvec__max_df=[0.6, 0.7],
    cvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    cvec__stop_words=[None, 'english'],
    cvec__lowercase=[True, False],
)

# Search every combination with 3-fold cross-validation on the training bodies
gs_cvec = GridSearchCV(estimator=pipe_cvec, param_grid=cvec_params, cv=3)
gs_cvec.fit(X_train, y_train)











































































GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [16]:
# Score the Count Vectorizer grid search on the train and test bodies
cvec_train_acc = gs_cvec.score(X_train, y_train)
cvec_test_acc = gs_cvec.score(X_test, y_test)

# Winning hyperparameters, followed by both accuracies
print(gs_cvec.best_params_)
print(f"Training accuracy: {cvec_train_acc}")
print(f"Testing accuracy: {cvec_test_acc}")

{'cvec__lowercase': False, 'cvec__max_df': 0.7, 'cvec__max_features': 500, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': None}
Training accuracy: 0.9977777777777778
Testing accuracy: 0.52


### Grid Search using TF-IDF Vectorizer for multiclass classification

In [17]:
# Hyperparameter grid for the TF-IDF Vectorizer step of `pipe_tvec`
# (keys follow the '<step name>__<parameter>' convention)
tvec_params = dict(
    tvec__max_features=[350, 500, 650],
    tvec__min_df=[1, 2, 3],
    tvec__max_df=[0.6, 0.7],
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tvec__stop_words=[None, 'english'],
    tvec__lowercase=[True, False],
)

# Search every combination with 3-fold cross-validation on the training bodies
gs_tvec = GridSearchCV(estimator=pipe_tvec, param_grid=tvec_params, cv=3)
gs_tvec.fit(X_train, y_train)











































































GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [18]:
# Score the TF-IDF grid search on the train and test bodies
tvec_train_acc = gs_tvec.score(X_train, y_train)
tvec_test_acc = gs_tvec.score(X_test, y_test)

# Winning hyperparameters, followed by both accuracies
print(gs_tvec.best_params_)
print(f"Training accuracy: {tvec_train_acc}")
print(f"Testing accuracy: {tvec_test_acc}")

{'tvec__lowercase': False, 'tvec__max_df': 0.7, 'tvec__max_features': 500, 'tvec__min_df': 1, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': None}
Training accuracy: 0.8355555555555556
Testing accuracy: 0.5133333333333333


### Statistical inference
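Grid Search found the Count Vectorizer pipeline (gs_cvec.best_params_) to score marginally higher on the test set (0.520 vs. 0.513), but, as in the title analysis, the cells below examine the TF-IDF pipeline (gs_tvec.best_params_). In order to examine the confusion matrix or assess feature coefficients, we need direct access to the fitted vectorizer and Logistic Regression model configured with the parameters selected by Grid Search.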

#### Examining confusion matrix
Using confusion matrix to calculate accuracy, misclassification, sensitivity, and precision.
#### Assessing feature coefficients for multiclass classification using article bodies
Comparing correlation of various features with spectrum of source bias.

In [22]:
# Reuse the pipeline that Grid Search already refit on the full training set
# with its best parameters (GridSearchCV's default refit=True), instead of
# copying gs_tvec.best_params_ by hand: hand-copied values silently go stale
# whenever the grid or the data change, and the refit is redundant work.
best_tvec_pipe = gs_tvec.best_estimator_

# Fitted vectorizer and classifier, exposed under the names later cells use
tvec = best_tvec_pipe.named_steps['tvec']
lr = best_tvec_pipe.named_steps['lr']
model = lr

# Document-term matrices for the train and test bodies
X_train_vec = tvec.transform(X_train)
X_test_vec = tvec.transform(X_test)



In [23]:
# Source names used to label the confusion matrix.
# NOTE(review): assumes the sorted label values map to these sources in this
# order -- confirm against the step that created the labels.
class_names = ['Establishment', 'Rightwing', 'Leftwing', 'Fake']

# Predict and score once; every metric below reuses these results
y_pred = model.predict(X_test_vec)
accuracy = model.score(X_test_vec, y_test)

# labels=model.classes_ pins the matrix to one row/column per trained class,
# so its shape always matches the names above
cm = pd.DataFrame(data=confusion_matrix(y_test, y_pred, labels=model.classes_),
                  index=[f'Actual {name} News' for name in class_names],
                  columns=[f'Predicted {name} News' for name in class_names])

# Macro averaging: unweighted mean of the per-class scores
sensitivity = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')

print(f'Accuracy score: {round(accuracy,3)}')
print(f'Misclassification rate: {round(1-accuracy, 3)}')
print(f'Sensitivity score: {round(sensitivity,3)}')
print(f'Precision score: {round(precision,3)}')
cm

Accuracy score: 0.513
Misclassification rate: 0.487
Sensitivity score: 0.524
Precision score: 0.533


Unnamed: 0,Predicted Establishment News,Predicted Rightwing News,Predicted Leftwing News,Predicted Fake News
Actual Establishment News,18,4,15,8
Actual Rightwing News,9,17,8,3
Actual Leftwing News,4,4,20,4
Actual Fake News,4,2,8,22


In [21]:
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; prefer the new method and fall back to the old one
# so the cell runs on both old and new releases.
feature_names_fn = getattr(tvec, 'get_feature_names_out', None) or tvec.get_feature_names

# One row per term, one column per class: after the transpose, columns 0-3
# are the rows of model.coef_ (one per class, in model.classes_ order).
coef = pd.DataFrame(model.coef_, columns=feature_names_fn()).T

# (coefficient column, source name used in the printed heading)
# NOTE(review): assumes class order establishment, rightwing, leftwing, fake -- confirm.
sources = [(0, 'establishment'), (1, 'rightwing'), (2, 'leftwing'), (3, 'fake')]
# (word used in the heading, sort order that puts those terms first)
directions = [('positively', False), ('negatively', True)]

sections = [(column, source, direction, ascending)
            for column, source in sources
            for direction, ascending in directions]

for position, (column, source, direction, ascending) in enumerate(sections):
    if position:
        # Blank line between sections, none before the first or after the last
        print()
    print(f'Terms highly and {direction} correlated with {source} sources:')
    print(coef.sort_values(by=column, ascending=ascending).head(10))

Terms highly and positively correlated with establishment sources:
                    0         1         2         3
Mr           2.818486 -0.808498 -1.582743 -1.045474
Monday       1.066527 -0.037413 -0.312851 -0.775528
Wednesday    1.011014 -0.437375 -0.287422 -0.353285
according    0.996735 -0.343435 -0.183699 -0.499885
Dr           0.993322 -0.219310 -0.425396 -0.387348
May          0.955830 -0.338014  0.116834 -0.754777
officials    0.902609 -0.647535  0.356128 -0.681518
Republicans  0.840304 -0.340291 -0.208001 -0.299483
would        0.817233 -0.759517  0.690306 -0.801177
Senate       0.815659 -0.531486  0.472701 -0.796501

Terms highly and negatively correlated with establishment sources:
                  0         1         2         3
US        -1.203426 -1.588812  0.518421  1.808371
2016      -0.920565 -0.384208 -0.917088  1.794896
News      -0.901620  1.574641 -0.705356 -0.146932
Breitbart -0.887800  2.624485 -1.049094 -0.966973
2017      -0.687933  0.829718  0.176078 -0.