In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.testing import all_estimators

from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, CategoricalNB, ComplementNB
from sklearn.linear_model import LogisticRegression, SGDClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
import joblib
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from statistics import mean

def get_scores(X, model=None):
    """Turn the ensemble's per-classifier outputs into coarse bias labels.

    ``model.transform(X)`` is expected to yield one flat row per sample.
    The values at odd positions (1, 3, 5, ...) of each row are averaged,
    rescaled from [0, 1] to [-2, 2] and bucketed by sign and magnitude.

    NOTE(review): with the soft-voting, ``flatten_transform=True``
    VotingClassifier built in this file, the odd positions are presumably
    each classifier's probability for the second class of a two-class
    problem -- confirm before using this with more than two classes.

    Args:
        X: Samples accepted by ``model.transform``.
        model: Fitted object exposing ``transform``. Defaults to the
            module-level ``vote_model`` so existing callers keep working.

    Returns:
        A list with one ``[side, strength]`` pair per sample, where ``side``
        is ``'left'`` (score < 0) or ``'right'`` and ``strength`` is ``0``,
        ``0.3`` or ``1``.
    """
    if model is None:
        # Looked up at call time: the global is only assigned after this
        # function has been defined.
        model = vote_model

    scores = []
    for row in model.transform(X):
        # Same elements the original index loop picked: row[1], row[3], ...
        score = mean(row[1::2]) * 4 - 2
        side = 'left' if score < 0 else 'right'

        # The thresholds are symmetric around zero, so bucket on magnitude.
        magnitude = abs(score)
        if magnitude < 0.5:
            strength = 0
        elif magnitude < 1.1:
            strength = 0.3
        else:
            strength = 1
        scores.append([side, strength])
    return scores


# Train a soft-voting text classifier on 'binary_bias.csv', persist it, and
# report its accuracy on a held-out half of the data.
articles = pd.read_csv('binary_bias.csv')

# Shuffle the rows (frac=1 keeps every row, in random order).
articles = articles.sample(frac=1)

stemmer = SnowballStemmer('english')
words = stopwords.words("english")
# Set copy for O(1) membership tests; `words` itself stays a list.
stopword_set = set(words)


def clean_article(text):
    """Return *text* reduced to space-joined stems of its non-stop-words.

    Non-letters are replaced by spaces and the text is lowercased BEFORE the
    stop-word filter. NLTK's English stop-word list is lowercase, so filtering
    on the original casing let capitalised stop words ("The", "And", ...)
    through.
    """
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(stemmer.stem(tok) for tok in tokens if tok not in stopword_set)


articles['cleaned'] = articles['article'].apply(clean_article)

# Half of the data is held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(articles['cleaned'], articles['bias'], test_size=0.5)

### CLASSIFIER ###

estimator = []
# Alternative estimators, currently disabled:
# estimator.append(('RC', CategoricalNB()))
# estimator.append(('LSVC', ComplementNB()))
estimator.append(('MNB', MultinomialNB()))
estimator.append(('BNB', BernoulliNB()))
estimator.append(('LR', LogisticRegression()))
# NOTE(review): labelled 'SGD' but the estimator is an SVC. The label is left
# unchanged because it becomes part of the fitted model's parameter names.
estimator.append(('SGD', SVC(probability=True)))

# TF-IDF over uni- and bi-grams -> keep the 10000 best-scoring features ->
# soft-voting ensemble. flatten_transform=True is what get_scores() relies on.
pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words="english")),
                     ('best',  SelectKBest(k=10000)),
                     ('model', VotingClassifier(estimators = estimator, voting ='soft', flatten_transform=True))])

vote_model = pipeline.fit(X_train, y_train)
joblib.dump(vote_model, 'vote_model.joblib')

print("Accuracy: " + str(round(vote_model.score(X_test, y_test)*100, 2)) + "%")

scores = get_scores(X_test)

# Map the 'left'/'right' labels back to 0/1 for comparison with y_test.
# NOTE(review): assumes the 'bias' column encodes left as 0 and right as 1 -- confirm.
binary = [0 if s[0] == 'left' else 1 for s in scores]

# Left as a bare expression so a notebook cell displays the value.
accuracy_score(y_test, binary)



Accuracy: 67.79%


0.6779279279279279