## Fake News Classifer

**Project Link**: https://www.kaggle.com/c/fake-news/overview

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width: 100% !important; }</style>"))
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pylab import rcParams
rcParams['figure.figsize'] = 22, 7

import pandas as pd 
import numpy as np

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

import pickle
from datetime import datetime
#from joblib import Memory
import gzip

In [2]:
df = pd.read_csv('./data/train.csv')
df.head(2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


In [3]:
df = df.dropna().reset_index()
y = df['label']
messages = df.copy()
messages.shape, df.shape

((18285, 6), (18285, 6))

In [4]:
#memory = Memory(location='cachedir')
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__ (self):
        return None
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        corpus = []
        for i in range(len(X)):
            text = re.sub(r"[^a-zA-Z]", ' ', X['title'][i])
            text = text.lower().split()
            text = [PorterStemmer().stem(word) for word in text if word not in stopwords.words('english')]
            text = ' '.join(text)
            corpus.append(text)
        return corpus

In [5]:
corpus = Preprocessor().fit_transform(messages) 

In [6]:
corpus[0:4]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi']

In [7]:
y = messages['label']
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
X = cv.fit_transform(corpus).toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

## Naive Bayes Classifier

In [108]:
classifier=MultinomialNB()
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      2040
           1       0.87      0.91      0.89      1617

    accuracy                           0.90      3657
   macro avg       0.90      0.90      0.90      3657
weighted avg       0.90      0.90      0.90      3657



## Passive Aggressive Classifer

In [96]:
classifier=PassiveAggressiveClassifier()
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      2040
           1       0.90      0.93      0.91      1617

    accuracy                           0.92      3657
   macro avg       0.92      0.92      0.92      3657
weighted avg       0.92      0.92      0.92      3657



## Logistic Regression Classifier

In [8]:
classifier=LogisticRegression()
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94      2040
           1       0.89      0.97      0.93      1617

    accuracy                           0.94      3657
   macro avg       0.93      0.94      0.93      3657
weighted avg       0.94      0.94      0.94      3657



## Multinomial Classifier with Hyperparameter Tuning

In [105]:
clf_cv = GridSearchCV(MultinomialNB(), param_grid={'alpha': np.arange(0,1,0.1)}, cv=5, verbose=2).fit(X_train, y_train)
clf_cv

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] END ..........................................alpha=0.0; total time=   0.9s


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] END ..........................................alpha=0.0; total time=   0.9s


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] END ..........................................alpha=0.0; total time=   0.9s


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] END ..........................................alpha=0.0; total time=   0.9s


  'setting alpha = %.1e' % _ALPHA_MIN)


[CV] END ..........................................alpha=0.0; total time=   0.9s
[CV] END ..........................................alpha=0.1; total time=   0.9s
[CV] END ..........................................alpha=0.1; total time=   0.9s
[CV] END ..........................................alpha=0.1; total time=   0.9s
[CV] END ..........................................alpha=0.1; total time=   0.9s
[CV] END ..........................................alpha=0.1; total time=   0.9s
[CV] END ..........................................alpha=0.2; total time=   0.9s
[CV] END ..........................................alpha=0.2; total time=   0.9s
[CV] END ..........................................alpha=0.2; total time=   0.9s
[CV] END ..........................................alpha=0.2; total time=   0.9s
[CV] END ..........................................alpha=0.2; total time=   0.9s
[CV] END ..........................alpha=0.30000000000000004; total time=   0.9s
[CV] END ...................

GridSearchCV(cv=5, estimator=MultinomialNB(),
             param_grid={'alpha': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
             verbose=2)

In [106]:
clf_cv.best_estimator_, clf_cv.best_params_, clf_cv.best_score_

(MultinomialNB(alpha=0.8), {'alpha': 0.8}, 0.8984831776410724)

## Logistic Regression Hyperparameter tuning

In [111]:
%%time 
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
lr_cv = GridSearchCV(LogisticRegression(), param_grid= grid_values, cv=5, verbose=2).fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ................................C=0.001, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ................................C=0.001, penalty=l1; total time=   0.0s
[CV] END ................................C=0.001, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ................................C=0.001, penalty=l1; total time=   0.1s
[CV] END ................................C=0.001, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] END ................................C=0.001, penalty=l2; total time=   0.7s
[CV] END ................................C=0.001, penalty=l2; total time=   0.7s
[CV] END ................................C=0.001, penalty=l2; total time=   0.7s
[CV] END ................................C=0.001, penalty=l2; total time=   0.7s
[CV] END ................................C=0.001, penalty=l2; total time=   0.7s
[CV] END ................................C=0.009, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ................................C=0.009, penalty=l1; total time=   0.1s
[CV] END ................................C=0.009, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ................................C=0.009, penalty=l1; total time=   0.1s
[CV] END ................................C=0.009, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] END ................................C=0.009, penalty=l2; total time=   0.9s
[CV] END ................................C=0.009, penalty=l2; total time=   1.0s
[CV] END ................................C=0.009, penalty=l2; total time=   1.0s
[CV] END ................................C=0.009, penalty=l2; total time=   1.0s
[CV] END ................................C=0.009, penalty=l2; total time=   0.9s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END .................................C=0.01, penalty=l1; total time=   0.1s
[CV] END .................................C=0.01, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] END .................................C=0.01, penalty=l2; total time=   0.9s
[CV] END .................................C=0.01, penalty=l2; total time=   1.1s
[CV] END .................................C=0.01, penalty=l2; total time=   1.0s
[CV] END .................................C=0.01, penalty=l2; total time=   1.0s
[CV] END .................................C=0.01, penalty=l2; total time=   1.0s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END .................................C=0.09, penalty=l1; total time=   0.1s
[CV] END .................................C=0.09, penalty=l1; total time=   0.0s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END .................................C=0.09, penalty=l1; total time=   0.0s
[CV] END .................................C=0.09, penalty=l1; total time=   0.0s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] END .................................C=0.09, penalty=l1; total time=   0.1s
[CV] END .................................C=0.09, penalty=l2; total time=   1.5s
[CV] END .................................C=0.09, penalty=l2; total time=   1.5s
[CV] END .................................C=0.09, penalty=l2; total time=   1.5s
[CV] END .................................C=0.09, penalty=l2; total time=   1.6s
[CV] END .................................C=0.09, penalty=l2; total time=   1.6s
[CV] END ....................................C=1, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ....................................C=1, penalty=l1; total time=   0.1s
[CV] END ....................................C=1, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ....................................C=1, penalty=l1; total time=   0.1s
[CV] END ....................................C=1, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] END ....................................C=1, penalty=l2; total time=   2.7s
[CV] END ....................................C=1, penalty=l2; total time=   3.0s
[CV] END ....................................C=1, penalty=l2; total time=   2.9s
[CV] END ....................................C=1, penalty=l2; total time=   3.0s
[CV] END ....................................C=1, penalty=l2; total time=   3.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ....................................C=5, penalty=l1; total time=   0.1s
[CV] END ....................................C=5, penalty=l1; total time=   0.0s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ....................................C=5, penalty=l1; total time=   0.1s
[CV] END ....................................C=5, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] END ....................................C=5, penalty=l1; total time=   0.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ....................................C=5, penalty=l2; total time=   4.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ....................................C=5, penalty=l2; total time=   4.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ....................................C=5, penalty=l2; total time=   4.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ....................................C=5, penalty=l2; total time=   4.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ....................................C=5, penalty=l2; total time=   4.1s
[CV] END ...................................C=10, penalty=l1; total time=   0.0s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ...................................C=10, penalty=l1; total time=   0.1s
[CV] END ...................................C=10, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ...................................C=10, penalty=l1; total time=   0.1s
[CV] END ...................................C=10, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regre

[CV] END ...................................C=10, penalty=l2; total time=   4.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ...................................C=10, penalty=l2; total time=   4.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ...................................C=10, penalty=l2; total time=   4.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ...................................C=10, penalty=l2; total time=   4.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ...................................C=10, penalty=l2; total time=   4.6s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ...................................C=25, penalty=l1; total time=   0.1s
[CV] END ...................................C=25, penalty=l1; total time=   0.1s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] END ...................................C=25, penalty=l1; total time=   0.0s


Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\92304\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

[CV] END ...................................C=25, penalty=l1; total time=   0.1s
[CV] END ...................................C=25, penalty=l1; total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ...................................C=25, penalty=l2; total time=   4.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ...................................C=25, penalty=l2; total time=   4.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ...................................C=25, penalty=l2; total time=   4.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] END ...................................C=25, penalty=l2; total time=   4.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
        nan 0.91892307        nan 0.93204849        nan 0.92739978
        nan 0.92589581        nan 0.92316142]


[CV] END ...................................C=25, penalty=l2; total time=   4.3s
Wall time: 1min 59s


In [112]:
lr_cv.best_estimator_, lr_cv.best_score_

(LogisticRegression(C=1), 0.9320484895221737)

In [117]:
## Feature Names
feature_names = cv.get_feature_names()
feature_names[0:5], classifier.coef_[0][0:5]

(['abandon', 'abc', 'abc news', 'abduct', 'abe'],
 array([-0.33735634, -0.60565833, -0.22711495,  0.16059164, -0.27003038]))

In [121]:
#Getting the most real words
sorted(zip(classifier.coef_[0],feature_names), reverse=True)[0:20]

[(2.4782987610605876, 'comment'),
 (2.4166088004594903, 'hillari'),
 (1.627761122785367, 'imperi'),
 (1.6020719357214086, 'invis'),
 (1.5590711541704865, 'aleppo'),
 (1.4965925672048574, 'trump win'),
 (1.4640188922447002, 'subject'),
 (1.417285276530241, 'video'),
 (1.4162776100538461, 'muslim migrant'),
 (1.3842684598827577, 'clinton'),
 (1.3763318743472472, 'invad'),
 (1.295362244506122, 'us'),
 (1.2658612396752214, 'bill clinton'),
 (1.2324758630772117, 'stand rock'),
 (1.2286901755517763, 'break'),
 (1.2188033419638922, 'trump support'),
 (1.2026380119607867, 'homeless'),
 (1.1872352557423707, 'meddl'),
 (1.1745839552086768, 'duke'),
 (1.145339425513801, 'halloween')]

In [122]:
# Most Fake words
sorted(zip(classifier.coef_[0], feature_names), reverse=False)[:20]

[(-7.085126812289049, 'breitbart'),
 (-3.214056606598164, 'delingpol'),
 (-2.916017350358032, 'new york time'),
 (-2.916017350358032, 'york time'),
 (-2.663507362270221, 'hillari clinton'),
 (-2.184377852818593, 'cartel'),
 (-2.117084133893998, 'pope'),
 (-2.0870544456577664, 'migrant'),
 (-2.028932007436494, 'sanctuari'),
 (-2.018433206167187, 'virgil'),
 (-2.0058086935336954, 'inaugur'),
 (-1.9806705719220694, 'london'),
 (-1.9734763926803263, 'texa'),
 (-1.9528246933125304, 'islamist'),
 (-1.9353495981234055, 'macron'),
 (-1.819393252120621, 'ms'),
 (-1.7360061610054525, 'gorka'),
 (-1.7219435932650689, 'cent'),
 (-1.721083861642792, 'bill'),
 (-1.6821836756224455, 'border')]

## Making Submission with Logistic Regression

In [9]:
## will have to deal with the nans for submission
submission = pd.read_csv('./data/submit.csv')
submission

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1
...,...,...
5195,25995,0
5196,25996,1
5197,25997,0
5198,25998,1


In [10]:
test = pd.read_csv('./data/test.csv')
test

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [18]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18285 entries, 0 to 18284
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   18285 non-null  int64 
 1   id      18285 non-null  int64 
 2   title   18285 non-null  object
 3   author  18285 non-null  object
 4   text    18285 non-null  object
 5   label   18285 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 857.2+ KB


In [17]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [23]:
def transform_test(X):
    corpus = []
    for i in range(len(X)):
        text = re.sub("[^a-zA-Z]", ' ', X['title'][i])
        text = text.lower().split()
        text = [PorterStemmer().stem(word) for word in text if word not in stopwords.words('english')]
        text = ' '.join(text)
        corpus.append(text)
    return corpus

In [30]:
test.head(10)

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
5,20805,Trump is USA's antique hero. Clinton will be n...,,Trump is USA's antique hero. Clinton will be n...
6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori..."
7,20807,Weekly Featured Profile – Randy Shannon,Trevor Loudon,You are here: Home / *Articles of the Bound* /...
8,20808,Urban Population Booms Will Make Climate Chang...,,Urban Population Booms Will Make Climate Chang...
9,20809,,cognitive dissident,don't we have the receipt?


In [28]:
for i in range(len(test)):
    print(i)
    text = re.sub("[^a-zA-Z]", ' ', test['title'][i])
#cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
#X = cv.fit_transform(corpus).toarray()

0
1
2
3
4
5
6
7
8
9


TypeError: expected string or bytes-like object

In [13]:
## First apply the transformation to the test data as well then make the predictions
submission = classifier.predict(test)
filetime = datetime.now().strftime("%Y-%m-%d-%H-%m")
submission = pd.DataFrame({'id': test.index, 'label': submission})
submission.to_csv('Submissions/Submission'+filetime+'.csv')

ValueError: could not convert string to float: 'Specter of Trump Loosens Tongues, if Not Purse Strings, in Silicon Valley - The New York Times'