In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import pickle
from scipy.sparse import csr_matrix

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import TransformerMixin
from sklearn.base import BaseEstimator

class Stemmer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.port_stem = PorterStemmer()
        nltk.download('stopwords')
        nltk.download('punkt')

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        stemmed_X = []

        for content in X:
            stemmed_content = re.sub('[^a-zA-Z]',' ',content)
            stemmed_content = stemmed_content.lower()
            stemmed_content = stemmed_content.split()
            stemmed_content = [self.port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
            stemmed_content = ' '.join(stemmed_content)
            stemmed_X.append(stemmed_content)
        
        return np.asarray(stemmed_X).astype('U')
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
def loadModels():
    with open("./model/model.pkl", "rb") as f:
        model = pickle.load(f)

    with open("./model/vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)
    return model, vectorizer
model, vectorizer = loadModels()

In [6]:
vectorizer

In [4]:
test = pd.read_csv("./dataset/test.csv")
submit = pd.read_csv("./dataset/submit.csv")
test = test.merge(submit, left_on="id", right_on="id")

test['content'] = test['title'] + " " + test['text']
test

Unnamed: 0,id,title,author,text,label,content
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0,"Specter of Trump Loosens Tongues, if Not Purse..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0,#NoDAPL: Native American Leaders Vow to Stay A...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",1,"Tim Tebow Will Attempt Another Comeback, This ..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1,Keiser Report: Meme Wars (E995) 42 mins ago 1 ...
...,...,...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,0,The Bangladeshi Traffic Jam That Never Ends - ...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,1,John Kasich Signs One Abortion Bill in Ohio bu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,0,"California Today: What, Exactly, Is in Your Su..."
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...,1,300 US Marines To Be Deployed To Russian Borde...


In [10]:
test.iloc[1]['content'].replace('"', "'")

"Russian warships ready to strike terrorists near Aleppo Russian warships ready to strike terrorists near Aleppo 08.11.2016 | Source: Source: Mil.ru Attack aircraft of the Russian aircraft carrier Admiral Kuznetsov get ready to strike terrorists' positions in the vicinity of Aleppo, sources at the Russian Defense Ministry said, RBC reports. 'Insurgents' attempts to break into Aleppo from outside are meaningless,' the source said. The main task of the aircraft carrier aviation group is to strike missile and air blows on the terrorists , whose goal is to enter Aleppo. 'After the attacks on terrorists' positions, one will have to forget about the support for insurgents from the outside,' the source said. The Russian group in the Mediterranean Sea consists of the Admiral Kuznetsov aircraft carrier , the heavy nuclear missile cruiser Pyotr Velikiy (Peter the Great) and large anti-submarine ships Severomorsk and Vice-Admiral Kulakov. Russia has increased intelligence activities in Syria to e

In [31]:
# test.isna().sum()
test.dropna(subset=['text', 'content'], axis = 0, inplace=True)
test.isna().sum()

id           0
title        0
author     496
text         0
label        0
content      0
dtype: int64

In [32]:
def loadModels(modelDir, vectorizerDir):
    with open(modelDir, "rb") as f:
        model = pickle.load(f)

    with open(vectorizerDir, "rb") as f:
        vectorizer = pickle.load(f)
    return model, vectorizer

def predictLogistic(inputs : list[list]):
    model, vectorizer = loadModels( "./model/model.pkl", 
        "./model/vectorizer.pkl")

    stemmed_data = Stemmer().fit_transform(inputs)
    X = vectorizer.transform(stemmed_data.astype('U'))
    
    return model.predict(X)


In [44]:
dfRows = np.random.choice(test['id'], 100, replace=False) 
dfRows

array([24678, 22805, 21070, 22844, 24642, 21508, 24086, 22482, 21617,
       24144, 21789, 22169, 25899, 23913, 24029, 23113, 22932, 22936,
       22148, 23366, 23300, 25150, 21159, 25768, 22470, 22748, 21152,
       23032, 25814, 20990, 24508, 24065, 25476, 25765, 21580, 25199,
       21595, 24389, 21923, 23079, 23074, 23758, 25612, 20830, 22700,
       25453, 23107, 21048, 24170, 23709, 23819, 25712, 21176, 23053,
       24491, 24708, 21182, 21328, 23137, 21378, 22471, 22521, 24514,
       25089, 22324, 25016, 24095, 22531, 22162, 20932, 22391, 25350,
       23115, 21249, 23912, 25613, 23794, 21369, 22402, 21269, 21773,
       25084, 22599, 21770, 21129, 23469, 24562, 21194, 24325, 25807,
       21854, 21992, 23029, 23871, 22268, 25726, 25780, 23211, 23665,
       22616], dtype=int64)

In [48]:
remDf = test.query("id in @dfRows")
input = test['content'].values
target = test[dfRows]['label'].values

KeyError: "None of [Int64Index([24678, 22805, 21070, 22844, 24642, 21508, 24086, 22482, 21617,\n            24144, 21789, 22169, 25899, 23913, 24029, 23113, 22932, 22936,\n            22148, 23366, 23300, 25150, 21159, 25768, 22470, 22748, 21152,\n            23032, 25814, 20990, 24508, 24065, 25476, 25765, 21580, 25199,\n            21595, 24389, 21923, 23079, 23074, 23758, 25612, 20830, 22700,\n            25453, 23107, 21048, 24170, 23709, 23819, 25712, 21176, 23053,\n            24491, 24708, 21182, 21328, 23137, 21378, 22471, 22521, 24514,\n            25089, 22324, 25016, 24095, 22531, 22162, 20932, 22391, 25350,\n            23115, 21249, 23912, 25613, 23794, 21369, 22402, 21269, 21773,\n            25084, 22599, 21770, 21129, 23469, 24562, 21194, 24325, 25807,\n            21854, 21992, 23029, 23871, 22268, 25726, 25780, 23211, 23665,\n            22616],\n           dtype='int64')] are in the [columns]"

In [47]:
target.shape

(5071,)

In [43]:
preds = predictLogistic(input)
preds

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyboardInterrupt: 

In [None]:
print("Accuracy on Test set : ", sum(preds == target) / len(preds))