In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split,StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

In [None]:
# Read the article dataset from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='CrimeVsNoCrimeArticles.csv')

# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

In [None]:
# Rows with a missing title cannot be vectorised, so drop them up front.
df = df.dropna(subset=['title'])

# Features are the raw title strings; the target is the 'is_crime_report' column.
x = df['title']
y = df['is_crime_report']

# Hold out 20% for the final evaluation. Stratifying on the label keeps the
# class balance the same in both partitions, matching the StratifiedKFold
# used for cross-validation later in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=.2,
    random_state=42,
    stratify=y,
)

print(f'the length of training data is {len(x_train)}')
print(f'the length of testing data is {len(x_test)}')

In [None]:
# Base learners for the stacking ensemble, as (name, estimator) pairs.
best_estimators = [
    ("nb", ComplementNB()),
    # Seed the tree so repeated runs of the notebook give the same model;
    # 42 matches the seed used for the data split and the CV folds.
    ("dt", DecisionTreeClassifier(random_state=42)),
]

# Meta-learner that combines the base learners' outputs.
final_estimator = LogisticRegression()

In [None]:
# Stacking ensemble: pass the estimator objects themselves (not their names
# as strings). With passthrough=True the final estimator is trained on the
# base learners' predictions together with the original features.
model = StackingClassifier(
    estimators=best_estimators,
    final_estimator=final_estimator,
    cv=5,
    n_jobs=1,
    passthrough=True,
)

# Grid keys follow '<pipeline step name>__<parameter name>'. The step below is
# registered as 'vectorizer' (lowercase), and the TF-IDF option is spelled
# 'sublinear_tf'; any mismatch makes GridSearchCV reject the grid.
param_grid = {
    'vectorizer__max_features': [5000, 8000],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__sublinear_tf': [True, False],
}

vectorizer = TfidfVectorizer(stop_words='english')

# Vectorise the raw titles, then feed the TF-IDF matrix to the stacked model.
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('model', model),
])

# Shuffled, seeded, stratified folds for the hyperparameter search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(pipeline, param_grid, cv=cv, verbose=1, n_jobs=1)
grid_search.fit(x_train, y_train)

print(f'best parameters found {grid_search.best_params_}')
print(f'best score {grid_search.best_score_}')


In [None]:
# Predict on the held-out titles. GridSearchCV exposes `predict` (there is no
# `pred` method); it delegates to the best pipeline refitted on the training set.
y_pred = grid_search.predict(x_test)

# Overall accuracy on the test split, reported as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(f'Improved accuracy with hyperparameter tuning: {accuracy*100:.2f}%')

# Per-class precision / recall / F1 breakdown.
print('classification_report')
print(classification_report(y_test, y_pred))

In [None]:
import pickle

# Persist the entire fitted search object (not just the best pipeline) so it
# can be reloaded later. Only unpickle this file from a trusted source.
with open('stacking_model.pkl', mode='wb') as model_file:
    pickle.dump(grid_search, model_file)