In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Load the article dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='CrimeVsNoCrimeArticles.csv')
# Show the first five rows as this cell's output.
df.head(n=5)

In [None]:
# Drop rows that have no title: the title text is the only model input.
df = df.dropna(subset=['title'])
x = df['title']
y = df['is_crime_report']

# Hold out 20% of the rows for the final evaluation.
# stratify=y keeps the class proportions equal in the train and test splits,
# matching the StratifiedKFold used for cross-validation; random_state pins
# the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.2, random_state=42, stratify=y
)

# print() returns None, so its result is deliberately not bound to a name.
print(f'training data size: {len(x_train)}')
print(f'test data size: {len(x_test)}')

In [None]:
# Pipeline: TF-IDF features from the raw title text -> decision tree classifier.
# random_state is fixed because the tree breaks ties between equally good
# splits at random; without it the tuned model and its scores can differ
# between runs even though the data split and CV folds are seeded.
vectorizer = TfidfVectorizer(stop_words='english')
model = DecisionTreeClassifier(random_state=42)
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('model', model),
])

# Search space. Keys use the '<step name>__<parameter>' convention so that
# GridSearchCV routes each value to the matching pipeline step.
# 3 * 2 * 2 * 3 * 3 * 2 = 216 candidate settings.
param_grid = {
    'vectorizer__max_features': [5000, 8000, 10000],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__sublinear_tf': [True, False],

    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__criterion': ['gini', 'entropy'],
}

# 5-fold stratified CV with a fixed shuffle; 216 candidates * 5 folds = 1080
# fits, run sequentially (n_jobs=1).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, verbose=1, n_jobs=1)

# Fitting on raw text keeps the vectorizer inside each CV fold, so the
# vocabulary is learned only from that fold's training portion.
grid_search.fit(x_train, y_train)
print("Best parameters found: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

In [None]:
# Score the tuned pipeline on the held-out test split.
y_pred = grid_search.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

# The original message had no separator between the label and the number
# (it printed e.g. "...tuning95.00%"); ": " is added so the output is readable.
print(f'improved accuracy with hyperparameter tuning: {accuracy * 100:.2f}%')
print('classification_report:')
print(classification_report(y_test, y_pred))

In [11]:
import pickle

# Persist the fitted GridSearchCV object (not just the best estimator) to disk.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
serialized_search = pickle.dumps(grid_search)
with open('decision_tree.pkl', mode='wb') as model_file:
    model_file.write(serialized_search)