In [1]:
import pandas as pd
import numpy as np

import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('data/labeled_clean.csv')

In [3]:
# Transform text to vector
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
)

X = vectorizer.fit_transform(df.comment)
y = df.toxic.to_numpy()

In [5]:
# Initiate params for random search
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Search for best params
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(
    estimator = rf, 
    param_distributions = random_grid, 
    n_iter = 100, 
    cv = 3, 
    scoring='roc_auc', 
    verbose=10, 
    random_state=42, 
    n_jobs = -1
    )

rf_random.fit(X, y)

In [23]:
params = rf_random.best_params_
params

{'n_estimators': 1800,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

In [4]:
# Fit the model
rf_ = RandomForestClassifier(
    n_estimators=1800, min_samples_split=2, 
    min_samples_leaf=2,
    max_depth=None, bootstrap=True)

rf_.fit(X,y)

In [5]:
# Save vectorizer and model
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))
pickle.dump(rf_, open('model.pkl', 'wb'))