# Loading Data / Preprocessing / Train-Test Split

In [1]:
import pandas as pd
import spacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from spacy.tokenizer import Tokenizer

nlp = spacy.load('en_core_web_lg')
tokenizer = Tokenizer(nlp.vocab)
                                                    
df = pd.read_csv('data/datasets/cleaned_data.csv')

In [2]:
# Extending stop words relative to our use case (technical support subreddits)
STOP_WORDS = nlp.Defaults.stop_words.union(["doesnt", "wont", "cant"])

In [3]:
# Tokenizing

tokens = []

for doc in tokenizer.pipe(df['Text'].astype('unicode')):
    
    doc_tokens = []
    
    for token in doc:
        if (token.text.lower() not in STOP_WORDS) & (token.is_punct == False):
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
        
df['Tokens'] = tokens

# Models

## Random Forest Classifier

### Baseline Model

In [5]:
vect = TfidfVectorizer(stop_words=STOP_WORDS)
rfc = RandomForestClassifier()

rfc_pipe = Pipeline([('vect', vect), ('clf', rfc)])

rfc_pipe.fit(df['Text'], df['Subreddit'])

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
rfc_baseline = rfc_pipe.evaluate(X_test, y_test)

### Hyperparameter Tuning

#### **tf-idf Vectorizer**

- #### ngram_range

In [None]:
parameters = {'vect__ngram_range': [(1,2), (1,3), (1,4), (1,5), (2, 4), (2,5)]}

grid_search = GridSearchCV(rfc_pipe, parameters, cv=4, n_jobs=-1, verbose=False)
results = grid_search.fit(X_train, y_train, validation_split=0.2)

print('Best score:', results.best_score_)
print('Achieved With:', results.best_params_)

vect__ngram_range = results.best_params_['vect__ngram_range']

- #### min_df and max_df

In [None]:
parameters = {'vect__ngram_range': [vect__ngram_range],
              'vect__min_df': [0.01, 0.1, 0.25, 0.5, 0.75, 1],
              'vect__max_df': [0.01, 0.1, 0.25, 0.5, 0.75, 1]}

grid_search = GridSearchCV(rfc_pipe, parameters, cv=4, n_jobs=-1, verbose=False)
results = grid_search.fit(X_train, y_train, validation_split=0.2)

print('Best score:', results.best_score_)
print('Achieved With:', results.best_params_)

vect__max_df = results.best_params_['vect__max_df']
vect__min_df = results.best_params_['vect__min_df']

- #### max_features

In [None]:
parameters = {'vect__ngram_range': [vect__ngram_range],
              'vect__min_df': [vect__min_df],
              'vect__max_df': [vect__max_df],
              'vect__max_features': [None, 5, 10, 15, 25, 50, 100, 250]}

grid_search = GridSearchCV(rfc_pipe, parameters, cv=4, n_jobs=-1, verbose=False)
results = grid_search.fit(X_train, y_train, validation_split=0.2)

print('Best score:', results.best_score_)
print('Achieved With:', results.best_params_)

vect__max_features = results.best_params_['vect__max_features']

#### **Random Forest Classifier**

- #### n_estimators and criterion

In [None]:
parameters = {'vect__ngram_range': [vect__ngram_range],
              'vect__min_df': [vect__min_df],
              'vect__max_df': [vect__max_df],
              'vect__max_features': [vect__max_features],
              'rfc__n_estimators': [10, 25, 50, 75, 100, 150, 250, 500],
              'rfc__criterion': ['gini', 'entropy']

random_search = RandomizedSearchCV(rfc_pipe, parameters, cv=4, n_jobs=-1, n_iter=20, verbose=False)
results = random_search.fit(X_train, y_train, validation_split=0.2, shuffle=True)

print('Best score:', results.best_score_)
print('Achieved With:', results.best_params_)
              
rfc__criterion = results.best_params_['rfc__criterion']
rfc__n_estimators = results.best_params_['rfc__n_estimators']

- #### min_samples_split and min_samples_leaf

In [None]:
parameters = {'vect__ngram_range': [vect__ngram_range],
              'vect__min_df': [vect__min_df],
              'vect__max_df': [vect__max_df],
              'vect__max_features': [vect__max_features],
              'rfc__criterion': [rfc__criterion],
              'rfc__n_estimators': [rfc__n_estimators],
              'rfc__min_samples_split': [2, 5, 10, 15, 20, 25],
              'rfc__min_samples_leaf': [1, 3, 5, 7, 9, 12, 15]}

random_search = RandomizedSearchCV(rfc_pipe, parameters, cv=4, n_jobs=-1, n_iter=15, verbose=False)
results = random_search.fit(X_train, y_train, validation_split=0.2, shuffle=True)

print('Best score:', results.best_score_)
print('Achieved With:', results.best_params_)

rfc__min_samples_leaf = results.best_params_['rfc__min_samples_leaf']
rfc__min_samples_split = results.best_params_['rfc__min_samples_split']

- #### max_features, max_leaf_nodes, and min_impurity_decrease

In [None]:
parameters = {'vect__ngram_range': [vect__ngram_range],
              'vect__min_df': [vect__min_df],
              'vect__max_df': [vect__max_df],
              'vect__max_features': [vect__max_features],
              'rfc__criterion': [rfc__criterion],
              'rfc__min_samples_split': [rfc__min_samples_split],
              'rfc__min_samples_leaf': [rfc__min_samples_leaf],
              'rfc__max_features': [None, 5, 10, 25, 50, 'sqrt', 'log2'],
              'rfc__max_leaf_nodes': [None, 2, 5, 10, 25, 50, 100],
              'rfc__min_impurity_decrease': [0.0, 0.01, 0.025, 0.1, 0.25]}

random_search = RandomizedSearchCV(rfc_pipe, parameters, cv=4, n_jobs=-1, n_iter=15, verbose=False)
results = random_search.fit(X_train, y_train, validation_split=0.2, shuffle=True)

print('Best score (Train):', results.best_score_)
print('Achieved With:', results.best_params_)

rfc__max_features = results.best_params_['rfc__max_features']
rfc__max_leaf_nodes = results.best_params_['rfc__max_leaf_nodes']
rfc__min_impurity_decrease = results.best_params_['rfc__min_impurity_decrease']

In [None]:
# Printing evaluation of model
print('Train:', results.best_score_)
print('Test:', random_search.score(X_test, y_test))
print('Achieved with the following hyperparameters:')
print(results.best_params_)