# Model Selection

The following are candidate models that I experimented with briefly in order to decide which one is the best choice.

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split


# Load the cleaned dataset; the path is relative to this notebook's location.
df = pd.read_csv(filepath_or_buffer='../data/datasets/cleaned_data.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Subreddit,Tokens
0,24hoursupport,"['like', 'helping', 'come', 'hang', 'new', 'di..."
1,24hoursupport,"['acer', 'laptop', 'corrupt', 'windows', 'turn..."
2,24hoursupport,"['downloaded', 'trojan', 'file', 'window', 'de..."
3,24hoursupport,"['use', 'help', 'accessed', 'smart', 'remote',..."
4,24hoursupport,"['problem', 'network', 'adapter', 'i’m', 'havi..."


In [2]:
# Train-Test Split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the partition
# identical across kernel restarts.
train, test = train_test_split(df, random_state=42, train_size=0.7)

# Sanity check: (rows, columns) of each partition.
(train.shape, test.shape)

((898, 2), (385, 2))

In [3]:
# Features are the 'Tokens' column; the label is the 'Subreddit' column.
X_train, y_train = train['Tokens'], train['Subreddit']
X_test, y_test = test['Tokens'], test['Subreddit']

# Report the size of every split, one value per line.
print('X:', f'Train: {X_train.shape}', f'Test: {X_test.shape}', sep='\n')

print()  # Blank line between the X and y groups for readability

print('y:', f'Train: {y_train.shape}', f'Test: {y_test.shape}', sep='\n')

X:
Train: (898,)
Test: (385,)

y:
Train: (898,)
Test: (385,)


# TF-IDF Vectorizer / Random Forest Classifier

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Baseline: default TF-IDF features feeding a default random forest.
# No custom tokenizer is passed because the input was already tokenized in preprocessing.
rfc = RandomForestClassifier()
vect = TfidfVectorizer()
pipe = Pipeline(steps=[('vect', vect), ('clf', rfc)])

# Fit on the training split (fit returns the pipeline itself), then report
# accuracy on the held-out split.
print('Baseline Accuracy Score:', pipe.fit(X_train, y_train).score(X_test, y_test))

Baseline Accuracy Score: 0.5246753246753246


In [None]:
# Stage 1 (TF-IDF / RFC): sample vectorizer settings only; the forest keeps
# its defaults.
parameters = {
    # (1, 1) .. (1, 6) followed by (2, 3) .. (2, 5)
    'vect__ngram_range': [(1, hi) for hi in range(1, 7)] + [(2, hi) for hi in range(3, 6)],
    'vect__analyzer': ['word', 'char', 'char_wb'],
    'vect__norm': ['l1', 'l2'],
}
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    n_iter=40,
    cv=5,
    n_jobs=-1,
    verbose=False,
)
results = search.fit(X_train, y_train)

# Remember the winning vectorizer settings so later cells can pin them.
vect__ngram_range, vect__analyzer, vect__norm = (
    results.best_params_[key]
    for key in ('vect__ngram_range', 'vect__analyzer', 'vect__norm')
)

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation:', search.score(X_test, y_test))

In [None]:
# Stage 2 (TF-IDF / RFC): pin the vectorizer settings chosen by the previous
# search (single-candidate lists) and sample ensemble-level forest settings.
parameters = {
    'vect__ngram_range': [vect__ngram_range],
    'vect__analyzer': [vect__analyzer],
    'vect__norm': [vect__norm],
    'clf__n_estimators': [25, 50, 100, 125, 150, 175, 200, 225, 250, 300],
    'clf__max_features': [None, 'sqrt', 'log2', 50, 100, 200, 250],
    'clf__class_weight': ['balanced', 'balanced_subsample'],
    'clf__oob_score': [True, False],
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    n_iter=40,
    cv=5,
    n_jobs=-1,
    verbose=False,
)
results = search.fit(X_train, y_train)

# Remember the winning forest settings so the next stage can pin them.
clf__n_estimators, clf__max_features, clf__class_weight, clf__oob_score = (
    results.best_params_[key]
    for key in ('clf__n_estimators', 'clf__max_features',
                'clf__class_weight', 'clf__oob_score')
)

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation:', search.score(X_test, y_test))

In [None]:
# Stage 3 (TF-IDF / RFC): every setting chosen by the two previous searches is
# pinned as a single-candidate list; only the tree-shape parameters are sampled.
parameters = {'vect__analyzer': [vect__analyzer],
              'vect__ngram_range': [vect__ngram_range],
              'vect__norm': [vect__norm],
              'clf__n_estimators': [clf__n_estimators],
              'clf__max_features': [clf__max_features],
              'clf__class_weight': [clf__class_weight],
              'clf__oob_score': [clf__oob_score],
              'clf__max_depth': [None, 5, 10, 25, 50, 100, 150, 250],
              # min_samples_leaf must be an int >= 1 (a sample count) or a float
              # strictly between 0 and 1 (a fraction of the samples). The former
              # candidate 2.5 is neither, so every draw that used it failed to
              # fit; it is replaced by the integer 2.
              'clf__min_samples_leaf': [1, 2, 5, 10, 25, 50],
              'clf__max_leaf_nodes': [None, 2, 3, 5, 10, 15, 25, 50]}

search = RandomizedSearchCV(pipe, parameters, n_iter=40, cv=5, n_jobs=-1, verbose=False)
results = search.fit(X_train, y_train)

# Remember the tree-shape settings selected by this stage.
clf__max_depth = results.best_params_['clf__max_depth']
clf__min_samples_leaf = results.best_params_['clf__min_samples_leaf']
clf__max_leaf_nodes = results.best_params_['clf__max_leaf_nodes']


print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation (TF-IDF / RFC):', search.score(X_test, y_test))

# TF-IDF / Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline: default TF-IDF features feeding a default multinomial naive Bayes.
vect = TfidfVectorizer()
mnb = MultinomialNB()
pipe = Pipeline(steps=[('vect', vect), ('clf', mnb)])

# Fit on the training split (fit returns the pipeline itself), then report
# accuracy on the held-out split.
print('Baseline model accuracy (TF-IDF / MNB):', pipe.fit(X_train, y_train).score(X_test, y_test))

In [None]:
# Single-stage search (TF-IDF / MNB): vectorizer and classifier settings are
# sampled together.
parameters = {
    # (1, 1) .. (1, 6), (2, 3) .. (2, 6), then the fixed-length (3, 3) and (4, 4)
    'vect__ngram_range': (
        [(1, hi) for hi in range(1, 7)]
        + [(2, hi) for hi in range(3, 7)]
        + [(3, 3), (4, 4)]
    ),
    'vect__analyzer': ['word', 'char', 'char_wb'],
    'vect__norm': ['l1', 'l2'],
    # NOTE(review): alpha=0.0 means no smoothing; how scikit-learn treats it
    # differs between versions - confirm it is intended.
    'clf__alpha': [0.0, 0.125, 0.25, 0.5, 0.75, 1.0],
    'clf__fit_prior': [True, False],
}
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    cv=5,
    n_iter=50,
    n_jobs=-1,
    verbose=False,
)
results = search.fit(X_train, y_train)

# Remember every winning setting.
vect__ngram_range, vect__analyzer, vect__norm, clf__alpha, clf__fit_prior = (
    results.best_params_[key]
    for key in ('vect__ngram_range', 'vect__analyzer', 'vect__norm',
                'clf__alpha', 'clf__fit_prior')
)

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation (TF-IDF / MNB):', search.score(X_test, y_test))

# LSI / Random Forest Classifier (TF-IDF)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Baseline LSI pipeline: TF-IDF -> TruncatedSVD (both at default settings),
# nested as a single 'lsi' step in front of a default random forest.
vect = TfidfVectorizer()
svd = TruncatedSVD()
rfc = RandomForestClassifier()

lsi = Pipeline(steps=[('vect', vect), ('svd', svd)])
pipe = Pipeline(steps=[('lsi', lsi), ('clf', rfc)])

# Fit on the training split (fit returns the pipeline itself), then report
# accuracy on the held-out split.
print('Baseline model accuracy (TF-IDF LSI / RFC):', pipe.fit(X_train, y_train).score(X_test, y_test))

In [None]:
# Stage 1 (TF-IDF LSI / RFC): sample vectorizer settings plus the forest's
# split criterion; the SVD step keeps its defaults.
parameters = {
    # (1, 1) .. (1, 6), (2, 3) .. (2, 6), then the fixed-length (3, 3) and (4, 4)
    'lsi__vect__ngram_range': (
        [(1, hi) for hi in range(1, 7)]
        + [(2, hi) for hi in range(3, 7)]
        + [(3, 3), (4, 4)]
    ),
    'lsi__vect__analyzer': ['word', 'char', 'char_wb'],
    'lsi__vect__norm': ['l1', 'l2'],
    'clf__criterion': ['gini', 'entropy'],
}
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    cv=5,
    n_iter=50,
    n_jobs=-1,
    verbose=False,
)
results = search.fit(X_train, y_train)

# Remember the winning settings so later cells can pin them.
lsi__vect__ngram_range, lsi__vect__analyzer, lsi__vect__norm, clf__criterion = (
    results.best_params_[key]
    for key in ('lsi__vect__ngram_range', 'lsi__vect__analyzer',
                'lsi__vect__norm', 'clf__criterion')
)

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation (TF-IDF LSI / RFC):', search.score(X_test, y_test))

In [None]:
# Stage 2 (TF-IDF LSI / RFC): pin the settings chosen by the previous search
# (single-candidate lists) and sample the TruncatedSVD settings.
parameters = {
    'lsi__vect__ngram_range': [lsi__vect__ngram_range],
    'lsi__vect__analyzer': [lsi__vect__analyzer],
    'clf__criterion': [clf__criterion],
    'lsi__vect__norm': [lsi__vect__norm],
    'lsi__svd__n_components': [2, 5, 25, 50, 75, 100, 125, 150],
    'lsi__svd__algorithm': ['arpack', 'randomized'],
    'lsi__svd__n_iter': [5, 25, 50, 75, 100, 125, 150],
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    n_iter=35,
    cv=5,
    n_jobs=-1,
    verbose=False,
)
results = search.fit(X_train, y_train)

# Remember the winning SVD settings so the next stage can pin them.
lsi__svd__n_components, lsi__svd__algorithm, lsi__svd__n_iter = (
    results.best_params_[key]
    for key in ('lsi__svd__n_components', 'lsi__svd__algorithm', 'lsi__svd__n_iter')
)

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation (TF-IDF LSI / RFC):', search.score(X_test, y_test))

In [None]:
# Stage 3 (TF-IDF LSI / RFC): every vectorizer, SVD and criterion setting is
# pinned to the value selected by the earlier searches; only the tree-shape
# parameters of the random forest are sampled here.
parameters = {'lsi__vect__ngram_range': [lsi__vect__ngram_range],
              'lsi__vect__analyzer': [lsi__vect__analyzer],
              'lsi__vect__norm': [lsi__vect__norm],
              'lsi__svd__n_components': [lsi__svd__n_components],
              'lsi__svd__algorithm': [lsi__svd__algorithm],
              'lsi__svd__n_iter': [lsi__svd__n_iter],
              'clf__criterion': [clf__criterion],
              'clf__max_depth': [None, 5, 10, 25, 50, 75, 100, 125, 150],
              'clf__min_samples_split': [2, 5, 10, 25],
              'clf__min_samples_leaf': [1, 3, 5, 7, 9, 12, 15]}

search = RandomizedSearchCV(pipe, parameters, n_iter=30, cv=5, n_jobs=-1, verbose=False)
results = search.fit(X_train, y_train)


# Keep the values this stage actually tuned. The SVD settings were
# single-candidate lists above, so re-reading them from best_params_ would only
# return the values those variables already hold.
clf__max_depth = results.best_params_['clf__max_depth']
clf__min_samples_split = results.best_params_['clf__min_samples_split']
clf__min_samples_leaf = results.best_params_['clf__min_samples_leaf']

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation (TF-IDF LSI / RFC):', search.score(X_test, y_test))

# CountVectorizer / RandomForestClassifier

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Baseline: default raw term counts feeding a default random forest.
vect = CountVectorizer()
rfc = RandomForestClassifier()
pipe = Pipeline(steps=[('vect', vect), ('clf', rfc)])

# Fit on the training split (fit returns the pipeline itself), then report
# accuracy on the held-out split.
print('Baseline model (CountVect / RFC):', pipe.fit(X_train, y_train).score(X_test, y_test))

In [None]:
# Stage 1 (CountVectorizer / RFC): exhaustive grid over the vectorizer
# settings; the forest keeps its defaults.
parameters = {
    'vect__strip_accents': ['ascii', 'unicode', None],
    # (1, 1) .. (1, 6), (2, 3) .. (2, 6), then the fixed-length (3, 3) and (4, 4)
    'vect__ngram_range': (
        [(1, hi) for hi in range(1, 7)]
        + [(2, hi) for hi in range(3, 7)]
        + [(3, 3), (4, 4)]
    ),
    'vect__analyzer': ['word', 'char', 'char_wb'],
}

search = GridSearchCV(estimator=pipe, param_grid=parameters, cv=5, n_jobs=-1, verbose=False)
results = search.fit(X_train, y_train)

# Remember the winning vectorizer settings so the next stage can pin them.
vect__ngram_range, vect__analyzer, vect__strip_accents = (
    results.best_params_[key]
    for key in ('vect__ngram_range', 'vect__analyzer', 'vect__strip_accents')
)

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation:', search.score(X_test, y_test))

In [None]:
# Stage 2 (CountVectorizer / RFC): pin the vectorizer settings chosen by the
# grid search (single-candidate lists) and sample the forest settings.
parameters = {
    'vect__ngram_range': [vect__ngram_range],
    'vect__analyzer': [vect__analyzer],
    'vect__strip_accents': [vect__strip_accents],
    'clf__n_estimators': [25, 50, 100, 125, 150, 175, 200],
    'clf__max_features': [None, 'sqrt', 'log2', 50, 100, 200, 250],
    'clf__max_samples': [None, 25, 35, 50, 65, 70, 100, 125],
    'clf__class_weight': ['balanced', 'balanced_subsample'],
    'clf__oob_score': [True, False],
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    n_iter=40,
    cv=5,
    n_jobs=-1,
    verbose=False,
)
results = search.fit(X_train, y_train)

# Remember the winning forest settings.
(clf__n_estimators, clf__max_features, clf__max_samples,
 clf__class_weight, clf__oob_score) = (
    results.best_params_[key]
    for key in ('clf__n_estimators', 'clf__max_features', 'clf__max_samples',
                'clf__class_weight', 'clf__oob_score')
)

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation (CountVectorizer / RFC):', search.score(X_test, y_test))

# CountVectorizer / Multinomial Naive Bayes

In [None]:
# Baseline: default raw term counts feeding a default multinomial naive Bayes.
vect = CountVectorizer()
mnb = MultinomialNB()
pipe = Pipeline(steps=[('vect', vect), ('clf', mnb)])

# Fit on the training split (fit returns the pipeline itself), then report
# accuracy on the held-out split.
print('Baseline model accuracy (CV/MNB):', pipe.fit(X_train, y_train).score(X_test, y_test))

In [None]:
# Single-stage search (CountVectorizer / MNB): vectorizer and classifier
# settings are sampled together.
parameters = {'vect__strip_accents': ['ascii', 'unicode', None],
              'vect__ngram_range': [(1, 1), (1, 2), (1, 3),
                                            (1, 4), (1, 5), (1, 6),
                                            (2, 3), (2, 4), (2, 5),
                                            (2, 6), (3, 3), (4, 4)],
              'vect__analyzer': ['word', 'char', 'char_wb'],
              'clf__alpha': [0.0, 0.125, 0.25, 0.5, 0.75, 1.0],
              'clf__fit_prior': [True, False]}

search = RandomizedSearchCV(pipe, parameters, cv=5, n_iter=40, n_jobs=-1, verbose=False)
results = search.fit(X_train, y_train)

# Remember every winning setting. best_params_ only holds the keys searched
# above, so the lookup is for 'vect__strip_accents'; the previous
# 'vect__norm' lookup was not part of this search space and raised KeyError.
vect__ngram_range = results.best_params_['vect__ngram_range']
vect__analyzer = results.best_params_['vect__analyzer']
vect__strip_accents = results.best_params_['vect__strip_accents']
clf__alpha = results.best_params_['clf__alpha']
clf__fit_prior = results.best_params_['clf__fit_prior']

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation (CountVectorizer / MNB):', search.score(X_test, y_test))

# LSI / Random Forest Classifier (CountVectorizer)

In [None]:
# Baseline LSI pipeline: CountVectorizer -> TruncatedSVD (both at default
# settings), nested as a single 'lsi' step in front of a default random forest.
vect = CountVectorizer()
svd = TruncatedSVD()
rfc = RandomForestClassifier()

lsi = Pipeline(steps=[('vect', vect), ('svd', svd)])
pipe = Pipeline(steps=[('lsi', lsi), ('clf', rfc)])

# Fit on the training split (fit returns the pipeline itself); the held-out
# accuracy is the cell's displayed value.
pipe.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Stage 1 (CountVectorizer LSI / RFC): sample vectorizer settings only; the
# SVD step and the forest keep their defaults.
parameters = {
    'lsi__vect__strip_accents': ['ascii', 'unicode', None],
    # (1, 1) .. (1, 6), (2, 3) .. (2, 6), then the fixed-length (3, 3) and (4, 4)
    'lsi__vect__ngram_range': (
        [(1, hi) for hi in range(1, 7)]
        + [(2, hi) for hi in range(3, 7)]
        + [(3, 3), (4, 4)]
    ),
    'lsi__vect__analyzer': ['word', 'char', 'char_wb'],
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    cv=5,
    n_iter=40,
    n_jobs=-1,
    verbose=False,
)
results = search.fit(X_train, y_train)

# Remember the winning vectorizer settings so later cells can pin them.
lsi__vect__strip_accents, lsi__vect__ngram_range, lsi__vect__analyzer = (
    results.best_params_[key]
    for key in ('lsi__vect__strip_accents', 'lsi__vect__ngram_range',
                'lsi__vect__analyzer')
)

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation:', search.score(X_test, y_test))

In [None]:
# Stage 2 (CountVectorizer LSI / RFC): pin the vectorizer settings chosen by
# the previous search (single-candidate lists) and sample the TruncatedSVD
# settings.
parameters = {
    'lsi__vect__strip_accents': [lsi__vect__strip_accents],
    'lsi__vect__analyzer': [lsi__vect__analyzer],
    'lsi__vect__ngram_range': [lsi__vect__ngram_range],
    'lsi__svd__n_components': [2, 5, 25, 50, 75, 100, 125, 150],
    'lsi__svd__algorithm': ['arpack', 'randomized'],
    'lsi__svd__n_iter': [5, 25, 50, 75, 100, 125, 150],
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    n_iter=35,
    cv=5,
    n_jobs=-1,
    verbose=False,
)
results = search.fit(X_train, y_train)

# Remember the winning SVD settings so the next stage can pin them.
lsi__svd__n_components, lsi__svd__algorithm, lsi__svd__n_iter = (
    results.best_params_[key]
    for key in ('lsi__svd__n_components', 'lsi__svd__algorithm', 'lsi__svd__n_iter')
)

print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation:', search.score(X_test, y_test))

In [None]:
# Stage 3 (CountVectorizer LSI / RFC): pin every vectorizer and SVD setting
# chosen by the earlier searches (single-candidate lists) and sample the
# random-forest settings.
parameters = {
    'lsi__vect__strip_accents': [lsi__vect__strip_accents],
    'lsi__vect__analyzer': [lsi__vect__analyzer],
    'lsi__vect__ngram_range': [lsi__vect__ngram_range],
    'lsi__svd__n_components': [lsi__svd__n_components],
    'lsi__svd__algorithm': [lsi__svd__algorithm],
    'lsi__svd__n_iter': [lsi__svd__n_iter],
    'clf__n_estimators': [25, 50, 100, 125, 150, 175, 200],
    'clf__max_features': [None, 'sqrt', 'log2', 50, 100, 200, 250],
    'clf__max_samples': [None, 25, 35, 50, 65, 70, 100, 125],
    'clf__class_weight': ['balanced', 'balanced_subsample'],
    'clf__oob_score': [True, False],
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=parameters,
    n_iter=40,
    cv=5,
    n_jobs=-1,
    verbose=False,
)
results = search.fit(X_train, y_train)

# Final report for this model family: cross-validated score, the chosen
# parameters, and accuracy on the held-out split.
print('Best Score:', results.best_score_)
print('Achieved With Parameters:', results.best_params_)
print('Evaluation (LSI + CountVectorizer / RFC):', search.score(X_test, y_test))