### Feature Generation and Clustering
https://github.com/Lukede9/Thinkful/blob/master/Bootcamp/Unit%204%20Capstone%20Feature%20Generation.ipynb

In [1]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score

from sklearn.preprocessing import normalize

- bow_train = BoW for training set
- bow_valid = BoW for the validation set
- tfidf_features = tf-idf for the training set
- tfidf_valid = tf-idf for the validation set
- tfbow_train = both for training set
- tfbow_valid = both for validation set

- best_clusters_train = clusters for the training set
- best_clusters_valid = clusters for the validation set

In [2]:
# Bag of Words Features
bow_train = pd.read_csv('bow_train.csv', index_col=0)
bow_valid = pd.read_csv('bow_valid.csv', index_col=0)

# tf-idf features
tfidf_features = pd.read_csv('tfidf_features.csv', index_col=0)
tfidf_valid = pd.read_csv('tfidf_valid.csv', index_col=0)

# Both features
tfbow_train = pd.read_csv('tfbow_train.csv', index_col=0)
tfbow_valid = pd.read_csv('tfbow_valid.csv', index_col=0)

# Importing cluster DataFrame
best_clusters_train = pd.read_csv('best_clusters_train.csv', index_col=0)
best_clusters_valid = pd.read_csv('best_clusters_valid.csv', index_col=0)


# Bag of Words + best_clusters
bowclusters_train = pd.concat([bow_train, best_clusters_train], axis=1)
bowclusters_test = pd.concat([bow_valid, best_clusters_valid], axis=1)

# tf-idf + best clusters
tfidf_clusters_train = pd.concat([tfidf_features, best_clusters_train], axis=1)
tfidf_clusters_valid = pd.concat([tfidf_valid, best_clusters_valid], axis=1)

# Both + best clusters
tfbow_feat_clusters_train = pd.concat([tfbow_train, best_clusters_train], axis=1)
tfbow_feat_clusters_valid = pd.concat([tfbow_valid, best_clusters_valid], axis=1)

In [3]:
target = bow_train['genre']

In [4]:
# Train-test split here is for the DataFrames without clusters

bowX_train, bowX_test, bowy_train, bowy_test = train_test_split(bow_train.drop(columns=['genre', 'text', 'article_words500']), 
                                                                target, 
                                                                test_size=.25, 
                                                                stratify=target, 
                                                                random_state=24
)

tfidfX_train, tfidfX_test, tfidfy_train, tfidfy_test = train_test_split(tfidf_features,
                                                                        target,
                                                                        test_size=.25,
                                                                        stratify=target, 
                                                                        random_state=24
)

tfbowX_train, tfbowX_test, tfbowy_train, tfbowy_test = train_test_split(tfbow_train.drop(columns=['genre', 'text', 'article_words500']), 
                                                                        target,
                                                                        test_size=.25,
                                                                        stratify=target, 
                                                                        random_state=24
)

# Supervised Models using BoW Features

#### Random Forest

In [5]:
rfc = ensemble.RandomForestClassifier(100)
rfc_train = rfc.fit(bowX_train, bowy_train)

print('Training score:', rfc.score(bowX_train, bowy_train))
print('Test set score:', rfc.score(bowX_test, bowy_test))

Training score: 1.0
Test set score: 0.35714285714285715


#### Logistic Regression

In [6]:
lr = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=500)
lr_train = lr.fit(bowX_train, bowy_train)

print('Training score:', lr.score(bowX_train, bowy_train))
print('Test set score:', lr.score(bowX_test, bowy_test))

Training score: 1.0
Test set score: 0.35714285714285715


#### Gradient Boosting

In [7]:
clf = ensemble.GradientBoostingClassifier()
clf_train = clf.fit(bowX_train, bowy_train)

print('Training set score:', clf.score(bowX_train, bowy_train))
print('Test set score:', clf.score(bowX_test, bowy_test))

Training set score: 1.0
Test set score: 0.2619047619047619


# Supervised Models using tf-idf Features

#### Random Forest

In [8]:
rfc1 = ensemble.RandomForestClassifier(100)
rfc1_train = rfc1.fit(tfidfX_train, tfidfy_train)

print('Training score:', rfc1.score(tfidfX_train, tfidfy_train))
print('Test set score:', rfc1.score(tfidfX_test, tfidfy_test))

Training score: 1.0
Test set score: 0.30952380952380953


#### Logistic Regression

In [9]:
lr1 = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=500)
lr1_train = lr1.fit(tfidfX_train, tfidfy_train)

print('Training score:', lr1.score(tfidfX_train, tfidfy_train))
print('Test set score:', lr1.score(tfidfX_test, tfidfy_test))

Training score: 1.0
Test set score: 0.35714285714285715


#### Gradient Boosting

In [10]:
clf1 = ensemble.GradientBoostingClassifier()
clf1_train = clf1.fit(tfidfX_train, tfidfy_train)

print('Training set score:', clf1.score(tfidfX_train, tfidfy_train))
print('Test set score:', clf1.score(tfidfX_test, tfidfy_test))

Training set score: 1.0
Test set score: 0.23809523809523808


# Supervised Models using BoW and tf-idf Features

#### Random Forest

In [11]:
rfc2 = ensemble.RandomForestClassifier(100)
rfc2_train = rfc2.fit(tfbowX_train, tfbowy_train)

print('Training score:', rfc2.score(tfbowX_train, tfbowy_train))
print('Test set score:', rfc2.score(tfbowX_test, tfbowy_test))

Training score: 1.0
Test set score: 0.3333333333333333


#### Logistic Regression

In [12]:
lr2 = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=500)
lr2_train = lr2.fit(tfbowX_train, tfbowy_train)

print('Training score:', lr2.score(tfbowX_train, tfbowy_train))
print('Test set score:', lr2.score(tfbowX_test, tfbowy_test))

Training score: 1.0
Test set score: 0.35714285714285715


#### Gradient Boosting

In [13]:
clf2 = ensemble.GradientBoostingClassifier()
clf2_train = clf2.fit(tfbowX_train, tfbowy_train)

print('Training set score:', clf2.score(tfbowX_train, tfbowy_train))
print('Test set score:', clf2.score(tfbowX_test, tfbowy_test))

Training set score: 1.0
Test set score: 0.14285714285714285


# Train-Test Split for various permutations of features/clusters

In [14]:
# BoW + (tf-idf based) k-means clusters
bowkmX_train, bowkmX_test, bowkmy_train, bowkmy_test = train_test_split(bowclusters_train.drop(columns=['genre', 'text', 'article_words500']), 
                                                                target, 
                                                                test_size=.25, 
                                                                stratify=target, 
                                                                random_state=24
)


# tf-idf + (tf-idf based) k-means clusters
tfkmX_train, tfkmX_test, tfkmy_train, tfkmy_test = train_test_split(tfidf_clusters_train,
                                                                        target,
                                                                        test_size=.25,
                                                                        stratify=target, 
                                                                        random_state=24
)                      

# Supervised Models for BoW Features + K-Means Clusters

In [15]:
crfc1 = ensemble.RandomForestClassifier(100)
crfc1_train = crfc1.fit(bowkmX_train, bowkmy_train)

print('Random Forest')
print('Training score:', crfc1.score(bowkmX_train, bowkmy_train))
print('Test set score:', crfc1.score(bowkmX_test, bowkmy_test))

clr1 = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=500)
clr1_train = clr1.fit(bowkmX_train, bowkmy_train)

print('Logistic Regression')
print('Training score:', clr1.score(bowkmX_train, bowkmy_train))
print('Test set score:', clr1.score(bowkmX_test, bowkmy_test))

cclf1 = ensemble.GradientBoostingClassifier()
cclf1_train = cclf1.fit(bowkmX_train, bowkmy_train)

print('Gradient Boosting Classifier')
print('Training set score:', cclf1.score(bowkmX_train, bowkmy_train))
print('Test set score:', cclf1.score(bowkmX_test, bowkmy_test))

Random Forest
Training score: 1.0
Test set score: 0.2619047619047619
Logistic Regression
Training score: 1.0
Test set score: 0.35714285714285715
Gradient Boosting Classifier
Training set score: 1.0
Test set score: 0.23809523809523808


# Supervised Models using tf-idf Features and K-Means Clusters

In [16]:
crfc2 = ensemble.RandomForestClassifier(100)
crfc2_train = crfc2.fit(tfkmX_train, tfkmy_train)

print('Random Forest')
print('Training score:', crfc2.score(tfkmX_train, tfkmy_train))
print('Test set score:', crfc2.score(tfkmX_test, tfkmy_test))

clr2 = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=500)
clr2_train = clr2.fit(tfkmX_train, tfkmy_train)

print('Logistic Regression')
print('Training score:', clr2.score(tfkmX_train, tfkmy_train))
print('Test set score:', clr2.score(tfkmX_test, tfkmy_test))

cclf2 = ensemble.GradientBoostingClassifier()
cclf2_train = cclf2.fit(tfkmX_train, tfkmy_train)

print('Gradient Boosting Classifier')
print('Training set score:', cclf2.score(tfkmX_train, tfkmy_train))
print('Test set score:', cclf2.score(tfkmX_test, tfkmy_test))

Random Forest
Training score: 1.0
Test set score: 0.38095238095238093
Logistic Regression
Training score: 0.9105691056910569
Test set score: 0.2857142857142857
Gradient Boosting Classifier
Training set score: 1.0
Test set score: 0.21428571428571427


### Summary:

The best feature/model combinations were:
- RF/BoW .4047
- RF/BoW & tf-idf .3809
- RF/BoW + kmeans clusters .4523

- Logistic regression scored the same on most of them

### I am going to play around a bit with the regularization/min samples parameters.

# Fine-Tuning the best models

### Fine-Tuning the Logistic Regression model.

In [17]:
params = {'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
          'multi_class' : ['auto'], 
          'max_iter' : [250, 500, 1000]}

lr_grid = GridSearchCV(lr, param_grid=params, cv=5)
lr_grid.fit(bowX_train, bowy_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'multi_class': ['auto'], 'max_iter': [250, 500, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
print(lr_grid.best_score_)
print(lr_grid.best_params_)

0.37398373983739835
{'max_iter': 250, 'multi_class': 'auto', 'solver': 'sag'}


In [19]:
# Need to do another gridsearch because some of the solvers were not compatible with both penalty options
params2 = {'penalty' : ['l1', 'l2'],
          'solver' : ['liblinear', 'saga'],
          'multi_class' : ['auto'],
          'max_iter' : [250]}

lr_grid1 = GridSearchCV(lr, param_grid=params2, cv=5)
lr_grid1.fit(bowX_train, bowy_train)

print(lr_grid1.best_score_)
print(lr_grid1.best_params_)



0.37398373983739835
{'max_iter': 250, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'saga'}




In [20]:
lr_best = lr_grid1.best_estimator_

### Fine-tuning the Random Forest Model

In [21]:
rf_params = {'min_samples_split' : [2, 3, 5],
            'n_estimators' : [200, 300]}

rf_grid = GridSearchCV(crfc1, param_grid=rf_params, cv=5)
rf_grid.fit(bowkmX_train, bowkmy_train)

print(rf_grid.best_params_, rf_grid.best_score_)



{'min_samples_split': 5, 'n_estimators': 200} 0.43902439024390244


In [22]:
r_forest = rf_grid.best_estimator_

# Validation Sets

I already brought in the validation sets after they went through all the necessary transformations. Now it is time to see how much the models are overfitting.

In [23]:
X = bowclusters_test.drop(columns=['text', 'genre', 'article_words500'])
y = bowclusters_test['genre']

In [24]:
r_forest.fit(bowkmX_train, bowkmy_train)
r_forest.score(X, y)

0.43636363636363634

I did very little optimization of the models so it is no surprise that it does not seem to be overfitting at all.

In [25]:
lr_best.fit(bowkmX_train, bowkmy_train)
lr_best.score(X, y)



0.4

It is also worth noting that my corpus/sample size is very small.