In [136]:
import numpy as np
import pandas as pd

#### Solution
We start by loading the dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Retrieve data
news_data = fetch_20newsgroups()

In [3]:
# See what an element looks like
news_data['data'][0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [155]:
from sklearn.feature_extraction.text import TfidfVectorizer
def compute_tfidf(data, vectorizer=None):
    """Turn a collection of raw text documents into a TF-IDF feature matrix.

    Parameters
    ----------
    data : iterable of str
        The raw documents to vectorize.
    vectorizer : TfidfVectorizer, optional
        Vectorizer to fit on ``data``. Pass your own instance when you need
        to inspect it afterwards (e.g. ``vectorizer.vocabulary_``). When
        omitted, a new one is created with ``sublinear_tf=True``,
        ``max_df=0.5`` and ``stop_words='english'``.

    Returns
    -------
    The document-term matrix produced by ``vectorizer.fit_transform(data)``.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    # Vectorize the argument rather than the global `news_data`, so the helper
    # works on whatever corpus the caller passes in.
    return vectorizer.fit_transform(data)

In [156]:
data_vec = compute_tfidf(news_data.data)
data_vec

We can also inspect the learned vocabulary through `vectorizer.vocabulary_`, but it contains a lot of entries.

In [6]:
news_data.target

array([7, 4, 4, ..., 3, 1, 8])

Before cross-validation, we need to split the data according to the description of the problem. To do so, we define a helper function split_data

In [152]:
from sklearn.model_selection import train_test_split
def split_data(ratio_train_testval, ratio_test_val, random_state=None):
    """Split the vectorized corpus into train, test and validation sets.

    Reads the notebook-level ``data_vec`` (features) and ``news_data.target``
    (labels).

    Parameters
    ----------
    ratio_train_testval : float
        Fraction of all samples held out for test + validation
        (e.g. 0.2 keeps 80% for training).
    ratio_test_val : float
        Fraction of the held-out samples assigned to the validation set;
        the remainder becomes the test set (e.g. 0.5 splits them evenly).
    random_state : int, optional
        Seed forwarded to both ``train_test_split`` calls. Leave as ``None``
        for a different shuffle on every call; set it for a reproducible split.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test, x_val, y_val)``.
    """
    # First cut: hold out `ratio_train_testval` of the samples for test + validation
    x_train, x_test_val, y_train, y_test_val = train_test_split(
        data_vec, news_data.target,
        test_size=ratio_train_testval, random_state=random_state)
    # Second cut: `ratio_test_val` of the held-out part goes to validation, the rest to test
    x_test, x_val, y_test, y_val = train_test_split(
        x_test_val, y_test_val,
        test_size=ratio_test_val, random_state=random_state)
    return x_train, y_train, x_test, y_test, x_val, y_val

In [153]:
x_train, y_train, x_test, y_test, x_val, y_val = split_data(0.2, 0.5)

In [10]:
# `x_test_val` only exists inside split_data, so rebuild the shape of the
# held-out (test + validation) portion from the two splits it was cut into.
(x_test.shape[0] + x_val.shape[0], x_test.shape[1])

(2263, 129791)

In [11]:
np.shape(x_train)

(9051, 129791)

Now we can select the hyperparameters by fitting on the training set and scoring on the validation set with the following function

In [159]:
from sklearn.ensemble import RandomForestClassifier

# Cross validate hyper parameters using train and validation data
def cross_valid(estimators_grid=range(55, 65, 5), depths_grid=range(155, 170, 5)):
    """Grid-search random-forest hyperparameters on the validation split.

    For every ``(n_estimators, max_depth)`` pair, a ``RandomForestClassifier``
    is fitted on the notebook-level ``x_train``/``y_train`` and scored on
    ``x_val``/``y_val``. Note that this is a single hold-out validation, not
    a k-fold cross-validation. Each score is printed as it is computed.

    Parameters
    ----------
    estimators_grid : iterable of int, optional
        Candidate values for ``n_estimators``. Defaults to ``range(55, 65, 5)``.
    depths_grid : iterable of int, optional
        Candidate values for ``max_depth``. Defaults to ``range(155, 170, 5)``.

    Returns
    -------
    tuple
        ``(n_estim, dep, max_score)``: the best pair and its validation score.
        On ties the first pair encountered wins. If no candidate scores above
        0, ``(0, 0, 0)`` is returned.
    """
    n_estim, dep, max_score = 0, 0, 0

    for estim in estimators_grid:
        for depth in depths_grid:
            # Fixed random_state so that scores are comparable between candidates
            clf = RandomForestClassifier(n_estimators=estim, max_depth=depth, random_state=0)
            clf.fit(x_train, y_train)
            score = clf.score(x_val, y_val)
            print(score)
            if score > max_score:
                max_score = score
                n_estim = estim
                dep = depth
    return n_estim, dep, max_score

In [160]:
n_estim, dep, max_score = cross_valid()

0.848056537102
0.833038869258
0.840989399293
0.848939929329
0.832155477032
0.841872791519


In [161]:
(n_estim, dep, max_score)

(60, 155, 0.84893992932862195)

Let's see how our hyperparameters perform by testing them on the test data

In [162]:
def test_prediction(n_estimators_, max_depth_):
    """Fit a random forest with the given hyperparameters and score it on the test set.

    The classifier is trained on the notebook-level ``x_train``/``y_train``
    and evaluated on ``x_test``/``y_test``.

    Parameters
    ----------
    n_estimators_ : int
        Value passed as ``n_estimators`` to ``RandomForestClassifier``.
    max_depth_ : int
        Value passed as ``max_depth`` to ``RandomForestClassifier``.

    Returns
    -------
    The value of ``score(x_test, y_test)`` for the fitted classifier.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators_, max_depth=max_depth_, random_state=0)
    # fit() returns the estimator itself, so there is nothing worth binding here
    clf.fit(x_train, y_train)
    return clf.score(x_test, y_test)

In [163]:
test_prediction(n_estim, dep)

0.85764809902740935

This is quite a good result. Now we only need to find the most relevant features

In [164]:
# Returns an array containing the indices of the biggest n elements
# in the second array parameter, sorted in descending order
def get_n_best_features_idx(n, features_importance):
    """Return the positions of the ``n`` largest entries, largest first.

    Parameters
    ----------
    n : int
        How many positions to keep.
    features_importance : array-like
        Scores to rank.

    Returns
    -------
    numpy.ndarray
        Indices into ``features_importance``, ordered by descending score.
    """
    ascending_order = np.argsort(features_importance)
    descending_order = ascending_order[::-1]
    return descending_order[:n]

In [165]:
# Set the number of best features you want
n_best = 20

# Computes a dataframe containing the feature names with highest score in descending order
def get_n_best_feature_names(n, dictionary, features_importance):
    """Build a dataframe of the ``n`` highest-scoring feature names and their scores.

    Parameters
    ----------
    n : int
        Number of features to report.
    dictionary : dict
        Mapping from feature name to feature index
        (e.g. ``vectorizer.vocabulary_``).
    features_importance : array-like
        Score of each feature, addressed by feature index.

    Returns
    -------
    pandas.DataFrame
        Two rows: the feature names, then the matching scores, both in
        descending score order. Transpose it to get one feature per row.
    """
    # Use the scores passed in (not a global model) and accept plain lists too
    features_importance = np.asarray(features_importance)
    indices = get_n_best_features_idx(n, features_importance)
    # Order the names by their index so that position i holds the name of feature i
    sorted_dict = sorted(dictionary.items(), key=lambda tup: tup[1])
    names = np.array([name for name, _ in sorted_dict])[indices]
    return pd.DataFrame([names, features_importance[indices]])

With the two helper functions above, we can simply print the result dataframe to see which words are the most relevant for classification

In [166]:
# `vectorizer` and `regr` only exist as locals inside compute_tfidf / test_prediction,
# so rebuild them here with the same settings before reading their fitted attributes.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english').fit(news_data.data)
regr = RandomForestClassifier(n_estimators=n_estim, max_depth=dep, random_state=0).fit(x_train, y_train)
get_n_best_feature_names(n_best, vectorizer.vocabulary_, regr.feature_importances_).transpose()

Unnamed: 0,0,1
0,windows,0.00719809
1,sale,0.00658377
2,bike,0.00554266
3,dod,0.00538623
4,space,0.00431327
5,hockey,0.00397577
6,car,0.00390783
7,encryption,0.00382953
8,mac,0.00336992
9,god,0.00332612
