In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [2]:
train_data = pd.read_csv("train_data.csv", delimiter='\t', index_col=0)

In [3]:
train_data = train_data.reset_index(drop=True)

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 32 columns):
ID           1319 non-null int64
Age          1319 non-null float64
Gender       1319 non-null float64
Education    1319 non-null float64
Country      1319 non-null float64
Ethnicity    1319 non-null float64
Nscore       1319 non-null float64
Escore       1319 non-null float64
Oscore       1319 non-null float64
Ascore       1319 non-null float64
Cscore       1319 non-null float64
Impulsive    1319 non-null float64
SS           1319 non-null float64
Alcohol      1319 non-null object
Amphet       1319 non-null object
Amyl         1319 non-null object
Benzos       1319 non-null object
Caff         1319 non-null object
Cannabis     1319 non-null object
Choc         1319 non-null object
Coke         1319 non-null object
Crack        1319 non-null object
Ecstacy      1319 non-null object
Heroin       1319 non-null object
Ketamine     1319 non-null object
Legalh       1319 non-null objec

In [5]:
# Drop ID, Chocolate, the fake drug Semer, and legal substances
train_data.drop(['ID', 'Choc', 'Semer', 'Alcohol', 'Nicotine', 'Caff','Legalh'], axis=1, inplace=True)

In [6]:
for column in train_data.loc[:,'Amphet':]:
    # get label encoding for column
    train_data[column] = train_data[column].astype('category').cat.codes
    train_data[column] = train_data[column].astype('float64')

In [7]:
def is_drug_user(row):
    row = row['Amphet':]
    num_zeros = (row == 0).astype(bool).sum()
    if num_zeros == row.size:
        return False
    return True

In [8]:
train_data['Drug User'] = train_data.apply(is_drug_user, axis=1)

In [9]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 26 columns):
Age          1319 non-null float64
Gender       1319 non-null float64
Education    1319 non-null float64
Country      1319 non-null float64
Ethnicity    1319 non-null float64
Nscore       1319 non-null float64
Escore       1319 non-null float64
Oscore       1319 non-null float64
Ascore       1319 non-null float64
Cscore       1319 non-null float64
Impulsive    1319 non-null float64
SS           1319 non-null float64
Amphet       1319 non-null float64
Amyl         1319 non-null float64
Benzos       1319 non-null float64
Cannabis     1319 non-null float64
Coke         1319 non-null float64
Crack        1319 non-null float64
Ecstacy      1319 non-null float64
Heroin       1319 non-null float64
Ketamine     1319 non-null float64
LSD          1319 non-null float64
Meth         1319 non-null float64
Mushrooms    1319 non-null float64
VSA          1319 non-null float64
Drug User    1319 

In [10]:
# Grid search for random forest
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(5, 100, num = 20)] + [None]
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 20)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
grid = {'max_features': max_features,
        'max_depth': max_depth,
        'n_estimators': n_estimators,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap}

In [11]:
forest = RandomForestClassifier()
forest_grid_search = RandomizedSearchCV(estimator = forest, param_distributions = grid, 
                               n_iter = 100, cv = 10, verbose=2, random_state=0, n_jobs = 4)

In [12]:
X_train = train_data.loc[:, 'Age':'SS']
y_train = train_data['Drug User']

In [13]:
forest_grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   17.2s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:  5.0min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  7.4min finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=Non

In [14]:
forest_grid_search.best_params_

{'n_estimators': 621,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 30,
 'bootstrap': True}

In [15]:
forest = RandomForestClassifier(n_estimators=621, min_samples_split=2, 
                                min_samples_leaf=2, max_features='auto', 
                                max_depth=30, bootstrap=True, random_state=0)

In [16]:
scores = cross_val_score(forest, X_train, y_train, scoring='f1', cv=10)

In [18]:
print(scores.mean())

0.9187101168756463
