# Gradient Boosting Classifer - Grid Searching and Stats

### Cleaning Data, Creating Feature Data and Vectorizing Data

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("spam.csv", encoding="latin-1")
data = data[['v1', 'v2']][:3000]
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count.toarray())], axis=1)

X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,5712,5713,5714,5715,5716,5717,5718,5719,5720,5721
0,92,9.8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24,25.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,39,15.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Grid Searching

In this section of the code, we are using a TFIDF verctorized data set and exploring the different combinations of some parameters for the Gradient Boosting classifier to see the more effective values

In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_feat, data['label'], test_size=0.2)

In [4]:
def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} / LR: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    est, max_depth, lr, round(precision, 3), round(recall, 3), round((y_pred==y_test).sum() / len(y_pred),3)))

In [5]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

Est: 50 / Depth: 3 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.013 / Accuracy: 0.87
Est: 50 / Depth: 3 / LR: 0.1 ---- Precision: 0.967 / Recall: 0.734 / Accuracy: 0.962
Est: 50 / Depth: 3 / LR: 1 ---- Precision: 0.925 / Recall: 0.785 / Accuracy: 0.963


  _warn_prf(average, modifier, msg_start, len(result))


Est: 50 / Depth: 7 / LR: 0.01 ---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.868
Est: 50 / Depth: 7 / LR: 0.1 ---- Precision: 0.942 / Recall: 0.823 / Accuracy: 0.97
Est: 50 / Depth: 7 / LR: 1 ---- Precision: 0.859 / Recall: 0.772 / Accuracy: 0.953
Est: 50 / Depth: 11 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.013 / Accuracy: 0.87
Est: 50 / Depth: 11 / LR: 0.1 ---- Precision: 0.903 / Recall: 0.823 / Accuracy: 0.965
Est: 50 / Depth: 11 / LR: 1 ---- Precision: 0.893 / Recall: 0.848 / Accuracy: 0.967
Est: 50 / Depth: 15 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.013 / Accuracy: 0.87
Est: 50 / Depth: 15 / LR: 0.1 ---- Precision: 0.893 / Recall: 0.848 / Accuracy: 0.967
Est: 50 / Depth: 15 / LR: 1 ---- Precision: 0.823 / Recall: 0.823 / Accuracy: 0.953
Est: 100 / Depth: 3 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.494 / Accuracy: 0.933
Est: 100 / Depth: 3 / LR: 0.1 ---- Precision: 0.968 / Recall: 0.772 / Accuracy: 0.967
Est: 100 / Depth: 3 / LR: 1 ---- Precision: 0.88 / Recall: 0.835 / Accur

## GridSearchCV 

In this section of the code, we will test two different things. First we will compare Count vectorizig and TFIDF vectorizing and second we are testing the more effective parameter value combinations

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

##### Parameter Combinations

In [7]:
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150],
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}
gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)

##### GridSearchCV with TFIDF

In [8]:
cv_fit = gs.fit(X_tfidf_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,253.394307,1.808955,0.169778,0.008996,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.973333,0.961667,0.956667,0.97,0.965,0.965333,0.005907,1
5,398.306122,26.470658,0.165543,0.018715,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.965,0.965,0.961667,0.968333,0.965,0.965,0.002108,2
2,239.765739,3.475586,0.164622,0.003022,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.965,0.965,0.953333,0.97,0.968333,0.964333,0.005831,3
0,164.884285,1.029559,0.163117,0.009703,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.968333,0.966667,0.953333,0.968333,0.963333,0.964,0.005637,4
3,371.780672,3.836656,0.181663,0.009975,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.968333,0.963333,0.955,0.97,0.963333,0.964,0.005228,4


##### GridSearchCV with Count Vectorizer

In [9]:
cv_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,252.447042,1.844441,0.187785,0.008389,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.973333,0.968333,0.96,0.97,0.963333,0.967,0.004761,1
3,370.888881,5.343067,0.197801,0.008662,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.97,0.966667,0.96,0.971667,0.966667,0.967,0.004,2
0,163.566576,2.662382,0.168292,0.021713,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.966667,0.965,0.958333,0.968333,0.965,0.964667,0.003399,3
2,237.72257,2.476758,0.189264,0.005621,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.965,0.965,0.958333,0.97,0.965,0.964667,0.003712,3
5,424.962153,17.682073,0.165375,0.018714,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.966667,0.965,0.955,0.97,0.963333,0.964,0.005011,5


## Best Gradient Boosting Case

In this section of the code, we are using the best parameter combination determined from the sections above and getting the fit time and pred time which will be later used to compare with different classifers.

#### Splitting Training and Testing Data Set

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

In [None]:
#### Vectorizing Training and Testing Data Sets

In [11]:
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,5028,5029,5030,5031,5032,5033,5034,5035,5036,5037
0,36,5.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,59,5.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,22,31.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21,4.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,47,4.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Evaluating the model

In [12]:
import time
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 228.894 / Predict time: 0.202 ---- Precision: 0.9 / Recall: 0.863 / Accuracy: 0.972
