# Gradient Boosting Machine Learning Model

### Gradiant Boosting is also type of ensamble learning which uses iterations and correcting previous mistakes

### Read in Raw Text, and perform all preprocessing steps we did before (cleaning the text, feature engineering and text vectorizing steps)

In [11]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import string
import os
base_path = 'datasets'
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv(os.path.join(base_path,"SMSSpamCollection.tsv"), sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
x_count = count_vect.fit_transform(data['body_text'])
x_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X_features,data['label'],test_size=0.2)

In [7]:
def train_gb(est,max_depth,lr):
    gb = GradientBoostingClassifier(n_estimators=est,max_depth=max_depth,learning_rate=lr)
    gb_model = gb.fit(x_train,y_train)
    y_pred = gb_model.predict(x_test)
    precision,recall,fscore,support = score(y_test,y_pred,pos_label='spam',average='binary')
    precision_r = round(precision,3)
    recall_r = round(recall,3)
    accuracy = round((y_pred==y_test).sum()/len(y_pred),3)
    print(f"Est: {est}, Depth {max_depth} ----- Precision: {precision_r} / Recall: {recall_r} / Accuracy: {accuracy}")

In [8]:
# Supress sklearn warnings (something about the column names not string)
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

for n_est in [50,100,150]:
    for max_depth in [3,7,11,15]:
        for lr in [0.01,0.1,1]:
            train_gb(n_est,max_depth,lr)

Est: 50, Depth 3 ----- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.882
Est: 50, Depth 3 ----- Precision: 0.928 / Recall: 0.682 / Accuracy: 0.956
Est: 50, Depth 3 ----- Precision: 0.904 / Recall: 0.788 / Accuracy: 0.965
Est: 50, Depth 7 ----- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.882
Est: 50, Depth 7 ----- Precision: 0.938 / Recall: 0.803 / Accuracy: 0.97
Est: 50, Depth 7 ----- Precision: 0.884 / Recall: 0.811 / Accuracy: 0.965
Est: 50, Depth 11 ----- Precision: 1.0 / Recall: 0.015 / Accuracy: 0.883
Est: 50, Depth 11 ----- Precision: 0.929 / Recall: 0.788 / Accuracy: 0.968
Est: 50, Depth 11 ----- Precision: 0.872 / Recall: 0.826 / Accuracy: 0.965
Est: 50, Depth 15 ----- Precision: 1.0 / Recall: 0.023 / Accuracy: 0.884
Est: 50, Depth 15 ----- Precision: 0.914 / Recall: 0.803 / Accuracy: 0.968
Est: 50, Depth 15 ----- Precision: 0.901 / Recall: 0.826 / Accuracy: 0.969
Est: 100, Depth 3 ----- Precision: 0.959 / Recall: 0.538 / Accuracy: 0.943
Est: 100, Depth 3 ----- Precision: 0.934 / R

In [10]:
# comparing TF-IDF models
from sklearn.model_selection import GridSearchCV
gb = GradientBoostingClassifier()
params = {
    'n_estimators':[100,150],
    'max_depth':[7,11,15],
    'learning_rate':[0.1]
}
gs = GridSearchCV(gb,params,cv=5,n_jobs=-1)
cv_fit = gs.fit(X_tfidf,data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values("mean_test_score",ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,30.661765,1.04895,0.009731,0.000374,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.970377,0.971275,0.967655,0.964061,0.96496,0.967665,0.002853,1
0,21.931205,0.924482,0.008831,0.001365,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.966786,0.972172,0.967655,0.964061,0.96496,0.967127,0.002827,2
5,46.368145,0.195729,0.013201,0.001326,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.968582,0.975763,0.966757,0.959569,0.964061,0.966946,0.005352,3
3,42.454853,1.020635,0.011398,0.000864,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.964991,0.974865,0.965858,0.962264,0.963163,0.966228,0.004503,4
4,35.691907,2.730993,0.011317,0.001964,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.967684,0.97307,0.96496,0.962264,0.962264,0.966048,0.004045,5


In [12]:
# comparing count vectorizers models
gb = GradientBoostingClassifier()
params = {
    'n_estimators':[100,150],
    'max_depth':[7,11,15],
    'learning_rate':[0.1]
}
gs = GridSearchCV(gb,params,cv=5,n_jobs=-1)
cv_fit = gs.fit(x_count_feat,data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values("mean_test_score",ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,221.36801,2.679248,0.356509,0.045779,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.967684,0.979354,0.97035,0.966757,0.966757,0.97018,0.004772,1
3,296.829251,7.646183,0.274738,0.030193,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.963196,0.977558,0.971249,0.968553,0.967655,0.969642,0.004732,2
5,291.10703,2.55844,0.194739,0.017965,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.964991,0.976661,0.969452,0.968553,0.966757,0.969283,0.003995,3
2,211.883046,5.562971,0.315983,0.042111,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.965889,0.977558,0.966757,0.967655,0.967655,0.969103,0.004278,4
0,142.760125,2.038098,0.342887,0.045026,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.963196,0.980251,0.969452,0.966757,0.964061,0.968743,0.006158,5
