In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Location of the spam7 CSV (Rdatasets repository, csv/DAAG folder).
SPAM7_CSV_URL = 'https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/DAAG/spam7.csv'

# Read the raw table straight from the URL into a DataFrame.
df = pd.read_csv(SPAM7_CSV_URL)

In [3]:
# Peek at the first three rows to see the raw columns as loaded.
df.head(3)

Unnamed: 0.1,Unnamed: 0,crl.tot,dollar,bang,money,n000,make,yesno
0,1,278,0.0,0.778,0.0,0.0,0.0,y
1,2,1028,0.18,0.372,0.43,0.43,0.21,y
2,3,2259,0.184,0.276,0.06,1.16,0.06,y


In [4]:
def spam_to_num(value):
    """Encode a spam label as an integer flag.

    Returns 1 when ``value`` equals ``'y'`` and 0 for any other value.
    """
    return 1 if value == 'y' else 0

In [5]:
# Derive the numeric target column from the 'yesno' labels.
# The helper is passed directly; wrapping it in a lambda added nothing.
df['spam'] = df['yesno'].apply(spam_to_num)

In [6]:
# Remove the 'Unnamed: 0' column (1, 2, 3, ... in the preview) and the raw text
# label, which the numeric `spam` column replaces.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run once the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'yesno'], errors='ignore')

In [7]:
# Re-check the first rows after adding `spam` and dropping the unused columns.
df.head(3)

Unnamed: 0,crl.tot,dollar,bang,money,n000,make,spam
0,278,0.0,0.778,0.0,0.0,0.0,1
1,1028,0.18,0.372,0.43,0.43,0.21,1
2,2259,0.184,0.276,0.06,1.16,0.06,1


In [8]:
# Separate the base predictors from the target.
# The predictor frame is built once here; recomputing df.drop(...) inside the
# loops would copy the whole table on every iteration.
base_features = df.drop(columns=['spam'])
y = df['spam']

# Start from the base predictors and append one product column per unordered
# pair of base columns, named '<first>_<second>' in the original column order.
X = base_features.copy()
base_columns = list(base_features.columns)
for position, first in enumerate(base_columns):
    # Pairing only with later columns yields each pair exactly once, so no
    # "reverse name already present" membership check is required.
    for second in base_columns[position + 1:]:
        X[first + '_' + second] = base_features[first] * base_features[second]

In [9]:
# Show the assembled feature matrix (base columns plus pairwise product columns).
X

Unnamed: 0,crl.tot,dollar,bang,money,n000,make,crl.tot_dollar,crl.tot_bang,crl.tot_money,crl.tot_n000,...,dollar_bang,dollar_money,dollar_n000,dollar_make,bang_money,bang_n000,bang_make,money_n000,money_make,n000_make
0,278,0.000,0.778,0.00,0.00,0.00,0.000,216.284,0.00,0.00,...,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.0000,0.0000
1,1028,0.180,0.372,0.43,0.43,0.21,185.040,382.416,442.04,442.04,...,0.066960,0.07740,0.07740,0.03780,0.15996,0.15996,0.07812,0.1849,0.0903,0.0903
2,2259,0.184,0.276,0.06,1.16,0.06,415.656,623.484,135.54,2620.44,...,0.050784,0.01104,0.21344,0.01104,0.01656,0.32016,0.01656,0.0696,0.0036,0.0696
3,191,0.000,0.137,0.00,0.00,0.00,0.000,26.167,0.00,0.00,...,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.0000,0.0000
4,191,0.000,0.135,0.00,0.00,0.00,0.000,25.785,0.00,0.00,...,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,88,0.000,0.000,0.00,0.00,0.31,0.000,0.000,0.00,0.00,...,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.0000,0.0000
4597,14,0.000,0.353,0.00,0.00,0.00,0.000,4.942,0.00,0.00,...,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.0000,0.0000
4598,118,0.000,0.000,0.00,0.00,0.30,0.000,0.000,0.00,0.00,...,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.0000,0.0000
4599,78,0.000,0.000,0.00,0.00,0.96,0.000,0.000,0.00,0.00,...,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.0000,0.0000,0.0000


In [10]:
# Show the target Series taken from the `spam` column.
y

0       1
1       1
2       1
3       1
4       1
       ..
4596    0
4597    0
4598    0
4599    0
4600    0
Name: spam, Length: 4601, dtype: int64

In [11]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Baseline gradient-boosting classifier with its hyperparameters spelled out.
GBC = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    max_features=None,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    random_state=42,  # fixed seed so repeated fits give the same model
)

In [13]:
# Train the baseline model on the training split.
GBC.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42, subsample=1)

In [14]:
# Predict labels for the held-out rows with the baseline model.
y_pred = GBC.predict(X_test)

In [15]:
# Accuracy of the baseline predictions on the held-out split.
accuracy_score(y_test, y_pred)

0.8653637350705755

In [34]:
# Rank the features by the fitted baseline model's importance scores, highest first.
importances = pd.Series(GBC.feature_importances_, index=X.columns).sort_values(ascending=False)

# Bar chart with a title and axis label so the figure can be read on its own;
# the trailing semicolon suppresses the text repr of the last call.
ax = importances.plot(kind='bar', title='Gradient boosting feature importances')
ax.set_ylabel('Importance');

crl.tot_bang      0.614877
dollar            0.140770
dollar_bang       0.060567
bang              0.042572
crl.tot_money     0.040245
crl.tot           0.029716
n000              0.024777
crl.tot_dollar    0.008679
dollar_money      0.007222
crl.tot_make      0.006539
bang_money        0.006469
money             0.006090
dollar_n000       0.003422
bang_make         0.001965
dollar_make       0.001694
crl.tot_n000      0.001366
bang_n000         0.001176
make              0.001166
n000_make         0.000661
money_n000        0.000027
money_make        0.000000
dtype: float64

In [29]:
# List every column name in the feature matrix.
X.columns

Index(['crl.tot', 'dollar', 'bang', 'money', 'n000', 'make', 'crl.tot_dollar',
       'crl.tot_bang', 'crl.tot_money', 'crl.tot_n000', 'crl.tot_make',
       'dollar_bang', 'dollar_money', 'dollar_n000', 'dollar_make',
       'bang_money', 'bang_n000', 'bang_make', 'money_n000', 'money_make',
       'n000_make'],
      dtype='object')

In [45]:
# Hyperparameter grid: six learning rates crossed with eight ensemble sizes.
parameters = {
    'learning_rate': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'n_estimators': [100, 250, 500, 750, 1000, 1250, 1500, 1750],
}

# Grid search over `parameters`, scored by accuracy with cv=5, using n_jobs=-1.
GS = GridSearchCV(
    estimator=GBC,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [47]:
# Run the cross-validated grid search on the training split.
GS.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=GradientBoostingClassifier(random_state=42, subsample=1),
             n_jobs=-1,
             param_grid={'learning_rate': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                         'n_estimators': [100, 250, 500, 750, 1000, 1250, 1500,
                                          1750]},
             scoring='accuracy')

In [52]:
# Inspect the full parameter set of the best estimator found by the search.
GS.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.01,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1250,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': 42,
 'subsample': 1,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [54]:
# Show the best estimator itself.
GS.best_estimator_

GradientBoostingClassifier(learning_rate=0.01, n_estimators=1250,
                           random_state=42, subsample=1)

In [56]:
# Predict held-out labels through the fitted search object.
# Note: this rebinds `y_pred`, replacing the baseline model's predictions.
y_pred = GS.predict(X_test)

In [57]:
# Held-out accuracy of the grid-search predictions.
accuracy_score(y_test,y_pred)

0.8642779587404995

In [58]:
# Same evaluation via the search object's own score method; the recorded value
# matches the accuracy_score result for the grid-search predictions.
GS.score(X_test, y_test)

0.8642779587404995

# Consolidated tuning workflow: this cell repeats the split, baseline estimator,
# parameter grid, search and evaluation from the earlier cells in one place.

# Repeatable 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline estimator handed to the grid search.
GBC = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    max_features=None,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    random_state=42,
)

# Six learning rates crossed with eight ensemble sizes.
parameters = {
    'learning_rate': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'n_estimators': [100, 250, 500, 750, 1000, 1250, 1500, 1750],
}

# Accuracy-scored grid search (cv=5, n_jobs=-1), fitted on the training split.
GS = GridSearchCV(
    estimator=GBC,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
GS.fit(X_train, y_train)

# Evaluate the fitted search object on the held-out rows.
y_pred = GS.predict(X_test)
accuracy_score(y_test, y_pred)