### 1. Import Packages

In [1]:
import nltk
import numpy as np
import pandas as pd
import os, sys

### 2. Import data and libraries 

In [2]:
df = pd.read_csv('./emails.train.csv')
#df_pos = df[df['spam']==1]

from nltk.tokenize import RegexpTokenizer 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re

from sklearn.metrics import average_precision_score, accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline

### 3. Defining

In [3]:
re_tokenizer = RegexpTokenizer(r'\b[^\d\W]+\b') # tokenize words that are not numbers
mystopwords = set(stopwords.words('english')) #stop words remover
extras = set(['_', 'subject'])
mystopwords.update(extras)
#ps = PorterStemmer() #stemming words
#stemmer = SnowballStemmer("english") #stemming words
lemmatizer = WordNetLemmatizer() #lemmatizing words

### 4. TFIDF

In [7]:
import sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.gaussian_process import GaussianProcessClassifier

# training data dirty 
data = pd.read_csv('./emails.train.csv')

def lemmi(df):
    # lemmatize 
    for i, line in enumerate(df['text']): # so for every line in the email data, df input variable/placeholder
        newline = [] #create new array, to put in the new lemmatized words
        words = line.split() # split the lines into words according to spaces, one string that consists out of multiple words
        for word in words: # for every word in that collection
            word = lemmatizer.lemmatize(word) #replace the word with the lemmatized version of the word
            newline.append(word) # add this into the array
        new = ' '.join(newline) # picks every element of the array and joins them in one string with a space in between, we want to replace the orginal line in the file. 
        df.loc[i, 'text'] = new # you replace the line at the index position (currently looking at) with the new line that is the lemmatized words

        
# This is just a test use
lemmi(data) # use the function to the lemmatized version, data is my actual input        

# Vectorizer 
tvec = TfidfVectorizer(stop_words=mystopwords, tokenizer=re_tokenizer.tokenize, max_features = 100) #min_df=.0025, max_df=.1 starts ignoring the frequent words
X = tvec.fit_transform(data['text']) 
print(tvec.vocabulary_)
#print(X)

{'company': 7, 'market': 49, 'information': 36, 'good': 29, 'make': 47, 'business': 2, 'list': 44, 'use': 90, 'management': 48, 'system': 81, 'see': 75, 'day': 14, 'look': 45, 'new': 56, 'take': 82, 'time': 87, 'request': 72, 'order': 61, 'fax': 22, 'e': 16, 'mail': 46, 'com': 5, 'message': 52, 'get': 28, 'best': 1, 'one': 59, 'research': 73, 'service': 78, 'know': 41, 'price': 67, 'ha': 31, 'risk': 74, 'work': 96, 'may': 50, 'gas': 27, 'well': 95, 'also': 0, 'u': 88, 'forward': 24, 'number': 58, 'project': 69, 'need': 55, 'email': 19, 'wa': 92, 'like': 43, 'want': 93, 'would': 97, 'please': 63, 'thank': 84, 'regard': 71, 'contact': 9, 'name': 54, 'year': 99, 'thanks': 85, 'help': 32, 'j': 38, 'http': 35, 'let': 42, 'p': 62, 'free': 26, 'group': 30, 'www': 98, 'date': 13, 'send': 76, 'program': 68, 'next': 57, 'could': 11, 'power': 65, 'think': 86, 'question': 70, 'th': 83, 'development': 15, 'university': 89, 'houston': 34, 'call': 3, 'week': 94, 'energy': 20, 'option': 60, 'communica

### Define new labels : using cross-validation

In [8]:
# In this cell I clean the dirty version + feature extraction etc. Cross-validation: you can pretend that you have a different training set which you can evaluate your model on
train = pd.read_csv('./emails.train.csv')
test  = pd.read_csv('./emails.test.csv')

lemmi(train)
lemmi(test)

# Get labels
subtrain_X, subval_X = train_test_split(train, test_size = 0.25)

Y_train = subtrain_X['spam']
Y_test  = subval_X['spam']

tfidf = TfidfVectorizer(stop_words=mystopwords, tokenizer=re_tokenizer.tokenize, max_features = 100)

#print("Extracting feature for train ...")
X_train = tfidf.fit_transform(subtrain_X['text'])


#print("Extracting feature for test ...")
X_test  = tfidf.fit_transform(subval_X[ 'text'])

### Models

#### Model 1: SVM

In [9]:
model = SVC()

model.fit(X_train, Y_train) # fit is for our training data, the extracted bag of words

def eval(model, X_test, Y_test, method=''):
    print("====== Performance of: {method} =======".format(method=method))
    
    # Predict decision labels.
    Y_pred  = model.predict(X_test)  #we need this 
    print("Metric[{metric:20s}]  {score:-3f}".format( metric="Accuracy", 
                                              score=accuracy_score(Y_test, Y_pred)) )

    # Predict confidence scores.
    Y_score = model.decision_function(X_test)    
    print("Metric[{metric:20s}]  {score:-3f}".format( metric="Average Precision", 
                                              score=average_precision_score(Y_test, Y_score)) )

    # write to submit format
    outf = 'solution.%s.csv'% method
    with open( outf, 'w') as f:
        f.write('id,spam\n')
        for i in range(len(Y_pred)):
            # print test['id'][0]
            f.write('%s,%s\n' % (test['id'][i], Y_pred[i]) )
    print("[output] "+outf)
    
    
# evaluate current model
eval(model, X_test, Y_test, method='SVM')

Metric[Accuracy            ]  0.769384
Metric[Average Precision   ]  0.347246
[output] solution.SVM.csv


#### Model 2: KNeighborsClassifier

In [10]:
melinda = KNeighborsClassifier(n_neighbors=8, n_jobs=1) # how many calculations it can do at the same time 
melinda.fit(X_train, Y_train) # fit is for our training data, the extracted bag of words

Y_pred = melinda.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.67892644135188862

In [11]:
clf = KNeighborsClassifier()
params = {
    'n_neighbors': [5,10,14], 
    'p': [1, 2] # different 
}

grid_search = GridSearchCV(clf,params, n_jobs=-1)

grid_search.fit(X_train, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': [5, 10, 14], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
print (grid_search.best_params_)
print (grid_search.best_score_)

{'n_neighbors': 5, 'p': 2}
0.910447761194


#### Model 3: RandomForestClassifier

In [13]:
params = {
    'n_estimators': [10, 20, 50], 
    'max_depth': [1, 20, 5]
}
clf = RandomForestClassifier(params) # initializes the randomforest classifier

grid_search = GridSearchCV(clf,params, n_jobs=-1)

grid_search.fit(X_train, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0,
            n_estimators={'n_estimators': [10, 20, 50], 'max_depth': [1, 20, 5]},
            n_jobs=1, oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 20, 50], 'max_depth': [1, 20, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
print (grid_search.best_params_)
print (grid_search.best_score_)

{'max_depth': 20, 'n_estimators': 50}
0.960530679934


In [15]:
#use to combine my estimators 
grid_search.predict_proba(X_test) # it gives you the probability for each class. The first email 0 or 1 no spam or spam

array([[ 0.97951711,  0.02048289],
       [ 0.96447284,  0.03552716],
       [ 0.92456929,  0.07543071],
       ..., 
       [ 0.61055408,  0.38944592],
       [ 0.95986799,  0.04013201],
       [ 0.88393572,  0.11606428]])

In [16]:
# subtrain_X, subval_X = train_test_split(train, test_size = 0.25) # size of the testing set split the original training set 0.75 and validtion set is 0.25

# print(train.shape)
# print(subtrain_X.shape) # this prints the 75 %
# print(subval_X) #without the shape (rows, columns)

In [None]:
pipeline = Pipeline([
    ('vectorizer',  CountVectorizer()),
    ('classifier',  MultinomialNB()) ])

pipeline.fit(data['text'].values, data['class'].values
pipeline.predict(examples) # ['spam', 'ham']