# K-Nearest Neighbors (K-NN)

### 參考課程實作並在datasets_483_982_spam.csv的資料集中獲得90% 以上的 accuracy (testset)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re

## Importing the dataset

In [17]:
# Load the dataset, keeping only the first two CSV columns
# (displayed below as `v1` = label and `v2` = message text).
dataset = pd.read_csv(
    r'datasets_483_982_spam.csv',
    usecols=[0, 1],
    encoding='latin-1',
)

# Encode the label column as integers: 'ham' becomes 0, any other value becomes 1.
dataset['v1'] = dataset['v1'].map(lambda label: int(label != 'ham'))
dataset 

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


### 取出訓練內文與標註

In [18]:
# Column 1 is the raw message text (model input); column 0 is the 0/1 label.
X = dataset.iloc[:, 1].to_numpy()
Y = dataset.iloc[:, 0].to_numpy()

In [19]:
# Preview the first five raw messages.
print(f'Training Data Examples : \n{X[:5]}')

Training Data Examples : 
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 'U dun say so early hor... U c already then say...'
 "Nah I don't think he goes to usf, he lives around here though"]


In [20]:
# Preview the labels that belong to those first five messages.
print(f'Labeling Data Examples : \n{Y[:5]}')

Labeling Data Examples : 
[0 0 1 0 0]


### 文字預處理

In [25]:
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

import nltk

# Fetch every NLTK resource the preprocessing in this notebook relies on, not
# just the stop-word list: `nltk.word_tokenize` needs the Punkt models,
# `nltk.pos_tag` needs the averaged-perceptron tagger and `WordNetLemmatizer`
# needs the WordNet corpus. Without them a fresh environment raises
# LookupError the first time `clean_content` runs.
# NOTE(review): recent NLTK releases may expect `punkt_tab` and
# `averaged_perceptron_tagger_eng` instead -- confirm against the installed version.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

# English rendering of the note below: you may clean the text the way the
# course exercise does, or use an approach of your own.
"""可以參考課程練習方式清理文字，或是使用自己的方式"""

# Create the lemmatizer used to reduce each word to its dictionary base form
lemmatizer = WordNetLemmatizer()

from functools import lru_cache


@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag is upper-cased and mapped to a WordNet constant
    (J -> ADJ, N -> NOUN, V -> VERB, R -> ADV). Any other first letter falls
    back to ``wordnet.NOUN``.

    Results are memoized per word: `clean_content` calls this function for
    every token occurrence, so caching avoids re-running the tagger on a word
    that has already been tagged.

    Args:
        word: A single token (string) to classify.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``.
    """
    # pos_tag returns [(word, tag)]; keep only the tag's first character.
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def clean_content(X):
    """Normalize a collection of raw text messages for vectorization.

    Each message goes through the same pipeline:
      1. every character that is not an ASCII letter is replaced by a space
         and the text is lower-cased;
      2. the text is tokenized with ``nltk.word_tokenize``;
      3. English stop words are dropped;
      4. each remaining token is lemmatized using the part of speech
         reported by ``get_wordnet_pos``;
      5. the surviving lemmas are joined back together with single spaces.

    Args:
        X: Iterable of message strings.

    Returns:
        A list with one cleaned string per input message, in input order.
    """
    english_stop_words = set(stopwords.words('english'))

    def normalize(message):
        # Steps 1-2: letters only, lower case, then tokenize.
        letters_only = re.sub('[^a-zA-Z]', ' ', message).lower()
        tokens = nltk.word_tokenize(letters_only)
        # Steps 3-4: skip stop words, lemmatize everything else.
        lemmas = (
            lemmatizer.lemmatize(token, get_wordnet_pos(token))
            for token in tokens
            if token not in english_stop_words
        )
        # Step 5: back to a single space-separated string.
        return ' '.join(lemmas)

    return [normalize(message) for message in X]
                 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\03950\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Replace the raw messages with their cleaned, space-joined token strings.
# NOTE: X is overwritten in place, so this cell expects X to still hold the
# raw messages taken from `dataset` when it runs.
X = clean_content(X)

### Bag of words

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# max_features sets how many columns (vocabulary terms) are built; the terms
# are selected by how frequently they occur.
# NOTE(review): the vocabulary is fitted on every message before the
# train/test split performed further down, so test messages influence which
# terms are kept -- consider fitting on the training split only.
cv = CountVectorizer(max_features=2000)
X = (
    cv.fit_transform(X)  # sparse document-term count matrix
      .toarray()         # dense array consumed by the later cells
)

In [29]:
# Sanity check: (number of messages, number of vocabulary columns).
X.shape

(5572, 2000)

## Splitting the dataset into the Training set and Test set

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

## Training the K-NN model on the Training set

In [39]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts, each scored with 10-fold cross-validation on
# the training split only.
n_neighbors = [3, 5]

for k in n_neighbors:
    classifier = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    accuracies = cross_val_score(classifier, X_train, y_train, cv=10, n_jobs=-1)

    # Report k, the mean fold accuracy and its standard deviation across folds.
    print(
        f'K:{k}',
        f'Average Acc: {accuracies.mean()}',
        f'Average std: {accuracies.std()}',
        sep='\n',
    )


K:3
Average Acc: 0.9371758956013503
Average std: 0.005516966915019732
K:5
Average Acc: 0.9241648611880888
Average std: 0.008319957077179942


In [40]:
# Train the final model on the whole training split with k = 3, the candidate
# with the higher mean cross-validation accuracy above.
classifier = KNeighborsClassifier(n_neighbors=3, n_jobs=-1).fit(X_train, y_train)
classifier

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
                     weights='uniform')

## Evaluating accuracy on the training and test sets

In [41]:
# Accuracy on the data the model was fitted on (an optimistic estimate).
print(f'Trainset Accuracy: {classifier.score(X_train, y_train)}')

Trainset Accuracy: 0.9649988781691721


In [42]:
# Accuracy on the held-out 20% test split.
print(f'Testset Accuracy: {classifier.score(X_test, y_test)}')

Testset Accuracy: 0.9327354260089686


## Predicting the Test set results

In [43]:
# Predicted 0/1 label for every test message; reused by the confusion matrix cell.
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [44]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Rows are the true classes and columns the predicted classes
# (label 0 = ham first, then label 1).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Overall test accuracy, shown as the cell's output value.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[949   0]
 [ 75  91]]


0.9327354260089686