# K-Nearest Neighbors (K-NN)

### 參考課程實作並在datasets_483_982_spam.csv的資料集中獲得90% 以上的 accuracy (testset)

## Importing the libraries

In [130]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re

## Importing the dataset

In [131]:
dataset = pd.read_csv(r'datasets_483_982_spam.csv', encoding = 'latin-1')
dataset

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [132]:
dataset = dataset[["v1", "v2"]] #remove unnecesary and empty columns 
dataset.columns = ["ham_or_spam", "text"] #rename columns
dataset #view dataset

Unnamed: 0,ham_or_spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [133]:
all_data = []
for items in dataset.itertuples():
  #print(items[1])
  #print(items[2])
  text = items[2]
  if items[1] == "ham":
    spam = 0
  else:
    spam = 1
  all_data.append([text, spam])

In [134]:
all_data = np.array(all_data)

### 取出訓練內文與標註

In [135]:
X = all_data[:,0]
Y = all_data[:,1].astype(np.uint8)

In [136]:
print('Training Data Examples : \n{}'.format(X[:5]))

Training Data Examples : 
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 'U dun say so early hor... U c already then say...'
 "Nah I don't think he goes to usf, he lives around here though"]


In [137]:
print('Labeling Data Examples : \n{}'.format(Y[:5]))

Labeling Data Examples : 
[0 0 1 0 0]


### 文字預處理

In [138]:
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("punkt")
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [139]:
lem = WordNetLemmatizer()

def get_wordnet_pos(word):
  #print(nltk.pos_tag([word])) output format "nice" = [('nice', 'JJ')])
  tag = nltk.pos_tag([word])[0][1][0].upper
  #[0][1][0] = first character in the POS string
  tag_dict = {
      "J":wordnet.ADJ,
      "N":wordnet.NOUN,
      "V":wordnet.VERB,
      "R":wordnet.ADV
  }
  return tag_dict.get(tag, wordnet.NOUN)

def clean_content(content):
  content_clean = [re.sub("[^a-zA-Z]", " ", x).lower() for x in content] #remove non alphabetic letters
  content_tokenize = [nltk.word_tokenize(words) for words in content_clean]  #tokenize
  content_stopword_lemmatizer = []
  stop_words = set(stopwords.words("english"))
  for word in content_tokenize:
    word_clean = []
    for w in word:
      if w not in stop_words:
        w = lem.lemmatize(w, get_wordnet_pos(w))
        word_clean.append(w)
    content_stopword_lemmatizer.append(word_clean)
  output = [' '.join(x) for x in content_stopword_lemmatizer]
  return output

In [140]:
X = clean_content(X)

### Bag of words

In [149]:
from sklearn.feature_extraction.text import CountVectorizer
#max_features是要建造幾個column，會按造字出現的高低去篩選 
cv=CountVectorizer(max_features = 2000)
X=cv.fit_transform(X).toarray()

AttributeError: ignored

In [142]:
X.shape

(5572, 2000)

## Splitting the dataset into the Training set and Test set

In [143]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

## Training the K-NN model on the Training set

In [144]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Predicting a new result

In [145]:
print('Trainset Accuracy: {}'.format(classifier.score(X_train, y_train)))

Trainset Accuracy: 0.9409916984518735


In [146]:
print('Testset Accuracy: {}'.format(classifier.score(X_test, y_test)))

Testset Accuracy: 0.9139013452914798


## Predicting the Test set results

In [147]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [148]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[949   0]
 [ 96  70]]


0.9139013452914798

##### oh don't mind me i'm just playing with some parameters here 

In [155]:
features = [500, 1000, 1500, 2000, 2500] #trying if different features would impact training scores
for num_feat in features:
  dataset = pd.read_csv(r'datasets_483_982_spam.csv', encoding = 'latin-1')
  dataset = dataset[["v1", "v2"]] #remove unnecesary and empty columns 
  dataset.columns = ["ham_or_spam", "text"] #rename columns
  all_data = []
  for items in dataset.itertuples():
    text = items[2]
    if items[1] == "ham":
      spam = 0
    else:
      spam = 1
    all_data.append([text, spam])
  all_data = np.array(all_data)
  X = all_data[:,0]
  Y = all_data[:,1].astype(np.uint8)
  X = clean_content(X)
  print("now trying {} features".format(num_feat))
  cv=CountVectorizer(num_feat)
  X=cv.fit_transform(X).toarray()
  X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
  classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
  classifier.fit(X_train, y_train)
  print('Trainset Accuracy: {}'.format(classifier.score(X_train, y_train)))
  print('Testset Accuracy: {}'.format(classifier.score(X_test, y_test)))
  y_pred = classifier.predict(X_test)
  cm = confusion_matrix(y_test, y_pred)
  print(cm)
  accuracy_score(y_test, y_pred)
  # *surprised pikachu face*

now trying 500 features
Trainset Accuracy: 0.9365043751402289
Testset Accuracy: 0.9085201793721973
[[949   0]
 [102  64]]
now trying 1000 features
Trainset Accuracy: 0.9365043751402289
Testset Accuracy: 0.9085201793721973
[[949   0]
 [102  64]]
now trying 1500 features
Trainset Accuracy: 0.9365043751402289
Testset Accuracy: 0.9085201793721973
[[949   0]
 [102  64]]
now trying 2000 features
Trainset Accuracy: 0.9365043751402289
Testset Accuracy: 0.9085201793721973
[[949   0]
 [102  64]]
now trying 2500 features
Trainset Accuracy: 0.9365043751402289
Testset Accuracy: 0.9085201793721973
[[949   0]
 [102  64]]


In [159]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = [5, 10, 25, 50, 100, 200, 500, 1000] ## 可自行嘗試不同K值
for k in n_neighbors:
	classifier = KNeighborsClassifier(n_neighbors = k, metric = 'minkowski', p = 2)
	# cv = 10 代表切成10等分
	accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10,n_jobs=-1)
	
	print('設置K值:{}'.format(k))
	print('Average Accuracy: {}'.format(accuracies.mean()))
	print('Accuracy STD: {}'.format(accuracies.std()))

設置K值:5
Average Accuracy: 0.919002871970575
Accuracy STD: 0.0072759624394765415
設置K值:10
Average Accuracy: 0.8907346198417898
Accuracy STD: 0.006333371713162194
設置K值:25
Average Accuracy: 0.8712132816042727
Accuracy STD: 0.0020847926038850264
設置K值:50
Average Accuracy: 0.8696432710233285
Accuracy STD: 0.0006565610639949747
設置K值:100
Average Accuracy: 0.8696432710233285
Accuracy STD: 0.0006565610639949747
設置K值:200
Average Accuracy: 0.8696432710233285
Accuracy STD: 0.0006565610639949747
設置K值:500
Average Accuracy: 0.8696432710233285
Accuracy STD: 0.0006565610639949747
設置K值:1000
Average Accuracy: 0.8696432710233285
Accuracy STD: 0.0006565610639949747
