In [1]:
import numpy as np
from tqdm import tqdm

import scipy as sp
from scipy import spatial
import matplotlib.pyplot as plt


import matplotlib.pyplot as plt # side-stepping mpl backend

from nltk.stem.lancaster import LancasterStemmer
from sklearn.model_selection import cross_validate
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import pairwise
from sklearn.feature_selection import SelectPercentile, f_classif
from scipy.sparse import csr_matrix
import heapq
import string
import re

from nltk.corpus import stopwords
from collections import defaultdict
from collections import Counter
from nltk.tokenize import word_tokenize


vectorizer = CountVectorizer()
st = LancasterStemmer()

In [2]:
#open file and read the lines in to an array

with open("TestData/train.dat", "r") as fh:
    Train_lines = fh.readlines()
    
with open("TestData/test.dat", "r") as fh:
    Test_lines = fh.readlines()



In [3]:
# shuffle the training set 

import random
random.shuffle(Train_lines)
Data_lines = Train_lines + Test_lines

In [4]:
#fined train labels array

Train_Docs = [doc.split() for doc in Train_lines ]

labels =np.zeros((len(Train_Docs,)))
for i in range(len(Train_Docs)):
    labels[i] = Train_Docs[i][0]

In [5]:
#stopwords, punctuations and numbers remove for data
stops = set(stopwords.words('english'))
Data_lines_pp=[]
for line in Data_lines:
    newLine = []
    for w in line.split():
        if w.lower()not in stops:
            newLine.append(w.lower())     #for stopwords
            finalLine = " ".join(newLine)
            finalLine = finalLine.translate(str.maketrans('','',string.punctuation)) #for punctuation
            finalLine = re.sub("\d+", " ", finalLine)# for numbers
    Data_lines_pp.append(finalLine)

print ("\n after cleanup: ",Data_lines_pp[0])


 after cleanup:    forget it first time   month used cup stuck entire hand pulled really fast snack went flying note  maybe filled   way tried thinking shell realize did did pushes top everything falls out great concept big work maybe older kids may work


In [6]:
#tokenize the cleaned data

Docs_tokenized = [doc.split() for doc in Data_lines_pp]    #split has the same result with word_tokenize becuse the data alredi preprossesd and remove all the un nessesary things

In [7]:
#reduced words to the root

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

porter = PorterStemmer()
stem_sentence = []
for w in range (len(Docs_tokenized)):
    for word in range (len(Docs_tokenized[w])):
        Docs_tokenized[w][word]=porter.stem(Docs_tokenized[w][word])

print(Docs_tokenized[0])

['forget', 'it', 'first', 'time', 'month', 'use', 'cup', 'stuck', 'entir', 'hand', 'pull', 'realli', 'fast', 'snack', 'went', 'fli', 'note', 'mayb', 'fill', 'way', 'tri', 'think', 'shell', 'realiz', 'did', 'did', 'push', 'top', 'everyth', 'fall', 'out', 'great', 'concept', 'big', 'work', 'mayb', 'older', 'kid', 'may', 'work']


In [8]:
#minword is the minimum length of the word to keep

def filterwords(docs, minword):
    return [ [t for t in d if len(t) >= minword ] for d in docs ]
Docs_tokenized = filterwords(Docs_tokenized, 4)

In [9]:
# get a frequency count for all words in the Training docs

from tqdm import tqdm

def getWordset(doc):
    wordset_train = set()
    wordsCount_train = Counter()
    for d in range(len(doc)):
        for w in doc[d]:
            if w not in wordset_train :
                wordset_train.add(w)
                wordsCount_train[w] = 1
            else:
                wordsCount_train[w] += 1
    return wordsCount_train, wordset_train

wordsCount_train, wordset_train = getWordset(Docs_tokenized[0:18506])
wordsCount_test, wordset_test = getWordset(Docs_tokenized[18506:])

print("Number of words in train:" ,len(wordsCount_train), wordsCount_train )
print("Number of words in test:" , len(wordsCount_test), wordsCount_test )



In [10]:
#dimentionality reduction

def dimention_reduction(top_percent_train_word, wordsCount_train):

    top_percent_train_word = wordsCount_train.most_common(int(round(len(wordsCount_train)*top_percent_train_word)))
    wordset_train_common = set()
    for word in top_percent_train_word:
        wordset_train_common.add(word[0])
    return wordset_train_common

wordset_train_common = dimention_reduction( 0.4, wordsCount_train)
wordset_test_common = dimention_reduction( 0.4, wordsCount_test)
words = []
common_words = set()
common_words = set.intersection(wordset_train_common , wordset_test_common)
words = list(common_words)
numOfwocab=len(common_words)
print("Number of common words in both train and test:",len(common_words),words)

Number of common words in both train and test: 7079 ['infantino', 'kindli', 'advantag', 'anytim', 'brother', 'screen', 'trauma', 'about', 'someth', 'workupd', 'reserv', 'toddl', 'downhil', 'worn', 'wireless', 'unsturdi', 'glimps', 'ashley', 'remaind', 'thermomet', 'regulars', 'slant', 'vomit', 'breakabl', 'duffel', 'nosefrida', 'wing', 'outweigh', 'scalp', 'scent', 'disturb', 'exert', 'twoon', 'breastmilk', 'nightmar', 'foam', 'jail', 'smiley', 'styrofoam', 'inclus', 'aspect', 'grey', 'yell', 'alreadi', 'lieu', 'wors', 'messag', 'rapid', 'babbl', 'process', 'scrach', 'dine', 'commut', 'wayth', 'vote', 'sleeper', 'manuv', 'sheep', 'innoc', 'registri', 'vicin', 'pray', 'visitor', 'clutter', 'itwhat', 'ceram', 'drier', 'chart', 'cheep', 'destroy', 'tshirt', 'heartbeat', 'gentli', 'teether', 'advertis', 'reek', 'suit', 'wearer', 'painth', 'breastf', 'brainer', 'vent', 'hearth', 'document', 'rewrap', 'necklin', 'ensembl', 'deterg', 'umbil', 'backward', 'coverth', 'playmat', 'ineffect', 'twi

In [11]:
featureList = words
print(featureList)

['infantino', 'kindli', 'advantag', 'anytim', 'brother', 'screen', 'trauma', 'about', 'someth', 'workupd', 'reserv', 'toddl', 'downhil', 'worn', 'wireless', 'unsturdi', 'glimps', 'ashley', 'remaind', 'thermomet', 'regulars', 'slant', 'vomit', 'breakabl', 'duffel', 'nosefrida', 'wing', 'outweigh', 'scalp', 'scent', 'disturb', 'exert', 'twoon', 'breastmilk', 'nightmar', 'foam', 'jail', 'smiley', 'styrofoam', 'inclus', 'aspect', 'grey', 'yell', 'alreadi', 'lieu', 'wors', 'messag', 'rapid', 'babbl', 'process', 'scrach', 'dine', 'commut', 'wayth', 'vote', 'sleeper', 'manuv', 'sheep', 'innoc', 'registri', 'vicin', 'pray', 'visitor', 'clutter', 'itwhat', 'ceram', 'drier', 'chart', 'cheep', 'destroy', 'tshirt', 'heartbeat', 'gentli', 'teether', 'advertis', 'reek', 'suit', 'wearer', 'painth', 'breastf', 'brainer', 'vent', 'hearth', 'document', 'rewrap', 'necklin', 'ensembl', 'deterg', 'umbil', 'backward', 'coverth', 'playmat', 'ineffect', 'twin', 'squeaki', 'even', 'nicer', 'june', 'footi', 'gi

In [12]:
from collections import Counter
from scipy.sparse import csr_matrix
def build_matrix(docs):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  
    """
    dim = len(featureList)
    feature_set = set(featureList[:dim])
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    for d in docs:
        set_d = set(d)
        
        d = list(set.intersection(feature_set,set_d))
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
        
    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        set_d = set(d)
        
        d = list(set.intersection(feature_set,set_d))
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1
            
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print out info about this CSR matrix. If non_empy, 
    report number of non-empty rows and cols as well
    """
    if non_empy:
        print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
                name, mat.shape[0], 
                sum(1 if mat.indptr[i+1] > mat.indptr[i] else 0 
                for i in range(mat.shape[0])), 
                mat.shape[1], len(np.unique(mat.indices)), 
                len(mat.data)))
    else:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, 
                mat.shape[0], mat.shape[1], len(mat.data)) )

In [13]:
#print(linesOfTrainData[:1])
matOflinesOfTrainData  = build_matrix(Docs_tokenized)
csr_info(matOflinesOfTrainData)

#print (matOflinesOfTrainData[:1])

#print(linesOfTestData[:1])
#matOflinesOfTestData  = build_matrix(Docs_tokenized[18506:])
#csr_info(matOflinesOfTestData)
#print (matOflinesOfTestData[:1])

 [nrows 18506, ncols 7079, nnz 584521]
 [nrows 18506, ncols 7079, nnz 581300]


In [14]:
def csr_idf(mat, copy=False, **kargs):
    r""" Scale a CSR matrix by idf. 
    Returns scaling factors as dict. If copy is True, 
    returns scaled matrix and scaling factors.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # document frequency
    df = defaultdict(int)
    for i in ind:
        df[i] += 1
    # inverse document frequency
    for k,v in df.items():
        df[k] = np.log(nrows / float(v))  ## df turns to idf - reusing memory
    # scale by idf
    for i in range(0, nnz):
        val[i] *= df[ind[i]]
        
    return df if copy is False else mat

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm. 
    If copy is True, returns a copy of the normalized matrix.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize
    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
            
    if copy is True:
        return mat

In [15]:
mat2linesOfTrainData = csr_idf(matOflinesOfTrainData, copy=True)
mat3linesOfTrainData = csr_l2normalize(mat2linesOfTrainData, copy=True)
print("mat1:", matOflinesOfTrainData[15,:20].todense(), "\n")
print("mat2:", mat2linesOfTrainData[15,:20].todense(), "\n")
print("mat3:", mat3linesOfTrainData[15,:20].todense())

mat1: [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]] 

mat2: [[0.         0.         0.         0.         0.         1.52209787
  0.         0.         0.         0.         0.         0.
  3.4474241  0.         0.         0.         0.         0.
  0.         0.        ]] 

mat3: [[0.         0.         0.         0.         0.         0.06931088
  0.         0.         0.         0.         0.         0.
  0.15698334 0.         0.         0.         0.         0.
  0.         0.        ]]


In [16]:
mat2linesOfTestData = csr_idf(matOflinesOfTestData, copy=True)
mat3linesOfTestData = csr_l2normalize(mat2linesOfTestData, copy=True)
print("mat1:", matOflinesOfTestData[15,:20].todense(), "\n")
print("mat2:", mat2linesOfTestData[15,:20].todense(), "\n")
print("mat3:", mat3linesOfTrainData.shape)



mat1: [[0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1.]] 

mat2: [[0.         0.         6.99263694 0.         0.         0.
  2.01306746 0.         2.46003745 0.         1.02754656 0.
  3.23967863 1.31025837 0.         2.6790781  0.         0.
  0.         2.56673415]] 

mat3: (18506, 7079)


In [17]:
linesOfTrainData_Transformed = mat3linesOfTrainData
linesOfTestData_Transformed = mat3linesOfTestData



In [18]:
print(Docs_tokenized[0:1]) 
print('-------')
print(linesOfTrainData_Transformed[:1])

[['forget', 'first', 'time', 'month', 'stuck', 'entir', 'hand', 'pull', 'realli', 'fast', 'snack', 'went', 'note', 'mayb', 'fill', 'think', 'shell', 'realiz', 'push', 'everyth', 'fall', 'great', 'concept', 'work', 'mayb', 'older', 'work']]
-------
  (0, 0)	0.17187132773259078
  (0, 1)	0.08569295831392164
  (0, 2)	0.08803094546632122
  (0, 3)	0.23123862747457385
  (0, 4)	0.13107249745684416
  (0, 5)	0.08952626896594512
  (0, 6)	0.0890765570316501
  (0, 7)	0.11451434394801159
  (0, 8)	0.2120530443464109
  (0, 9)	0.15684078382900046
  (0, 10)	0.19194598883484776
  (0, 11)	0.23221894487450612
  (0, 12)	0.20276949582707277
  (0, 13)	0.28685915382965677
  (0, 14)	0.25807851966833545
  (0, 15)	0.3030887790460169
  (0, 16)	0.2772584671644804
  (0, 17)	0.2699383610546127
  (0, 18)	0.22626953729060223
  (0, 19)	0.2268681957149304
  (0, 20)	0.18950507084190976
  (0, 21)	0.19006562481503295
  (0, 22)	0.1715767969097339
  (0, 23)	0.22350627028213133
  (0, 24)	0.09803505611180247


In [19]:
print(Docs_tokenized[18506:18507])
print('-------')
print(linesOfTestData_Transformed[:1])

[['perfect', 'parent', 'keep', 'track', 'babi', 'feed', 'sleep', 'diaper', 'chang', 'schedul', 'first', 'half', 'month', 'life', 'made', 'life', 'easier', 'doctor', 'would', 'question', 'habit', 'right', 'there']]
-------
  (0, 0)	0.14250471364188513
  (0, 1)	0.19250073305254006
  (0, 2)	0.39046189211509646
  (0, 3)	0.15313637485766463
  (0, 4)	0.39046189211509646
  (0, 5)	0.3173171849111358
  (0, 6)	0.11240768510131299
  (0, 7)	0.15008280843531605
  (0, 8)	0.1373660443257761
  (0, 9)	0.08466161400599932
  (0, 10)	0.05737717820280218
  (0, 11)	0.10978233820833934
  (0, 12)	0.18090043255434657
  (0, 13)	0.07316352441410422
  (0, 14)	0.1815979448149081
  (0, 15)	0.14959705680747434
  (0, 16)	0.3114339585033717
  (0, 17)	0.22485014114493354
  (0, 18)	0.1755434090941646
  (0, 19)	0.1433238824471274
  (0, 20)	0.31821062805874084
  (0, 21)	0.20840683188365344


In [41]:
#crate an obj of my KNN 

from models import KNN

knn = KNN(275)
knn.train(linesOfTrainData_Transformed.toarray(), labels)
knn.find_dist(linesOfTestData_Transformed.toarray())

array([[0.76554198, 0.15855099, 0.05833414, ..., 0.06159485, 0.020401  ,
        0.01164489],
       [0.03100177, 0.5078886 , 0.54219395, ..., 0.04824506, 0.03107563,
        0.0533304 ],
       [0.03293847, 0.01931979, 0.21655645, ..., 0.        , 0.        ,
        0.04065774],
       ...,
       [0.09820264, 0.06842941, 0.        , ..., 0.        , 0.        ,
        0.02209947],
       [0.04085774, 0.04922328, 0.03550821, ..., 0.        , 0.00674966,
        0.04303199],
       [0.04025063, 0.0760208 , 0.05531158, ..., 0.        , 0.01386395,
        0.01697461]])

In [42]:
#predict the labels
final_labels = knn.predict(linesOfTrainData_Transformed.toarray())

In [43]:
#write on file

f = open('TestData/format.dat', 'w')
count = 0
print(len(final_labels),final_labels[-20:] )
for index in range(len(final_labels)):
    
    if  final_labels[index] == 1 :
        f.write('+1\n')
        count+=1
    else:
        f.write('-1\n')
        count+=1

print("count : ",count)
print("--The End--")

18506 [ 1.  1. -1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1. -1. -1.  1.  1.
  1.  1.]
count :  18506
--The End--


In [44]:
with open('TestData/format.dat', 'r') as fhr:
    d = fhr.readlines()
print(len(d))

18506


In [30]:
import sklearn.model_selection as model_selection
def randomchek(linesOfTrainData_Transformed, labels):
    X_train, X_test, y_train, y_test = model_selection.train_test_split(linesOfTrainData_Transformed, labels, train_size=0.65,test_size=0.35, random_state=0)
    return X_train, X_test, y_train, y_test

In [31]:
# cross validation k=5


from scipy.sparse import coo_matrix, vstack
from sklearn.model_selection import train_test_split

G=linesOfTrainData_Transformed.toarray()
H=labels

X_test=np.zeros((5,3701,numOfwocab))
y_test=np.zeros((5,3701,))
X_train=np.zeros((5,14805,numOfwocab))
y_train=np.zeros((5,14805,))

start=0
stop=3701
for i in range(5):
    X_test[i], y_test[i] = G[start:stop],H[start:stop]
    X_train[i] ,y_train[i] = np.delete(G,np.arange(start,stop),0),np.delete(H,np.arange(start,stop))
    start+=3701
    stop+=3701

In [32]:
def get_acc(pred, y_test):
    return np.sum(y_test==pred)/len(y_test)*100

In [45]:
for i in tqdm(range(5)):
    knn = KNN(501)
    knn.train(X_train[i], y_train[i])
    knn.find_dist(X_test[i])
    pred = knn.predict(X_test[i])
    print(get_acc(pred, y_test[i]))


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:14<00:57, 14.35s/it][A

78.00594433936774



 40%|████      | 2/5 [00:28<00:42, 14.27s/it][A

77.9519048905701



 60%|██████    | 3/5 [00:43<00:29, 14.54s/it][A

79.27587138611186



 80%|████████  | 4/5 [00:58<00:14, 14.61s/it][A

77.46554985139151



100%|██████████| 5/5 [01:14<00:00, 14.83s/it][A

78.8975952445285



