In [1]:
import numpy as np
from sklearn import svm
from sklearn import preprocessing

In [2]:
# Notes on the SVM (support vector machine) settings used later in this notebook:
# kernel - kernel type passed to the classifier, e.g. 'linear' or 'rbf'
# C - penalty applied to classification errors (the regularization parameter)


# Display the NumPy version this notebook was run with.
np.__version__

'1.22.2'

In [3]:
def accuracy_score(true_labels, predicted_labels):
    """Return the fraction of predictions that match the true labels.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth labels.
    predicted_labels : array-like
        Predicted labels; must have the same shape as ``true_labels``.

    Returns
    -------
    numpy.float64
        Mean of the element-wise equality between the two inputs.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce to arrays so plain Python lists are accepted as well.
    true_arr = np.asarray(true_labels)
    predicted_arr = np.asarray(predicted_labels)

    # Without this check, `==` would broadcast mismatched shapes (for example
    # (n,) against (n, 1) yields an n x n matrix) and the mean of that matrix
    # would be reported as a plausible-looking but wrong accuracy.
    if true_arr.shape != predicted_arr.shape:
        raise ValueError(
            "true_labels and predicted_labels must have the same shape, "
            f"got {true_arr.shape} and {predicted_arr.shape}"
        )

    return (true_arr == predicted_arr).mean()

def normalize_data(train_data, test_data, type= None):
    """Scale the train and test feature matrices with the chosen method.

    Parameters
    ----------
    train_data : array-like
        Training features; the scaler is fitted on this data only.
    test_data : array-like
        Test features; transformed with the scaler fitted on ``train_data``.
    type : {None, 'standard', 'l1', 'l2'}, optional
        Scaling method. ``None`` returns both inputs unchanged,
        ``'standard'`` uses ``preprocessing.StandardScaler`` and
        ``'l1'`` / ``'l2'`` use ``preprocessing.Normalizer`` with that norm.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test)``, or the original inputs when
        ``type`` is ``None``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported values.
    """
    # NOTE: the parameter name `type` shadows the builtin; it is kept because
    # callers pass it by keyword.
    if type is None:
        return train_data, test_data

    if type == 'standard':
        scaler = preprocessing.StandardScaler()
    elif type in ('l1', 'l2'):
        scaler = preprocessing.Normalizer(norm=type)
    else:
        # Previously an unknown value fell through every branch and failed
        # later with an UnboundLocalError on `scaler`.
        raise ValueError(
            f"Unknown normalization type {type!r}; "
            "expected None, 'standard', 'l1' or 'l2'"
        )

    # Fit on the training data only, then apply the same transform to both.
    scaler.fit(train_data)
    return scaler.transform(train_data), scaler.transform(test_data)

In [4]:
# Load the tokenized sentences and their labels for the train and test splits.
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, and
# unpickling can execute arbitrary code - only load files from a trusted source.
train_data = np.load('data/training_sentences.npy', allow_pickle = True)
train_labels = np.load('data/training_labels.npy', allow_pickle = True)
test_data = np.load('data/test_sentences.npy', allow_pickle = True)
test_labels = np.load('data/test_labels.npy', allow_pickle = True)

# Quick look at what was loaded: the label array and the sentence array.
print(train_labels)
print(train_data)

[0 0 1 ... 0 0 0]
[list(['Probably', 'not', 'still', 'going', 'over', 'some', 'stuff', 'here'])
 list(['I', 'HAVE', 'A', 'DATE', 'ON', 'SUNDAY', 'WITH', 'WILL'])
 list(['Thanks', '4', 'your', 'continued', 'support', 'Your', 'question', 'this', 'week', 'will', 'enter', 'u', 'in2', 'our', 'draw', '4', 'Â£100', 'cash', 'Name', 'the', 'NEW', 'US', 'President', 'txt', 'ans', 'to', '80082'])
 ...
 list(['OH', 'FUCK', 'JUSWOKE', 'UP', 'IN', 'A', 'BED', 'ON', 'A', 'BOATIN', 'THE', 'DOCKS', 'SLEPT', 'WID', '25', 'YEAR', 'OLD', 'SPINOUT', 'GIV', 'U', 'DA', 'GOSSIP', 'L8R', 'XXX'])
 list(['NOT', 'MUCH', 'NO', 'FIGHTS', 'IT', 'WAS', 'A', 'GOOD', 'NITE'])
 list(['Did', 'he', 'just', 'say', 'somebody', 'is', 'named', 'tampa'])]


In [5]:
class BagOfWords:
    def __init__(self):
        self.voc = dict()
    
    def get_voc(self, train_data):
        for doc in train_data:
            for word in doc:
                if word not in self.voc:
                    self.voc[word] = len(self.voc)
    
    def get_features(self, data):
        features = np.zeros((len(data), len(self.voc)))
        for i, doc in enumerate(data):
            for word in doc:
                if word in self.voc:
                    features[i, self.voc[word]] += 1
        return features
                    
        

# Build the vocabulary from the training sentences only.
bow = BagOfWords()
bow.get_voc(train_data) 
                            
# Display the number of distinct words collected.
len(bow.voc)

9522

In [6]:
# Encode both splits as word-count matrices using the training vocabulary;
# get_features skips any word that is not in that vocabulary.
x_train = bow.get_features(train_data)
x_test = bow.get_features(test_data)

# Each shape is (number of documents, vocabulary size).
print(x_train.shape, x_test.shape)

(3734, 9522) (1840, 9522)


In [7]:
# Scale both feature matrices with the 'l2' option of normalize_data.
scaled_train, scaled_test = normalize_data(x_train, x_test, type = 'l2')

# Print both scaled matrices, separated by four newlines.
print(scaled_train, scaled_test, sep = '\n\n\n\n')

[[0.35355339 0.35355339 0.35355339 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.19611614 0.19611614 0.        ]
 [0.         0.         0.         ... 0.         0.         0.33333333]
 [0.         0.         0.         ... 0.         0.         0.        ]]



[[0.  0.  0.5 ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]


In [8]:
# Support vector classifier with a linear kernel and C = 1.
svm_classifier = svm.SVC(C = 1, kernel = 'linear')

# Train on the scaled training features and their labels.
svm_classifier.fit(scaled_train, train_labels)

# Predict labels for the scaled test features.
predicted = svm_classifier.predict(scaled_test)

# Fraction of test labels that were predicted correctly.
acc = accuracy_score(test_labels, predicted)

print(acc)

0.9842391304347826
