In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from random import seed
from random import randrange
from math import exp
from math import log
from math import floor

In [None]:
def cross_val_split(data_X,data_Y,test_size,seed_val):
    """Randomly partition features and labels into a train and a test part.

    Args:
        data_X: feature rows; must provide ``tolist()``.
        data_Y: labels aligned with ``data_X``; must provide ``tolist()``.
        test_size: fraction of the rows that is left over for testing.
        seed_val: value handed to ``random.seed`` so the split is repeatable.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of lists.

    Note:
        Reseeds the module-level ``random`` generator as a side effect.
    """
    remaining_x = data_X.tolist()
    remaining_y = data_Y.tolist()
    seed(seed_val)
    n_train = floor((1 - test_size) * len(remaining_x))
    train_x, train_y = [], []
    # Draw without replacement: every pick is removed from the remaining
    # pool, so whatever is left at the end forms the test set.
    for _ in range(n_train):
        pick = randrange(len(remaining_x))
        train_x.append(remaining_x.pop(pick))
        train_y.append(remaining_y.pop(pick))
    return train_x, train_y, remaining_x, remaining_y

def statistics(x):
    """Return a ``[min, max]`` pair for every column of the row-major table ``x``."""
    # zip(*x) transposes the rows into columns.
    return [[min(column), max(column)] for column in zip(*x)]

def scale(x, stat):
    """Min-max scale every row of ``x`` in place.

    Args:
        x: mutable rows of numeric features; each entry is overwritten with
            ``(value - min) / (max - min)`` of its column.
        stat: per-column ``[min, max]`` pairs, as produced by ``statistics``.

    Returns:
        None. ``x`` is modified in place.
    """
    for row in x:
        for i in range(len(row)):
            low = stat[i][0]
            span = stat[i][1] - low
            # A constant column has zero span; map it to 0.0 instead of
            # raising ZeroDivisionError.
            row[i] = (row[i] - low) / span if span != 0 else 0.0
  
def one_vs_all_cols(s):
    """One-hot encode the labels of ``s`` in place for one-vs-all training.

    Every element of ``s`` is replaced by a 0/1 list holding a single 1 at
    the position of its label in the sorted list of distinct labels.

    Returns:
        The sorted list of distinct labels, i.e. the column order of the
        encoding.
    """
    labels = sorted(set(s))
    column_of = {label: position for position, label in enumerate(labels)}
    for i in range(len(s)):
        encoded = [0] * len(labels)
        encoded[column_of[s[i]]] = 1
        s[i] = encoded
    return labels

def ThetaTX(Q,X):
    """Return the dot product of the weights ``Q`` and the features ``X``.

    The result is always a float. Only the first ``len(Q)`` entries of ``X``
    take part in the product.
    """
    total = 0.0
    # Accumulate left to right in a plain loop so the floating-point
    # rounding is the same on every Python version.
    for k, weight in enumerate(Q):
        total += X[k] * weight
    return total

def LinearSVM_cost0(z):
    """Linear SVM cost for the negative class (class = 0).

    Returns 0 once ``z`` is below the margin at -1, otherwise ``z + 1``.
    """
    return 0 if z < -1 else z + 1

def LinearSVM_cost1(z):
    """Linear SVM cost for the positive class (class = 1).

    Returns 0 once ``z`` is above the margin at 1, otherwise ``-z + 1``.
    """
    return 0 if z > 1 else -z + 1

def sigmoid(z):
    """Logistic function ``1 / (1 + e**-z)``, evaluated without overflow.

    Args:
        z: real-valued input.

    Returns:
        A float in the closed interval [0.0, 1.0].
    """
    if z >= 0:
        return 1.0/(1.0 + exp(-z))
    # For strongly negative z, exp(-z) exceeds the float range and math.exp
    # raises OverflowError. The algebraically equal form below only ever
    # exponentiates a negative number, which at worst underflows to 0.0.
    ez = exp(z)
    return ez/(1.0 + ez)

def cost(theta,c,x,y):
    """Total linear SVM cost of class ``c`` summed over all samples.

    Args:
        theta: per-class weight vectors.
        c: index of the class whose weights are evaluated.
        x: feature rows.
        y: 0/1 targets for class ``c``, aligned with ``x``.

    Returns:
        The accumulated cost as a number.
    """
    total = 0.0
    for i, features in enumerate(x):
        margin = ThetaTX(theta[c], features)
        # Positive samples pay cost1, negative samples pay cost0.
        total += y[i]*LinearSVM_cost1(margin) + (1 - y[i])*LinearSVM_cost0(margin)
    return total

def gradDescent(theta, c, x, y, learning_rate):
    """Run one in-place update pass over the weights of class ``c``.

    For every weight ``j`` the term ``(sigmoid(w . x_i) - y_i) * x_i[j]`` is
    summed over all samples and ``learning_rate`` times that sum is
    subtracted from the weight.

    Args:
        theta: per-class weight vectors; ``theta[c]`` is modified in place.
        c: index of the class being trained.
        x: feature rows.
        y: 0/1 targets for class ``c``, aligned with ``x``.
        learning_rate: step size applied to the summed derivative.

    Returns:
        None.
    """
    # NOTE(review): `weights` is the very same list object as theta[c], not
    # a snapshot. The update written for weight j is therefore already
    # visible when the derivative for weight j + 1 is computed.
    weights = theta[c]
    for j in range(len(weights)):
        derivative = 0
        for i, features in enumerate(x):
            derivative += (sigmoid(ThetaTX(weights, features)) - y[i])*features[j]
        weights[j] -= learning_rate*derivative

def predict(data, theta):
    """Predict a one-hot class vector for every row of ``data``.

    Args:
        data: feature rows to classify.
        theta: per-class weight vectors.

    Returns:
        A list with one 0/1 list per input row; the single 1 marks the class
        whose sigmoid score is highest (the first such class on ties).
    """
    predictions = []
    for row in data:
        # Weights first, features second: the argument order ThetaTX(Q, X)
        # declares and the order it is called with during training.
        scores = [sigmoid(ThetaTX(class_weights, row)) for class_weights in theta]
        one_hot = [0]*len(theta)
        one_hot[scores.index(max(scores))] = 1
        predictions.append(one_hot)
    return predictions

def accuracy(predicted, actual):
    """Return the fraction of positions where ``predicted`` equals ``actual``.

    Args:
        predicted: sequence of predictions.
        actual: sequence of expected values, at least as long as
            ``predicted``.

    Returns:
        A float between 0.0 and 1.0; 0.0 when ``predicted`` is empty.
    """
    n = len(predicted)
    if n == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(1 for i in range(n) if predicted[i] == actual[i])
    return correct/n

def cross_validation(x, y, test_data_size, validations, learning_rate, epoch):
    """Train and score the one-vs-all classifier on repeated random splits.

    Args:
        x: feature rows; must provide ``tolist()``.
        y: one-hot encoded labels aligned with ``x``; must provide
            ``tolist()``.
        test_data_size: fraction of the rows held out in every split.
        validations: number of splits; split ``k`` (0-based) is seeded with
            ``k + 1``.
        learning_rate: step size passed to ``gradDescent``.
        epoch: number of update passes over every class per split.

    Returns:
        A tuple ``(mean_accuracy, y_pred)`` where ``y_pred`` holds the
        predictions for the test part of the last split only.
    """
    print("Epochs count: ",epoch)
    accuracies = []
    for valid in range(validations):
        x_train, y_train, x_test, y_test = cross_val_split(x,y,
                                                     test_data_size,valid+1)
        # The class count is the width of the one-hot labels. Reading it
        # from the data keeps this function independent of the
        # notebook-global label_map.
        n_classes = len(y_train[0])
        #converting y_train to classwise columns with 0/1 values
        classes = [[row[i] for row in y_train] for i in range(n_classes)]
        #Initialising Theta (Weights)
        theta = [[0]*len(x_train[0]) for _ in range(n_classes)]
        #training model
        for _ in range(epoch):
            for class_type in range(n_classes):
                gradDescent(theta,class_type,x_train,classes[class_type],learning_rate)
        #Predicting using test data
        y_pred = predict(x_test, theta)
        #Calculating accuracy
        accuracies.append(accuracy(y_pred,y_test))
        print("Validation", valid+1, "accuracy: ", accuracies[valid])
    return sum(accuracies)/len(accuracies), y_pred

In [None]:
# Load the CSV and work on its raw value array.
dataset = pd.read_csv("data.csv")
data = dataset.values

# Features: every column from index 2 up to, but not including, the last one.
# Labels: column 1.
x = data[:, 2:-1]
y = data[:, 1]
print(y[:10], x[:2])

# Both calls below work in place: scale() overwrites the rows of x with their
# min-max scaled values and one_vs_all_cols() replaces every label in y with
# its 0/1 vector. label_map keeps the sorted class names (column order).
stats = statistics(x)
scale(x,stats)
label_map = one_vs_all_cols(y)
print(label_map)

['M' 'buf' 'M' 'M' 'M' 'M' 'M' 'M' 'M' 'M'] [[17.99 10.38 122.8 1001.0 0.1184 0.2776 0.3001 0.1471 0.2419 0.07871
  1.095 0.9053 8.589 153.4 0.006399 0.04904 0.05372999999999999 0.01587
  0.03003 0.006193 25.38 17.33 184.6 2019.0 0.1622 0.6656 0.7119 0.2654
  0.4601 0.1189]
 [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]]
['B', 'M', 'buf']


In [None]:
#Hyperparameters for the repeated random train/test evaluation
test_data_size = 0.2  # fraction of the rows held out for testing in every split
learning_rate = 0.02  # step size passed through to gradDescent
epoch = 100  # update passes over every class per split
validations = 5  # number of random splits

# final_score is the mean accuracy over all splits; y_pred holds the
# predictions of the last split only.
final_score,y_pred = cross_validation(x,y,test_data_size,validations,
                                      learning_rate,epoch)

Epochs count:  100
Validation 1 accuracy:  0.9385964912280702
Validation 2 accuracy:  0.956140350877193
Validation 3 accuracy:  0.9298245614035088
Validation 4 accuracy:  0.9385964912280702
Validation 5 accuracy:  0.9473684210526315


In [None]:
#Printing final stats: hyperparameters, split sizes and the mean accuracy
#over all validation splits (final_score is a fraction, shown as a percentage)
print("Learning rate: ", learning_rate)
print("Iterations: ",epoch)
print("Training data size: ", floor(len(x)*(1 - test_data_size)))
print("Test data size: ", len(x) - floor(len(x)*(1 - test_data_size)))
print("Accuracy: ",final_score*100,"%")

Learning rate:  0.02
Iterations:  100
Training data size:  456
Test data size:  114
Accuracy:  94.2105263157895 %




---

---



---

---









In [2]:
# Load the Reuters spreadsheet from the mounted Drive; the `text` column holds
# the documents and the `topic` column holds their labels.
import pandas as pd
data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/SVM/reuters.xlsx')
X, y = data.text, data.topic

# NLTK resources needed by the lemmatizer and the stop-word list below.
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

# Lemmatize every whitespace-separated token and join the tokens back into
# one string per document.
lem_texts = []
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
for t in X:
    lem = [wnl.lemmatize(word) for word in str(t).split()]
    lem_texts.append(' '.join(lem))

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000, min_df=10, max_df=0.8,
                     stop_words=nltk.corpus.stopwords.words('english'))
#max_features - number of words used for classification
#min_df - minimum number of texts that contain the word
#max_df - maximum share of texts that contain the word
#stop_words - noise words to ignore
# NOTE(review): the vectorizer here and the TF-IDF transformer below are
# fitted on the whole corpus, before the train/test split done in a later
# cell, so the test documents influence the vocabulary and the IDF weights.
X = cv.fit_transform(lem_texts).toarray()

# Re-weight the raw counts with TF-IDF; X is rebound from counts to weights.
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

print(f'Topics: {len(set(y))}, X_shape: {X.shape}')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Topics: 82, X_shape: (10717, 2000)


In [3]:
import numpy as np
# Convert the topic labels into a NumPy array.
y = np.array(y)

In [4]:
from sklearn.model_selection import train_test_split
# Quick look at the first labels and feature rows before splitting.
print(y[:10], X[:2])
# Hold out 20% of the samples for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                          test_size=0.2, random_state=42)

['earn' 'trade' 'earn' 'crude' 'coffee' 'vegoil' 'acq' 'earn' 'acq' 'earn'] [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.07690873 ... 0.         0.         0.        ]]


In [5]:
from sklearn import model_selection
from sklearn.svm import SVC
# Support vector classifier with a linear kernel.
model = SVC(kernel='linear')

# 5-fold cross-validation on the training part only, scored by accuracy.
kfold = model_selection.KFold(n_splits=5)
cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold,
                                             scoring='accuracy')

# Mean and standard deviation of the per-fold accuracies.
print(f'SVM {cv_results.mean()} {cv_results.std()}')

SVM 0.867259169045181 0.00846497638275997


In [6]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Fit on the whole training split and evaluate once on the held-out test split.
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(accuracy_score(y_test, predictions))
# Some topics are never predicted, which leaves their precision undefined.
# zero_division=0 reports 0.0 for them, the same numbers the default prints,
# without the repeated undefined-metric warnings.
print(classification_report(y_test, predictions, zero_division=0))
print(confusion_matrix(y_test,predictions))

0.8703358208955224
                precision    recall  f1-score   support

           acq       0.85      0.97      0.91       464
          alum       1.00      0.45      0.62        11
           bop       1.00      0.12      0.22        16
       carcass       0.50      0.40      0.44         5
         cocoa       0.82      0.90      0.86        10
        coffee       1.00      0.76      0.86        21
        copper       0.67      1.00      0.80        12
        cotton       1.00      1.00      1.00         6
           cpi       0.79      0.73      0.76        15
           cpu       0.00      0.00      0.00         2
         crude       0.77      0.86      0.81        99
           dlr       0.00      0.00      0.00         7
          earn       0.98      0.94      0.96       778
          fuel       1.00      0.50      0.67         2
           gas       1.00      0.33      0.50         6
           gnp       0.72      0.90      0.80        20
          gold       0.85   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
