In [9]:
import pandas as pd
from random import seed
from random import randrange
from math import exp
from math import log
from math import floor

In [10]:
# Load the spreadsheet and take its `text` column as raw documents and its
# `topic` column as labels.
import pandas as pd
data = pd.read_excel('reuters.xlsx')
X, y = data.text, data.topic

# Fetch the NLTK resources used below (WordNet for lemmatisation, stop-word list).
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

# Lemmatise every whitespace-separated token of every document and re-join
# the tokens into one string per document.
lem_texts = []
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
for t in X:
    # str(t) guards against non-string cells coming from the spreadsheet.
    lem = [wnl.lemmatize(word) for word in str(t).split()]
    lem_texts.append(' '.join(lem))

# Bag-of-words counts over the lemmatised documents.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000, min_df=10, max_df=0.8,
                     stop_words=nltk.corpus.stopwords.words('english'))
#max_features - number of words used for classification
#min_df - minimum number of texts that must contain the word
#max_df - maximum share of texts that may contain the word
#stop_words - noise words to drop
# From here on X is a dense count matrix, no longer the raw text column.
X = cv.fit_transform(lem_texts).toarray()

# Re-weight the counts with TF-IDF; X is rebound again to the dense result.
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

# Sanity check: number of distinct topics and the feature-matrix shape.
print(f'Topics: {len(set(y))}, X_shape: {X.shape}')

  warn("Workbook contains no default style, apply openpyxl's default")
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maksy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maksy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Topics: 82, X_shape: (10717, 2000)


In [11]:
import numpy as np
# Rebind y from the `topic` column to a NumPy array so that it can be sliced
# and assigned by integer position in the cells below.
y = np.array(y)

In [12]:
def cross_val_split(data_X,data_Y,test_size,seed_val):
	"""Randomly split paired samples and labels into a drawn part and a remainder.

	Args:
		data_X: samples; ``tolist()`` is called on it, so it must provide that method.
		data_Y: labels aligned with ``data_X``; must also provide ``tolist()``.
		test_size: fraction left in the remainder; ``floor((1 - test_size) * n)``
			rows are drawn.
		seed_val: passed to ``random.seed`` before drawing, which makes the split
			repeatable but also reseeds the module-level generator.

	Returns:
		``(drawn_x, drawn_y, remaining_x, remaining_y)`` as plain Python lists.
		Drawn rows appear in draw order; the remainder keeps its original order.
	"""
	remaining_x, remaining_y = data_X.tolist(), data_Y.tolist()
	seed(seed_val)
	wanted = floor((1 - test_size)*len(remaining_x))
	drawn_x, drawn_y = [], []
	for _ in range(wanted):
		# Pick a position among the rows still available and move that
		# sample/label pair out of the pool together.
		pick = randrange(len(remaining_x))
		drawn_x.append(remaining_x.pop(pick))
		drawn_y.append(remaining_y.pop(pick))
	return drawn_x,drawn_y,remaining_x,remaining_y

def statistics(x):
	"""Return ``[min, max]`` for every column of a row-major table.

	Args:
		x: iterable of equally long rows.

	Returns:
		A list with one ``[minimum, maximum]`` pair per column, in column order.
		An empty table yields an empty list.
	"""
	# zip(*x) transposes the rows into columns.
	return [[min(column), max(column)] for column in zip(*x)]

def scale(x, stat):
	"""Min-max scale every row of ``x`` in place.

	Args:
		x: iterable of mutable rows (rows are overwritten element by element).
		stat: per-column ``[min, max]`` pairs, indexed like the row elements.

	Returns:
		None; ``x`` is modified in place.

	A column whose minimum equals its maximum carries no information and
	would otherwise cause a division by zero; such a column is mapped to 0.0.
	"""
	for row in x:
		for i in range(len(row)):
			low = stat[i][0]
			span = stat[i][1] - low
			# Guard the constant-column case instead of dividing by zero.
			row[i] = (row[i] - low)/span if span != 0 else 0.0
  
def one_vs_all_cols(s):
	"""One-hot encode a label sequence in place for one-vs-all training.

	Args:
		s: mutable, index-assignable sequence of hashable, mutually sortable
			labels. Each entry is replaced by a 0/1 list with a single 1 at
			the position of its class.

	Returns:
		The sorted list of distinct labels; position ``k`` in every one-hot
		row corresponds to ``result[k]``.

	Because the entries of ``s`` become lists, calling this a second time on
	the same sequence fails (lists are not hashable).
	"""
	m = sorted(set(s))
	# Look class positions up in a dict built once instead of scanning the
	# class list for every sample.
	position = {label: k for k, label in enumerate(m)}
	for i in range(len(s)):
		new = [0]*len(m)
		new[position[s[i]]] = 1
		s[i] = new
	return m

def ThetaTX(Q,X):
	"""Dot product of the weight vector ``Q`` with the feature vector ``X``.

	The loop runs over the entries of ``Q``; ``X`` is indexed at the same
	positions, so any entries of ``X`` beyond ``len(Q)`` are ignored.

	Returns:
		The accumulated sum as a float (0.0 when ``Q`` is empty).
	"""
	total = 0.0
	for position, weight in enumerate(Q):
		total += X[position]*weight
	return total

def LinearSVM_cost0(z):
	"""Hinge-style cost for the negative class (class = 0).

	Returns 0 once ``z`` is strictly below the -1 margin, otherwise ``z + 1``.
	"""
	return 0 if z < -1 else z + 1

def LinearSVM_cost1(z):
	"""Hinge-style cost for the positive class (class = 1).

	Returns 0 once ``z`` is strictly above the +1 margin, otherwise ``-z + 1``.
	"""
	return 0 if z > 1 else -z + 1

def sigmoid(z):
	"""Logistic function ``1 / (1 + e**-z)``, evaluated without overflow.

	``exp`` is only ever called with a non-positive argument: for negative
	``z`` the algebraically equivalent form ``e**z / (1 + e**z)`` is used, so
	very negative inputs return 0.0 instead of raising ``OverflowError``.
	"""
	if z >= 0:
		return 1.0/(1.0 + exp(-z))
	ez = exp(z)
	return ez/(1.0 + ez)

def cost(theta,c,x,y):
	"""Total linear-SVM cost of classifier ``c`` over all samples.

	Args:
		theta: list of weight vectors, one per class.
		c: index of the classifier whose weights are evaluated.
		x: samples (feature vectors).
		y: 0/1 targets for classifier ``c``, aligned with ``x``.

	Returns:
		Sum over the samples of ``y*cost1(z) + (1 - y)*cost0(z)`` where
		``z`` is the score ``ThetaTX(theta[c], sample)``.
	"""
	total = 0.0
	for i, sample in enumerate(x):
		margin = ThetaTX(theta[c], sample)
		positive_part = y[i]*LinearSVM_cost1(margin)
		negative_part = (1 - y[i])*LinearSVM_cost0(margin)
		total += positive_part + negative_part
	return total

def gradDescent(theta, c, x, y, learning_rate):
	"""Run one batch gradient-descent step on the weights of classifier ``c``.

	Args:
		theta: list of weight vectors; ``theta[c]`` is updated in place.
		c: index of the classifier being trained.
		x: training samples (feature vectors).
		y: 0/1 targets for classifier ``c``, aligned with ``x``.
		learning_rate: step size applied to the summed gradient.

	Returns:
		None; ``theta[c]`` is modified in place.

	Every coordinate is updated from the same pre-update weights: the
	per-sample residuals ``sigmoid(theta[c] . x_i) - y_i`` are computed once
	before any weight changes. This also avoids recomputing every prediction
	once per feature.
	"""
	weights = theta[c]
	# Residuals depend only on the old weights, so compute them up front.
	residuals = [sigmoid(ThetaTX(weights, x[i])) - y[i] for i in range(len(x))]
	for Q in range(len(weights)):
		derivative_sum = 0
		for i in range(len(x)):
			derivative_sum += residuals[i]*x[i][Q]
		weights[Q] -= learning_rate*derivative_sum

def predict(data, theta):
	"""Predict a one-hot class vector for every row using trained weights.

	Args:
		data: iterable of feature vectors.
		theta: list of weight vectors, one per class.

	Returns:
		A list with one 0/1 list per row; the single 1 marks the class whose
		linear score ``ThetaTX(theta[c], row)`` is largest (first one on ties).

	The class is chosen from the raw scores. The logistic function is
	monotonic, so it would pick the same class, but applying it can overflow
	for very negative scores and flattens large scores into ties at 1.0.
	"""
	predictions = []
	for row in data:
		scores = [ThetaTX(theta[c], row) for c in range(len(theta))]
		winner = scores.index(max(scores))
		multiclass_ans = [0]*len(theta)
		multiclass_ans[winner] = 1
		predictions.append(multiclass_ans)
	return predictions

def accuracy(predicted, actual):
	"""Fraction of positions at which ``predicted`` equals ``actual``.

	Args:
		predicted: sequence of predictions.
		actual: sequence of reference values, indexed like ``predicted``.

	Returns:
		``matches / len(predicted)`` as a float, or 0.0 when ``predicted`` is
		empty (there is nothing to score, and dividing by zero would crash).
	"""
	n = len(predicted)
	if n == 0:
		return 0.0
	correct = sum(1 for i in range(n) if predicted[i] == actual[i])
	return correct/n

def cross_validation(x, y, x_res, y_res, test_data_size, validations, learning_rate, epoch):
	"""Train one-vs-all classifiers on repeated random splits and score them.

	Args:
		x: samples used for the repeated train/validation splits.
		y: one-hot label rows aligned with ``x``.
		x_res: held-out samples scored after every round.
		y_res: one-hot label rows aligned with ``x_res``.
		test_data_size: fraction of ``x`` kept for validation in each round.
		validations: number of rounds; round ``k`` (1-based) seeds its split with ``k``.
		learning_rate: step size forwarded to ``gradDescent``.
		epoch: number of full passes over all classifiers per round.

	Returns:
		``(mean validation accuracy, predictions on x_res from the last round)``.

	Raises:
		ValueError: if ``validations`` is smaller than 1, because there would
			be neither an accuracy to average nor predictions to return.

	Progress ('.' per epoch, ',' per classifier) and accuracies are printed.
	"""
	if validations < 1:
		raise ValueError("validations must be at least 1")
	print("Epochs count: ",epoch)
	accuracies = []
	for valid in range(validations):
		x_train, y_train, x_val, y_val = cross_val_split(x,y,test_data_size,valid+1)
		# The class count comes from the width of the one-hot rows, so the
		# function does not depend on any notebook-level variable.
		n_classes = len(y_train[0])
		# Transpose the one-hot rows into one 0/1 target column per class.
		classes = [[row[k] for row in y_train] for k in range(n_classes)]
		# Start every classifier from all-zero weights.
		theta = [[0]*len(x_train[0]) for _ in range(n_classes)]
		for _ in range(epoch):
			print('.', end='')
			for class_type in range(n_classes):
				print(',', end='')
				gradDescent(theta,class_type,x_train,classes[class_type],learning_rate)
		# Score this round on its validation part.
		y_pred = predict(x_val, theta)
		accuracies.append(accuracy(y_pred,y_val))
		print("Validation", valid+1, "accuracy: ", accuracies[valid])
		# Score the same weights on the held-out set.
		y_pred = predict(x_res, theta)
		print("Test data accuracy: ", accuracy(y_pred,y_res))

	return sum(accuracies)/len(accuracies), y_pred


# Peek at the first labels and feature rows before they are modified below.
print(y[:10], X[:2])

# Min-max scale X in place using per-column [min, max] statistics.
stats = statistics(X)
scale(X,stats)
# One-hot encode y in place; label_map is the sorted list of distinct topics
# and gives the meaning of each one-hot position.
label_map = one_vs_all_cols(y)
print(label_map)
#Splitting dataset into training and testing data
test_data_size = 0.2
learning_rate = 0.02
epoch = 50
validations = 5

from sklearn.model_selection import train_test_split
# Same peek as above, now showing the scaled features and encoded labels.
print(y[:2], X[:2])
# Hold out 20% as a final test set; cross_validation then splits the remaining
# 80% again into train/validation parts on every round.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                          test_size=0.2, random_state=42)
# NOTE(review): the captured output of this cell ends in KeyboardInterrupt, so
# the run recorded here did not finish.
final_score,y_pred = cross_validation(X_train,y_train,X_test,y_test,test_data_size,validations,
                                      learning_rate,epoch)

['earn' 'trade' 'earn' 'crude' 'coffee' 'vegoil' 'acq' 'earn' 'acq' 'earn'] [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.07690873 ... 0.         0.         0.        ]]
['acq', 'alum', 'austdlr', 'barley', 'bop', 'carcass', 'cocoa', 'coconut', 'coffee', 'copper', 'corn', 'cotton', 'cpi', 'cpu', 'crude', 'cruzado', 'dlr', 'earn', 'fcattle', 'fishmeal', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'heat', 'hk', 'hog', 'housing', 'income', 'instaldebt', 'interest', 'inventories', 'ipi', 'ironsteel', 'jet', 'jobs', 'lcattle', 'lead', 'lei', 'livestock', 'lumber', 'mealfeed', 'moneyfx', 'moneysupply', 'naphtha', 'natgas', 'nickel', 'nzdlr', 'oilseed', 'orange', 'palmoil', 'petchem', 'platinum', 'plywood', 'potato', 'propane', 'rand', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'saudriyal', 'ship', 'silver', 'soybean', 'soymeal', 'stg', 'strategicmetal', 'sugar', 'tapioca', 'tea', 'tin', 'trade', 'vegoil', 'wheat', 'wool', 

KeyboardInterrupt: 

In [32]:
# Peek at the first labels and feature rows before they are modified below.
print(y[:10], X[:2])

# Min-max scale X in place using per-column [min, max] statistics.
stats = statistics(X)
scale(X,stats)
# One-hot encode y in place and keep the sorted list of distinct topics.
# NOTE(review): both calls mutate their argument, so this cell is not safe to
# re-run on the same kernel state. Once y holds one-hot lists,
# one_vs_all_cols builds a set from unhashable lists and fails.
label_map = one_vs_all_cols(y)
print(label_map)

['earn' 'trade' 'earn' 'crude' 'coffee' 'vegoil' 'acq' 'earn' 'acq' 'earn'] [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.07690873 ... 0.         0.         0.        ]]
['acq', 'alum', 'austdlr', 'barley', 'bop', 'carcass', 'cocoa', 'coconut', 'coffee', 'copper', 'corn', 'cotton', 'cpi', 'cpu', 'crude', 'cruzado', 'dlr', 'earn', 'fcattle', 'fishmeal', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'heat', 'hk', 'hog', 'housing', 'income', 'instaldebt', 'interest', 'inventories', 'ipi', 'ironsteel', 'jet', 'jobs', 'lcattle', 'lead', 'lei', 'livestock', 'lumber', 'mealfeed', 'moneyfx', 'moneysupply', 'naphtha', 'natgas', 'nickel', 'nzdlr', 'oilseed', 'orange', 'palmoil', 'petchem', 'platinum', 'plywood', 'potato', 'propane', 'rand', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'saudriyal', 'ship', 'silver', 'soybean', 'soymeal', 'stg', 'strategicmetal', 'sugar', 'tapioca', 'tea', 'tin', 'trade', 'vegoil', 'wheat', 'wool', 

In [None]:
#Splitting dataset into training and testing data
# Hyper-parameters for the hand-written one-vs-all trainer.
test_data_size = 0.2   # fraction kept for validation inside each round
learning_rate = 0.02   # gradient-descent step size
epoch = 50             # passes over all classifiers per round
validations = 5        # number of random train/validation rounds

# X_train, y_train, X_test and y_test are not defined in this cell; they must
# already exist in the kernel from an earlier train_test_split call.
# final_score is the mean validation accuracy; y_pred holds the predictions
# for X_test from the last round.
final_score,y_pred = cross_validation(X_train,y_train,X_test,y_test,test_data_size,validations,
                                      learning_rate,epoch)

In [None]:
#Printing Final Stats
print("Learning rate: ", learning_rate)
print("Iterations: ",epoch)
# NOTE(review): both sizes are derived from len(X), the full feature matrix,
# while cross_validation is called with X_train. Confirm that these are the
# numbers meant to be reported.
print("Training data size: ", floor(len(X)*(1 - test_data_size)))
print("Test data size: ", len(X) - floor(len(X)*(1 - test_data_size)))
# final_score is a fraction, so it is scaled to a percentage here.
print("Accuracy: ",final_score*100,"%")

Learning rate:  0.02
Iterations:  100
Training data size:  456
Test data size:  114
Accuracy:  94.2105263157895 %




---

---



---

---









In [3]:
# Load the spreadsheet and take its `text` column as raw documents and its
# `topic` column as labels.
import pandas as pd
data = pd.read_excel('reuters.xlsx')
X, y = data.text, data.topic

# Fetch the NLTK resources used below (WordNet for lemmatisation, stop-word list).
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

# Lemmatise every whitespace-separated token of every document and re-join
# the tokens into one string per document.
lem_texts = []
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
for t in X:
    # str(t) guards against non-string cells coming from the spreadsheet.
    lem = [wnl.lemmatize(word) for word in str(t).split()]
    lem_texts.append(' '.join(lem))

# Bag-of-words counts over the lemmatised documents.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000, min_df=10, max_df=0.8,
                     stop_words=nltk.corpus.stopwords.words('english'))
#max_features - number of words used for classification
#min_df - minimum number of texts that must contain the word
#max_df - maximum share of texts that may contain the word
#stop_words - noise words to drop
# From here on X is a dense count matrix, no longer the raw text column.
X = cv.fit_transform(lem_texts).toarray()

# Re-weight the counts with TF-IDF; X is rebound again to the dense result.
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

# Sanity check: number of distinct topics and the feature-matrix shape.
print(f'Topics: {len(set(y))}, X_shape: {X.shape}')

  warn("Workbook contains no default style, apply openpyxl's default")
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maksy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maksy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Topics: 82, X_shape: (10717, 2000)


In [5]:
import numpy as np
# Rebind y from the `topic` column to a NumPy array of topic labels.
y = np.array(y)

In [8]:
from sklearn.model_selection import train_test_split
# Peek at the first labels and TF-IDF rows.
print(y[:10], X[:2])
# Hold out 20% of the documents as a test set; random_state fixes the split so
# it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                          test_size=0.2, random_state=42)

['earn' 'trade' 'earn' 'crude' 'coffee' 'vegoil' 'acq' 'earn' 'acq' 'earn'] [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.07690873 ... 0.         0.         0.        ]]


In [14]:
from sklearn import model_selection
from sklearn.svm import SVC
# Linear-kernel support vector classifier from scikit-learn.
model = SVC(kernel='linear')

# 10-fold cross-validation on the training part only, scored by accuracy.
# NOTE(review): KFold is created without shuffle/random_state arguments;
# confirm that the library defaults are what is wanted here.
kfold = model_selection.KFold(n_splits=10)
cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold,
                                             scoring='accuracy')
# Report the mean accuracy with its standard deviation across folds in parentheses.
msg = "%s: %f (%f)" % ('SVM', cv_results.mean(), cv_results.std())
print(msg)

SVM: 0.867490 (0.015561)


In [16]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Fit on the full training part and evaluate once on the held-out test set.
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Overall accuracy, then the per-topic precision/recall/F1 table, then the
# confusion matrix.
# NOTE(review): the captured report shows 0.00 rows for several rare topics,
# and warnings were emitted while building it.
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test,predictions))

0.8782649253731343
                precision    recall  f1-score   support

           acq       0.86      0.97      0.91       459
          alum       0.90      0.69      0.78        13
           bop       1.00      0.36      0.53        11
       carcass       0.50      0.33      0.40         6
         cocoa       1.00      0.88      0.94        17
       coconut       0.00      0.00      0.00         1
        coffee       0.96      1.00      0.98        24
        copper       0.88      0.93      0.90        15
          corn       0.00      0.00      0.00         3
        cotton       0.50      0.50      0.50         2
           cpi       0.76      0.72      0.74        18
           cpu       0.00      0.00      0.00         1
         crude       0.90      0.84      0.87        95
           dlr       0.00      0.00      0.00         5
          earn       0.97      0.96      0.97       769
       fcattle       0.00      0.00      0.00         2
          fuel       0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
