In [50]:
import numpy as np
import random
import time

from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# from SVClassifier import SVClassifier

In [109]:
class LinearClassifier(BaseEstimator):
    
    def decision_function(self, X):
        return X.dot(self.w)
    
    def predict(self, X):
        scores = self.decision_function(X)
        print(scores)
        out = np.select([scores >= 0.0 , scores < 0.0], [self.positive_class, self.negative_class])
        return out
    
    def find_classes(self, Y):
        classes = sorted(set(Y))
        if len(classes) != 2:
            raise Exception("This is not a binary classification problem")
        
        self.negative_class = classes[0]
        self.positive_class = classes[1]
         
    
    def encode_output(self, Y):
        encoded = np.array([1 if y==self.positive_class else -1 for y in Y])

        return encoded

class SVClassifier(LinearClassifier):
    
    def __init__(self, n_iter=50):
        self.n_iter = n_iter
        
    def fit(self, X, Y, regularization_param):
        """
        Train a linear classifier using the SVC learning algorithm.
        """
        self.find_classes(Y)

        Ye = self.encode_output(Y)

        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()

        # Initialize the weight vector to all zeros.
        n_features = X.shape[1]
        self.w = np.zeros(n_features)

        # start iterations
        t = 0
        for i in range(self.n_iter):

            for x_i, y_i in zip(X, Ye):
                t += 1

                # Calculate steplength
                eta = 1 / (regularization_param * t)
                # Calculate score
                score = x_i.dot(self.w)

                if y_i * score < 1.0:
                    self.w = ((1 - eta * regularization_param) * self.w) + ((eta * y_i) * x_i)
                else:
                    self.w = (1 - eta * regularization_param) * self.w    


class LogisticRegressionClassifier(LinearClassifier):
    def __init__(self, n_iter=50):
        self.n_iter = n_iter
        
    def fit(self, X, Y, regularization_param):
        self.find_classes(Y)
        
        Y_encoded = self.encode_output(Y)
        
        # If necessary, convert the sparse matrix returned by a vectorizer
        # into a normal NumPy matrix.
        if not isinstance(X, np.ndarray):
            X = X.toarray()
            
        n_features = X.shape[1]
        self.w = np.zeros(n_features)
        
        #Begin iterations
        t = 0
        for i in range(self.n_iter):
            for x_i, y_i in zip(X,Y_encoded):
                t += 1
                eta = 1 / (t * regularization_param)
                z = x_i.dot(self.w)
                self.w = ((1 - eta * regularization_param) * self.w) + eta * (y_i / (1 + np.exp(y_i * z)) * x_i) 
                    

In [110]:
def read_data(corpus_file):
    X = []
    Y = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            _, y, _, x = line.split(maxsplit=3)
            X.append(x.strip())
            Y.append(y)
    return X, Y

t4 = time.time()
# Read all the documents.
X, Y = read_data('pa2b/data/all_sentiment_shuffled.txt')

# Split into training and test parts.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)

# Set up the preprocessing steps and the classifier.
pipeline = make_pipeline(
    TfidfVectorizer(),
    SelectKBest(k=1000),
    Normalizer(),
    LogisticRegressionClassifier(n_iter=50)
)

#Train the classifier (adjust weights) and time it
t0 = time.time()
pipeline.fit(Xtrain, Ytrain, logisticregressionclassifier__regularization_param=1/len(Xtrain))
t1 = time.time()

#Evaluate on the test set
t2 = time.time()
Yguess = pipeline.predict(Xtest)
t3 = time.time()
t5 = time.time()

print('Training duration: {:.4f} seconds.'.format(t1 - t0))
print('Prediction duration: {:.4f} seconds.'.format(t3 - t2))
print('Program duration: {:.4f} seconds.\n'.format(t5 - t4))
print('Accuracy: {:.4f}.'.format(accuracy_score(Ytest, Yguess)))

[-0.80513534  0.26354307  0.78785032 ... -0.73691132 -0.00379889
 -1.73332649]
Training duration: 10.8021 seconds.
Prediction duration: 0.7879 seconds.
Program duration: 11.6728 seconds.

Accuracy: 0.8363.


In [81]:
a = [1,2,3]
b = [9,8,7]
taps = list(zip(a,b))
random.shuffle(taps)
taps

[(3, 7), (1, 9), (2, 8)]