# Classification

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Bag-of-words features: the vocabulary is learned from the training texts
# only, then reused unchanged to encode the test texts.
vect = CountVectorizer()
X_train = vect.fit_transform(x_train)
X_test = vect.transform(x_test)

**Training, testing and evaluating a classifier**

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Instantiate a Perceptron and train it on the vectorized training set
# (fit() hands back the fitted estimator, so both steps can be chained).
clf = Perceptron().fit(X_train, y_train)

# Label every test instance with the trained model
y_pred = clf.predict(X_test)

# Show the gold labels next to the predicted ones
print("y true:", y_test)
print("y pred:", y_pred)

# Overall accuracy, then the per-class report, then the confusion matrix
print("Acc:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

**Examining the model**

In [None]:
# Save the vocabulary into a variable.
# It is read from `vect`, the vectorizer that produced the matrix `clf` was
# trained on, so that feature indices and weight positions refer to the same
# columns.
vocab = vect.vocabulary_
print( "Vocabulary size:", len(vocab) )

# Reverse the vocabulary: feature index -> token
ix_to_tokens = { v:k for k,v in vocab.items() }

# Save the weights in a dict key = index, value = weight.
# Only the first row of coef_ is used.
# NOTE(review): presumably a binary task, where coef_ has a single row - confirm.
features_weights = {i:w for (i,w) in enumerate( clf.coef_[0] ) }

# Sort and print the list of weights.
# NOTE(review): sort_dict is not defined in this section. The code below
# unpacks its result as (weight, index) pairs and treats the end of the list
# as the strongest class-1 features, so it presumably sorts by increasing
# weight - verify against its definition.
sorted_weights = sort_dict(features_weights)
print( sorted_weights )

# Reverse the label dictionary: class index -> class name
# tag_to_ix = {class_name: class_idx}, e.g. {drama: 1, comedy: 0}
ix_to_tag = { v:k for k,v in tag_to_ix.items() }

# Look at the best features for each class:
# the last six entries (printed in reverse order) for class 1,
# the first six entries for class 0.
print( '\nBest features for identifying class 1, ie', ix_to_tag[1])
print( '\n'.join( [':'.join( (ix_to_tokens[i],str(w)) )for (w,i) in reversed( sorted_weights[-6:] )] ) )

print( '\nBest features for identifying class 0, ie', ix_to_tag[0])
print( '\n'.join( [':'.join( (ix_to_tokens[i],str(w)) ) for (w,i) in sorted_weights[:6]] ) )

**Feature selection**

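The Chi-square test is used in statistics to test the independence of two events. In feature selection, we use it to test whether the occurrence of a specific term and the occurrence of a specific class are independent. Terms with the highest chi-square scores are the ones whose occurrence depends most on the class, so they are the ones kept as features.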

In [None]:
# Load libraries
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Number of features to keep: can be any number up to the number of columns
# of the feature matrix (capped here so the cell also runs on a small
# vocabulary).
n_best = min(1000, X_train.shape[1])

# The n_best features with the highest chi-squared statistics are selected
chi2_features = SelectKBest(chi2, k=n_best)

# The reduced matrix is stored under its own name, so X_train itself is left
# untouched by this illustration.
X_train_kbest = chi2_features.fit_transform(X_train, y_train)

In [None]:
# Feature selection: score the features with chi2 on the training data and
# keep the 5000 best ones.
sel = SelectKBest(chi2, k=5000).fit(X_train, y_train)

# Both matrices are replaced by their reduced versions (the right-hand side is
# fully evaluated before the names are rebound).
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

**SKLearn pipeline object**

In [None]:
# Chain all processing steps into a single estimator that works directly on
# the raw texts.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),          # raw text -> token counts
    ('sel', SelectKBest(chi2, k=5000)),   # keep the 5000 best chi2 features
    ('tfidf', TfidfTransformer()),        # tf-idf weighting
    ('learner', LinearSVC()),             # learning algorithm
])

# fit() returns the fitted pipeline, which is then applied to the test texts
classifier = pipeline.fit(x_train, y_train)
predictions = classifier.predict(x_test)

# Accuracy: number of predictions equal to the gold label, over all predictions
correct = sum(
    1
    for prediction, true_label in zip(predictions, y_test)
    if prediction == true_label
)
print(correct / len(predictions))

**Evaluating**

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class precision / recall / F1 of the pipeline predictions
print('Classification report:', classification_report(y_test, predictions), sep='\n')

# Confusion matrix of gold labels vs. pipeline predictions, kept in `cm`
cm = confusion_matrix(y_test, predictions)
print('Confusion matrix:', cm, sep='\n')