# Classification

## Solve for Multiclass Classification Problems

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification

In [4]:
# Synthetic 4-class dataset: 2000 samples, 12 features
# (8 informative + 4 redundant); seeded so the data is reproducible.
X, y = make_classification(
    n_samples=2000,
    n_features=12,
    n_informative=8,
    n_redundant=4,
    n_classes=4,
    random_state=42,
)
X.shape, y.shape

((2000, 12), (2000,))

In [5]:
# Show the distinct class labels present in y.
np.unique(y)

array([0, 1, 2, 3])

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse those statistics for the
# test split. fit_transform already fits the scaler, so a separate fit() call
# beforehand is redundant.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train a 500-tree random forest on the scaled training split and predict the hold-out split.
rfc_clf = RandomForestClassifier(n_estimators=500, random_state=42)
classifier = rfc_clf.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Print, in order: confusion matrix, overall accuracy, per-class report.
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_test, y_pred))

[[85  8  1  5]
 [ 3 82  6  4]
 [12  5 78  7]
 [ 6  7  7 84]]
0.8225
              precision    recall  f1-score   support

           0       0.80      0.86      0.83        99
           1       0.80      0.86      0.83        95
           2       0.85      0.76      0.80       102
           3       0.84      0.81      0.82       104

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



### One-vs-Rest for Multiclass Classification

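In one-vs-rest classification, we split the multiclass problem into N binary classification problems, where N is the number of classes. Each binary classifier learns to separate one class from all the others, and the class whose classifier is most confident is chosen as the prediction.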

In [11]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Logistic regression wrapped in a one-vs-rest scheme.
log_clf = LogisticRegression()
clf = OneVsRestClassifier(log_clf)
classifier = clf.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix, accuracy and per-class report, one per line.
print(
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[66 11 13  9]
 [ 9 67 10  9]
 [19 17 53 13]
 [15 13 17 59]]
0.6125
              precision    recall  f1-score   support

           0       0.61      0.67      0.63        99
           1       0.62      0.71      0.66        95
           2       0.57      0.52      0.54       102
           3       0.66      0.57      0.61       104

    accuracy                           0.61       400
   macro avg       0.61      0.61      0.61       400
weighted avg       0.61      0.61      0.61       400



### One-vs-One for Multiclass Classification

In one-vs-one classification, a separate binary classifier is trained for each pair of classes, giving N(N-1)/2 classifiers for N classes. At prediction time every classifier votes for one of its two classes, and the class with the most votes is the final predicted label.

In [12]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression

# Logistic regression wrapped in a one-vs-one scheme.
log_clf = LogisticRegression()
clf = OneVsOneClassifier(log_clf)
classifier = clf.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix, accuracy and per-class report, one per line.
print(
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[67  7 15 10]
 [ 5 70 11  9]
 [18 15 54 15]
 [11 11 17 65]]
0.64
              precision    recall  f1-score   support

           0       0.66      0.68      0.67        99
           1       0.68      0.74      0.71        95
           2       0.56      0.53      0.54       102
           3       0.66      0.62      0.64       104

    accuracy                           0.64       400
   macro avg       0.64      0.64      0.64       400
weighted avg       0.64      0.64      0.64       400



## Multilabel Classification Problems

In [16]:
from sklearn.datasets import make_multilabel_classification

# Synthetic multilabel dataset: 2000 samples, 10 features, 5 possible labels;
# y has one column per label. Seeded so the data is reproducible.
X, y = make_multilabel_classification(
    n_samples=2000,
    n_features=10,
    n_classes=5,
    n_labels=3,
    random_state=42,
)
X.shape, y.shape

((2000, 10), (2000, 5))

In [17]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse those statistics for the
# test split. fit_transform already fits the scaler, so a separate fit() call
# beforehand is redundant.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [19]:
# Train a 500-tree random forest directly on the 2-D multilabel target matrix.
rfc_clf = RandomForestClassifier(n_estimators=500, random_state=42)
classifier = rfc_clf.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# With multilabel targets accuracy_score is subset accuracy: a sample counts as
# correct only when its entire label set is predicted exactly, which is why it
# is much lower than the per-label scores in the report.
print(accuracy_score(y_test, y_pred))
# zero_division=0 leaves the reported numbers unchanged (ill-defined precision or
# recall is already scored as 0 by default) but silences the
# UndefinedMetricWarning emitted when a sample has no predicted or no true labels.
print(classification_report(y_test, y_pred, zero_division=0))

0.365
              precision    recall  f1-score   support

           0       0.75      0.73      0.74       186
           1       0.86      0.95      0.90       295
           2       0.83      0.91      0.87       255
           3       0.75      0.91      0.82       243
           4       0.74      0.38      0.51       120

   micro avg       0.80      0.83      0.82      1099
   macro avg       0.79      0.78      0.77      1099
weighted avg       0.80      0.83      0.81      1099
 samples avg       0.83      0.84      0.81      1099



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# One-vs-rest with logistic regression; the target here is the 2-D multilabel matrix.
log_clf = LogisticRegression()
clf = OneVsRestClassifier(log_clf)
classifier = clf.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# With multilabel targets accuracy_score is subset accuracy: a sample counts as
# correct only when its entire label set is predicted exactly.
print(accuracy_score(y_test, y_pred))
# zero_division=0 leaves the reported numbers unchanged (ill-defined precision or
# recall is already scored as 0 by default) but silences the
# UndefinedMetricWarning emitted when a sample has no predicted or no true labels.
print(classification_report(y_test, y_pred, zero_division=0))

0.275
              precision    recall  f1-score   support

           0       0.79      0.74      0.76       186
           1       0.86      0.89      0.88       295
           2       0.81      0.84      0.83       255
           3       0.74      0.83      0.78       243
           4       0.72      0.39      0.51       120

   micro avg       0.80      0.79      0.79      1099
   macro avg       0.78      0.74      0.75      1099
weighted avg       0.79      0.79      0.78      1099
 samples avg       0.80      0.81      0.77      1099



  _warn_prf(average, modifier, msg_start, len(result))


## Notes about Metrics
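- If the classes are balanced, accuracy is a reasonable metric.
- If the classes are imbalanced, prefer the F1 score, because accuracy can look high even when the minority classes are predicted poorly.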