#### Tour of ML Classifiers using scikit-Learn

In [1]:
from init import *
print(f"sklearn version:: {sklearn.__version__}")

sklearn version:: 1.4.0


##### Popular algorithms for classification
-   logistic regression
-   svm
-   decision trees
-   k-nearest neighbors


Note: *no free lunch theorem* by David H. Wolpert, no single classifier works best across all possible scenarios 
*(The Lack of A Priori Distinctions Between Learning Algorithms, Wolpert, David H, Neural Computation*)

In [2]:
iris = pd.read_csv('../assets/ML/iris.csv',header=None)

# TODO: tweak IRIS dataset into proper format 
def tweak_iris(df_:pd.DataFrame)->pd.DataFrame:
    return(
        df_.rename(columns={0: 'Petal Length',
                            1: 'Petal Width',
                            2: 'Sepal Length',
                            3: 'Sepal Width ',
                            4: 'Species Type'}).astype({'Species Type':'category'})
    )
iris = iris.pipe(tweak_iris)
X = iris.drop('Species Type',axis=1)
y = iris['Species Type']
print('Class labels:', np.unique(y))

# PEP-0526
X_train:pd.DataFrame
X_test:pd.DataFrame
y_train:pd.Series
y_test:pd.Series
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1,stratify=y,test_size=0.33)
print(X_train.shape,X_test.shape)




# TODO:calculate mean and variance
#  z = (x - u) / s;;   u=mean, s=std deviation
std:StandardScaler = StandardScaler()

# monkey patching (not recommended)
std.n_samples_seen_=10_000                                     
std.feature_names_in_ = ['Petal Length', 'Petal Width', 'Sepal Length', 'Sepal Width ']


std.fit(X_train)
pprint(std.__dict__,sort_dicts=False,indent=4,depth=2)


# transform the data
X_train:np.ndarray = std.transform(X_train)
X_test:np.ndarray  = std.transform(X_test)

Class labels: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
(100, 4) (50, 4)
{   'with_mean': True,
    'with_std': True,
    'copy': True,
    'n_samples_seen_': 10100,
    'feature_names_in_': [   'Petal Length',
                             'Petal Width',
                             'Sepal Length',
                             'Sepal Width '],
    'mean_': array([0.05781188, 0.03045545, 0.03711881, 0.01180198]),
    'var_': array([0.34134987, 0.09456752, 0.16978556, 0.01978151]),
    'scale_': array([0.58425154, 0.30751832, 0.41205043, 0.14064674])}


In [3]:
ppn = Perceptron(eta0=0.01,penalty='elasticnet',n_jobs=-1,warm_start=True,early_stopping=False,verbose=False,random_state=1)
pprint(ppn)
pprint(ppn.__dict__)
print(list(func for func in dir(Perceptron) if callable(getattr(Perceptron, func)) and (not func.startswith("__") and not func.startswith('_'))))

# TODO: Fit linear model with Stochastic Gradient Descent.
ppn.fit(X_train,y_train)
y_pred:np.ndarray = ppn.predict(X_test)

print(f'MISCLASSIFIED EXAMPLES: {(y_test != y_pred).sum()}')
pprint(f"ACCURACY SCORE: {(y_test==y_pred).sum()/y_pred.shape[0]}")

Perceptron(eta0=0.01, n_jobs=-1, penalty='elasticnet', random_state=1,
           verbose=False, warm_start=True)
{'C': 1.0,
 'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.01,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'constant',
 'loss': 'perceptron',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': -1,
 'penalty': 'elasticnet',
 'power_t': 0.5,
 'random_state': 1,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': True}
['decision_function', 'densify', 'fit', 'get_metadata_routing', 'get_params', 'partial_fit', 'predict', 'score', 'set_fit_request', 'set_params', 'set_partial_fit_request', 'set_score_request', 'sparsify']
Misclassified examples: 6
'accuracy score: 0.88'


In [35]:
print(ppn.intercept_)
print(ppn.coef_)

[ 0.01  0.14 -0.21]
  (0, 0)	0.041670687561067316
  (0, 1)	0.12565242003294055
  (0, 2)	-0.12934490948124372
  (0, 3)	-0.1428501713626379
  (1, 0)	0.3897877166160877
  (1, 1)	-0.4784377163913084
  (1, 2)	0.4723784672530884
  (1, 3)	-0.3526618096615371
  (2, 0)	-0.7277532011701506
  (2, 1)	-1.1742774203180775
  (2, 2)	0.7581972259762504
  (2, 3)	1.1821338205708152
