# Jared Mlekush - Project

In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection        import train_test_split, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn.impute                 import SimpleImputer
from sklearn.preprocessing          import StandardScaler, OneHotEncoder
from sklearn.compose                import ColumnTransformer
from sklearn.pipeline               import Pipeline
from sklearn.linear_model           import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.neighbors              import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble               import RandomForestClassifier, ExtraTreesClassifier, IsolationForest
from sklearn.tree                   import DecisionTreeClassifier
from sklearn.metrics                import f1_score
from sklearn.base                   import BaseEstimator
from sklearn.decomposition          import PCA
from sklearn.cluster                import KMeans
from sklearn.cluster                import SpectralClustering
from sklearn                        import metrics
from scipy.spatial.distance         import cdist
from sklearn.metrics                import accuracy_score

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [158]:
# Read in data and drop duplicates
pokemon_df = pd.read_csv('pokemon.csv')
pokemon_df.drop_duplicates()

# Make Supervised Learning problem by taking a column from the dataframe and making it our target variable
y = pokemon_df['Generation']
X = pokemon_df.drop('Generation', axis=1)

In [163]:
# Split up the data - create a train, validation and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X_train_2, X_validation, y_train_2, y_validation = train_test_split(X_train, y_train, test_size=0.25)

In [160]:
# Define specific columns to be fed into the pipelines. Different pipelines to deal with different types of variables- leaving out columns that don't make sense to include
num_columns = ['Number','Total', 'HP', 'Attack', 'Defense', 'Sp_Atk', 'Sp_Def', 'Speed', 'Height_m', 'Catch_Rate']
cat_columns = ['Name', 'Type_1', 'Type_2', 'Color', 'Egg_Group_1', 'Egg_Group_2', 'Body_Style']

In [161]:
# To deal with categorical variables - utilizes One Hot Encoding 
cat_pipe = Pipeline([ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                      ('ohe', OneHotEncoder(handle_unknown='ignore'))])

# To deal with numerical variables - utilize Standard Scaler
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                      ('scaler', StandardScaler())])

# Put together in column transformer and take care of both data types at the same time
preprocessing = ColumnTransformer([('categorical', cat_pipe,  cat_columns),
                                   ('numerical',  num_pipe, num_columns)])


In [162]:
# Now in the format we want - fit on several different algorithms, and assess baseline models
algorithms = algos = [RandomForestClassifier(),
             ExtraTreesClassifier(),
             IsolationForest(),
             LogisticRegression(),
             PassiveAggressiveClassifier(),
             RidgeClassifier(),
             SGDClassifier()]

results = dict()

for algo in algorithms:
    pipe = Pipeline([('preprocessing', preprocessing), 
                     ('clf', algo)])

    pipe.fit(X_train_2, y_train_2)
    y_pred = pipe.predict(X_validation)
    f1_test  = f1_score(y_validation, y_pred, average='weighted')
    accuracy = accuracy_score(y_validation, y_pred)
    print(f"{algo.__class__.__name__:<17} - f1_score: {f1_test:,.2f}")
    print(f"{algo.__class__.__name__:<17} - accuracy: {accuracy:,.2f}\n")

RandomForestClassifier - f1_score: 0.71
RandomForestClassifier - accuracy: 0.73

ExtraTreesClassifier - f1_score: 0.62
ExtraTreesClassifier - accuracy: 0.64

IsolationForest   - f1_score: 0.04
IsolationForest   - accuracy: 0.16

LogisticRegression - f1_score: 0.69
LogisticRegression - accuracy: 0.70

PassiveAggressiveClassifier - f1_score: 0.58
PassiveAggressiveClassifier - accuracy: 0.59

RidgeClassifier   - f1_score: 0.46
RidgeClassifier   - accuracy: 0.50

SGDClassifier     - f1_score: 0.56
SGDClassifier     - accuracy: 0.57



In [49]:
# Create a pipeline with a "placeholder". Will use pipeline to feed in particular algorithms and their hyperparametes in "search_space"
class DummyEstimator(BaseEstimator):
    def fit(self): pass
    def score(self): pass

pipe = Pipeline([('preprocessing', preprocessing), 
                 ('clf', DummyEstimator())])

In [50]:
# Algorithms and their respective hyperparameters to be searched through
search_space = [{'clf': [LogisticRegression()], 
                 'clf__penalty': ['l1', 'l2'],
                 'clf__C': np.logspace(0, 4, 10),
                 'clf__multi_class' : ['auto', 'multinomial', 'ovr'],
                 'clf__solver' : ['newton-cg', 'lbfgs'],
                 'clf__class_weight' : ['balanced', None]},
                
                {'clf': [ExtraTreesClassifier()],
                'clf__bootstrap': [False,True],
                'clf__class_weight': [None, 'balanced'],
                'clf__criterion': ['gini', 'entropy'],
                'clf__max_depth': [50, 100, 150, 200, 500],
                'clf__max_features': ['auto','sqrt', 'log2'],
                'clf__min_samples_leaf': [1,2,3],
                'clf__min_weight_fraction_leaf': [0.0, 0.1],
                'clf__n_estimators': [10,100,500]},

                {'clf': [PassiveAggressiveClassifier()],
                'clf__max_iter': [100, 500, 1000, 1500],
                'clf__early_stopping': [False,True],
                'clf__shuffle': [False,True],
                'clf__warm_start': [False,True]}
                ]

In [51]:
# RandomizedSearchCV: fixed number of parameter settings is sampled from the specified distributions, number of samples tried is dictated by n_iter
clf_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=100,
                                    cv=10, 
                                    n_jobs=-1,
                                    verbose=1)

In [52]:
# Fit search
best_model = clf_algos_rand.fit(X_train_2, y_train_2);

# View best model
best_model.best_estimator_.get_params()['clf']


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


LogisticRegression(C=464.15888336127773, multi_class='multinomial',
                   solver='newton-cg')

In [111]:
# After Iterating through the search space and fitting a total of 1000 times (in my case specifically), best model is extracted and used (with hyper-parameters tuned)
pipe = Pipeline([('preprocessing', preprocessing),
                ('lr', LogisticRegression(C=464.15888336127773, class_weight='balanced', multi_class='multinomial', solver='newton-cg'))])

In [113]:
# Harmonic mean - since there is a the data is slightly unbalanced (though not bad)
cv_acc_score = cross_val_score(pipe, X_train, y_train, cv=25,scoring='f1_weighted')
print("Mean 20-Fold F1: {}".format(np.mean(cv_acc_score)))
# Prediction of generation pokemon is from is of trivail importance. Using accuracy is an appropriate measure.
cv_acc_score = cross_val_score(pipe, X_train, y_train, cv=25,scoring='accuracy')
print("Mean 20-Fold accuracy: {}".format(np.mean(cv_acc_score)))

Mean 20-Fold F1: 0.7902681858785755
Mean 20-Fold accuracy: 0.7979220779220779


In [101]:
# Apply clustering
pipe = Pipeline([('preprocessing', preprocessing),
                ('k-mean', KMeans(n_clusters=12)),
                ('clm',  LogisticRegression(C=464.15888336127773, class_weight='balanced', multi_class='multinomial', solver='newton-cg'))])

In [103]:
# Test out f1_score and accuracy on the newly clusterd data
cv_f1_score = cross_val_score(pipe, X_train, y_train, cv=25,scoring='f1_weighted')
print("Mean 20-Fold f1_score: {}".format(np.mean(cv_f1_score)))
cv_acc_score = cross_val_score(pipe, X_train, y_train, cv=25,scoring='accuracy')
print("Mean 20-Fold accuracy: {}".format(np.mean(cv_acc_score)))

Mean 20-Fold f1_score: 0.8657512290499304
Mean 20-Fold accuracy: 0.8457142857142859


In [104]:
# Final model
final_model = LogisticRegression(C=464.15888336127773,         # C is inverse of regularization strength. Lower number = stronger regularization
                  class_weight='balanced',                     # class_weight= 'balanced' uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data
                  multi_class='multinomial',                   # multi_class= 'multinomial' lets us use logistic regression for a multiclass problem not just binary class
                  solver='newton-cg')                          # solver='newton-cg' uses that algorithm/ minimizer

In [118]:
# Final pipeline
pipe = Pipeline([('preprocessing', preprocessing),
                ('k-mean', KMeans(n_clusters=15)),
                ('clm',  final_model)])

In [119]:
# Final metrics
cv_f1_score = cross_val_score(pipe, X_test, y_test, cv=25,scoring='f1_weighted')
print("Mean 20-Fold f1_score: {}".format(np.mean(cv_f1_score)))
cv_acc_score = cross_val_score(pipe, X_test, y_test, cv=25,scoring='accuracy')
print("Mean 20-Fold accuracy: {}".format(np.mean(cv_acc_score)))

Mean 20-Fold f1_score: 0.6820714285714286
Mean 20-Fold accuracy: 0.7235714285714286


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ccb00087-ba61-4db1-ba97-3a0ee74eb21a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>