# IS-02 Machine Learning - Data and Web Science
## Lecture 9: Support Vector Machines
## Project 8 - SVM
### <i>Avgitidis Konstantinos </i>


In [1]:
import pandas as pd
from sklearn.svm import SVC
import logging
from time import time
from sklearn.preprocessing import StandardScaler
import pickle
from imblearn.pipeline import Pipeline as Pipe
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

In [2]:
# Load the raw dataset from a CSV file in the working directory into a DataFrame
X = pd.read_csv('creditcard.csv')

In [3]:
# Show the last 10 rows as a quick sanity check of the loaded columns and values
X.tail(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284797,172782.0,-0.241923,0.712247,0.399806,-0.463406,0.244531,-1.343668,0.929369,-0.20621,0.106234,...,-0.228876,-0.514376,0.279598,0.371441,-0.559238,0.113144,0.131507,0.081265,5.49,0
284798,172782.0,0.219529,0.881246,-0.635891,0.960928,-0.152971,-1.014307,0.427126,0.12134,-0.28567,...,0.099936,0.33712,0.251791,0.057688,-1.508368,0.144023,0.181205,0.215243,24.05,0
284799,172783.0,-1.775135,-0.004235,1.189786,0.331096,1.196063,5.51998,-1.518185,2.080825,1.159498,...,0.103302,0.65485,-0.348929,0.745323,0.704545,-0.127579,0.454379,0.130308,79.99,0
284800,172784.0,2.03956,-0.175233,-1.196825,0.23458,-0.008713,-0.726571,0.01705,-0.118228,0.435402,...,-0.268048,-0.717211,0.29793,-0.359769,-0.31561,0.201114,-0.080826,-0.075071,2.68,0
284801,172785.0,0.120316,0.931005,-0.546012,-0.745097,1.130314,-0.235973,0.812722,0.115093,-0.204064,...,-0.314205,-0.80852,0.050343,0.1028,-0.43587,0.124079,0.21794,0.068803,2.69,0
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [4]:
# Per-column flag: True if that column contains at least one missing value
X.isna().any()

Time      False
V1        False
V2        False
V3        False
V4        False
V5        False
V6        False
V7        False
V8        False
V9        False
V10       False
V11       False
V12       False
V13       False
V14       False
V15       False
V16       False
V17       False
V18       False
V19       False
V20       False
V21       False
V22       False
V23       False
V24       False
V25       False
V26       False
V27       False
V28       False
Amount    False
Class     False
dtype: bool

In [5]:
# Split the frame into the target ('Class') and the feature matrix.
# 'Time' is removed together with the target, so it is not used as a feature.
y = X['Class']
X = X.drop(columns=['Class', 'Time'])

In [6]:
y.value_counts() # Class distribution: 492 rows of class 1 vs 284,315 of class 0 -- heavily imbalanced

0    284315
1       492
Name: Class, dtype: int64

In [7]:
# Rebalance the classes: first synthesise extra minority (Class == 1) samples,
# then randomly down-sample the majority class.
# NOTE(review): the resampling is applied to the whole dataset here, before any
# cross-validation split. Synthetic minority points can therefore end up in a
# different fold than the real rows they were interpolated from, which makes
# scores computed on X, y afterwards optimistic. Consider moving the samplers
# into the cross-validated pipeline instead.
RANDOM_STATE = 42  # fixed seed so Restart & Run All reproduces the same resample

# Over-sample the minority class with SVM-SMOTE until minority/majority == 0.1
over = SVMSMOTE(sampling_strategy=0.1, random_state=RANDOM_STATE)
# Then under-sample the majority class until minority/majority == 0.3
under = RandomUnderSampler(sampling_strategy=0.3, random_state=RANDOM_STATE)
steps = [('o', over), ('u', under)]
# Chain both samplers and apply them once; X and y are rebound to the resampled
# data. The name `pipeline` is rebound again later by the grid-search cell.
pipeline = Pipe(steps=steps)
X, y = pipeline.fit_resample(X, y)


In [8]:
# Class counts after resampling: far fewer majority rows and many more minority
# rows than in the original data
print(y.value_counts())

0    94770
1    28431
Name: Class, dtype: int64


In [9]:
# Grid search over a handful of hand-picked SVC configurations. The fitted
# search object is cached on disk so that re-running the notebook does not
# repeat the expensive fit.
CACHE_PATH = "GridsCVSVM.p"
MAX_ITER = 1001337  # solver iteration cap shared by every candidate


def _svc_grid(kernel, C, gamma, degree=None):
    """Build a single-candidate parameter grid for the ``clf`` (SVC) pipeline step.

    Parameters
    ----------
    kernel : str
        SVC kernel name.
    C : float
        Regularisation parameter.
    gamma : float
        Kernel coefficient.
    degree : int, optional
        Polynomial degree; the ``clf__degree`` key is only added when given.

    Returns
    -------
    dict
        Mapping of ``clf__*`` parameter names to one-element lists, as
        expected by ``GridSearchCV``.
    """
    grid = {
        'clf__C': [C],
        'clf__kernel': [kernel],
        'clf__gamma': [gamma],
        'clf__max_iter': [MAX_ITER],
    }
    if degree is not None:
        grid['clf__degree'] = [degree]
    return grid


cache_miss = None
try:
    # NOTE: unpickling can execute arbitrary code -- only load a cache file
    # that this notebook wrote itself.
    with open(CACHE_PATH, "rb") as cache_file:
        grid_search = pickle.load(cache_file)
except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError) as exc:
    # Missing, truncated or incompatible cache: fall through and recompute.
    # Unlike a bare ``except``, KeyboardInterrupt and unrelated bugs still surface.
    cache_miss = repr(exc)

if cache_miss is not None:
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    logging.info("No usable cache at %s (%s); running the grid search",
                 CACHE_PATH, cache_miss)

    # This dataset is too high-dimensional. Let's do PCA:
    pca = PCA(n_components=8)

    # Maybe some original features were good, too?
    selection = SelectKBest(k=2)

    # Build estimator from PCA and Univariate selection:
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    # Fit once outside the search, only to report the combined dimensionality:
    X_features = combined_features.fit(X, y).transform(X)
    print("Combined space has", X_features.shape[1], "features")

    pipeline = Pipeline([
        ('scale', StandardScaler()),
        ("features", combined_features),
        ("clf", SVC()),
    ])

    # Our models: each entry is its own one-point grid, so exactly these seven
    # configurations are evaluated (no cross-product between them).
    parameters = [
        _svc_grid('poly', C=0.1, gamma=0.2, degree=2),
        _svc_grid('poly', C=10, gamma=6, degree=5),
        _svc_grid('rbf', C=0.1, gamma=0.3),
        _svc_grid('rbf', C=10, gamma=5),
        _svc_grid('sigmoid', C=0.1, gamma=0.5),
        _svc_grid('sigmoid', C=10, gamma=2),
        _svc_grid('sigmoid', C=100, gamma=5),
    ]

    # NOTE(review): for a single-label problem micro-averaged F1 is the same
    # number as accuracy, so 'f1_micro' adds no information and the refit
    # criterion is effectively accuracy. 'f1' was probably intended; the name
    # is kept because downstream cells read the 'mean_test_f1_micro' column.
    scoring = ["f1_micro", "accuracy", "precision", "recall"]

    # Guard kept from the original: the search only runs when this code is
    # executed as the main module (always the case inside a notebook kernel).
    if __name__ == "__main__":
        grid_search = GridSearchCV(pipeline, parameters, scoring=scoring,
                                   n_jobs=-1, refit='f1_micro', verbose=10)

        print("Performing grid search...")
        print("pipeline:", [name for name, _ in pipeline.steps])
        t0 = time()
        grid_search.fit(X, y)
        print("done in %0.3fs" % (time() - t0))
        print()

        print("Best score: %0.3f" % grid_search.best_score_)
        # Persist the fitted search so the next run can skip the fit
        with open(CACHE_PATH, "wb") as cache_file:
            pickle.dump(grid_search, cache_file)

In [10]:
# Tabulate the cross-validation results of the grid search as a DataFrame.
# Besides the mean_test_<metric> columns used below, per-split scores are
# available as split<i>_test_<metric> columns (e.g. 'split0_test_f1_micro').
df = pd.DataFrame(grid_search.cv_results_)

In [11]:
# Keep only the tuned hyper-parameters and the mean cross-validated metrics,
# under short display names. Selecting and renaming (instead of rebuilding the
# frame from ``.values``) keeps each column's own dtype rather than coercing
# every column to ``object``.
# NOTE: 'F1' is the micro-averaged F1 ('mean_test_f1_micro'); in the results
# shown below it is identical to 'Accuracy' for every candidate.
summary_columns = {
    'param_clf__C': 'C',
    'param_clf__kernel': 'Kernel',
    'param_clf__gamma': 'Gamma',
    'param_clf__degree': 'Degree',
    'mean_test_accuracy': 'Accuracy',
    'mean_test_recall': 'Recall',
    'mean_test_precision': 'Precision',
    'mean_test_f1_micro': 'F1',
}
svmdf = df[list(summary_columns)].rename(columns=summary_columns)

In [12]:
# Display the summary table (one row per evaluated SVC configuration)
svmdf


Unnamed: 0,C,Kernel,Gamma,Degree,Accuracy,Recall,Precision,F1
0,0.1,poly,0.2,2.0,0.978726,0.915554,0.991654,0.978726
1,10.0,poly,6.0,5.0,0.875885,0.966974,0.831028,0.875885
2,0.1,rbf,0.3,,0.994034,0.980305,0.993801,0.994034
3,10.0,rbf,5.0,,0.992727,0.968839,0.999644,0.992727
4,0.1,sigmoid,0.5,,0.947639,0.884535,0.889994,0.947639
5,10.0,sigmoid,2.0,,0.945204,0.878872,0.884414,0.945204
6,100.0,sigmoid,5.0,,0.949019,0.892342,0.889082,0.949019


In [13]:
# Export the summary table to an Excel workbook, omitting the row index
svmdf.to_excel('SVM_Results2.xlsx',index=False)
