In [47]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
dataset_features = pd.read_csv('Falcon9_Dataset_Part3.csv')

In [3]:
dataset_target = pd.read_csv('Falcon9_Dataset_Part2.csv')

In [4]:
Y = np.array(dataset_target['Class'])

In [5]:
Y

array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

In [6]:
X = dataset_features

In [7]:
#split dataset to train and test set
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=2)

In [8]:
X_train.shape

(72, 80)

In [9]:
X_test.shape

(18, 80)

In [10]:
Y_train.shape

(72,)

In [11]:
Y_test.shape

(18,)

In [12]:
#standardize the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [13]:
#train all models with lazypredict to see which model is best the we can improve that
clf = LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models,predictions = clf.fit(X_train,X_test,Y_train,Y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 39.50it/s]


In [14]:
print(models)

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.94               0.92     0.92      0.94   
RidgeClassifier                    0.94               0.92     0.92      0.94   
BernoulliNB                        0.94               0.92     0.92      0.94   
XGBClassifier                      0.89               0.83     0.83      0.88   
NearestCentroid                    0.78               0.79     0.79      0.78   
LabelSpreading                     0.78               0.79     0.79      0.78   
LabelPropagation                   0.78               0.79     0.79      0.78   
GaussianNB                         0.78               0.79     0.79      0.78   
LinearDiscriminantAnalysis         0.83               0.79     0.79      0.83   
Perceptron                         0.83               0.75     0.75      0.81   
ExtraTreesClassifier        

### Conclusion: we can see these three models LGBMClassifier,RidgeClassifier and BernoulliNB has the best accuracy but in this project we want to train logistic regression,SVM,decision tree and KNN model and compare these models

# Logistic Regression Model

In [40]:
#create a logistic regression object
classifier_lr = LogisticRegression()
#define parameters for logistic regression grid search
params = {
    'C':[0.01,0.1,1],
    'penalty':['l2'],
    'solver':['lbfgs']
}
#create a logistic regression grid search object
logreg_cv = GridSearchCV(classifier_lr,
                  param_grid=params,cv=10,scoring='accuracy')
#train grid search object
logreg_cv.fit(X_train,Y_train)
#print best parameters and score for logistic regression
print('The best parameteres for Logistic regression are:{x} and best accuracy score is {y}'.format(x=logreg_cv.best_params_,y=logreg_cv.best_score_))

The best parameteres for Logistic regression are:{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'} and best accuracy score is 0.8339285714285714


In [41]:
#create svm classifier object with best parameters
classifier_lr=LogisticRegression(C=0.1,penalty='l2',solver='lbfgs')
#train classifier
classifier_lr.fit(X_train,Y_train)
#predict values for test set
Y_predict = classifier_lr.predict(X_test)
# find and print accuracy in test set
logisticregression_accuracy = accuracy_score(Y_test,Y_predict)
print(logisticregression_accuracy)
#find and print confusion matrix
logisticregression_matrix=confusion_matrix(Y_test,Y_predict)
print(logisticregression_matrix)

0.8333333333333334
[[ 3  3]
 [ 0 12]]


# Support Vector Machine(SVM)

In [34]:
# create a svm object
svm_classifier = SVC()
#create a parameters for SVM grid search
params = {
    'kernel':('linear','rbf','poly','sigmoid'),
    'C':np.logspace(-3,3,5),
    'gamma':np.logspace(-3,3,5)
}
# create a svm grid search object
svm_cv = GridSearchCV(svm_classifier,
                     param_grid=params,
                     cv=10,
                     )
#train grid search object
svm_cv.fit(X_train,Y_train)
#find the best parameters and score
print('The best parameteres for SVM are:{x} and best accuracy score is {y}'.format(x=svm_cv.best_params_,y=svm_cv.best_score_))

The best parameteres for SVM are:{'C': 1.0, 'gamma': 0.03162277660168379, 'kernel': 'sigmoid'} and best accuracy score is 0.8482142857142858
None


In [42]:
#create svm object with best parameters
svm_classifier = SVC(C=1.0,gamma=0.03162277660168379,kernel='sigmoid')
#train classifier
svm_classifier.fit(X_train,Y_train)
#predict values of test set
Y_predict=svm_classifier.predict(X_test)
#print accuracy
svm_accuracy = accuracy_score(Y_test,Y_predict)
print(svm_accuracy)
#print confusion matrix
svm_matrix = confusion_matrix(Y_test,Y_predict)
print(svm_matrix)

0.8333333333333334
[[ 3  3]
 [ 0 12]]


# Decision Tree

In [44]:
#create a object for decision tree 
decisiontree_classifier= DecisionTreeClassifier()
#define parameters for decision tree grid search
params={
    'criterion':['gini','entropy'],
    'splitter':['best','random'],
    'max_depth':[2*n for n in range(1,10)],
    'max_features':['auto','sqrt'],
    'min_samples_leaf':[1,2,4],
    'min_samples_split':[2,5,10]
}
#create a decision tree grid search object
tree_cv = GridSearchCV(decisiontree_classifier,
                      param_grid=params,
                      cv=10,
                      scoring='accuracy')
#train grid search object with training set
tree_cv.fit(X_train,Y_train)
#print the best parameters and score
print('The best parameteres for Decision Tree are:{x} and best accuracy score is {y}'.format(x=tree_cv.best_params_,y=tree_cv.best_score_))

The best parameteres for SVM are:{'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'best'} and best accuracy score is 0.8607142857142858


In [46]:
#create a decision tree classifier with best parameters
decisiontree_classifier=DecisionTreeClassifier(criterion='gini',max_depth=4,max_features='auto',min_samples_leaf=2,min_samples_split=5,splitter='best')
#train decision tree classifer
decisiontree_classifier.fit(X_train,Y_train)
#predict values for test set
Y_predict = decisiontree_classifier.predict(X_test)
#find and print accuracy 
decisiontree_accuracy = accuracy_score(Y_test,Y_predict)
print(decisiontree_accuracy)
#find and print confusion matrix
decisiontree_matrix =confusion_matrix(Y_test,Y_predict)
print(decisiontree_matrix)

0.8333333333333334
[[ 3  3]
 [ 0 12]]


# KNN

In [49]:
#create a KNN classifier object
knn_classifier = KNeighborsClassifier()
#define parameters for grid search
params={
    'n_neighbors':[1,2,3,4,5,6,7,8,9,10],
    'algorithm':['auto','ball_tree','kd_tree','brute'],
    'p':[1,2]
}
#create a knn grid search object
knn_cv=GridSearchCV(knn_classifier,
                   param_grid=params,
                   cv=10)
#train grid search object
knn_cv.fit(X_train,Y_train)
#print best parameters and best score
print('The best parameteres for KNN are:{x} and best accuracy score is {y}'.format(x=knn_cv.best_params_,y=knn_cv.best_score_))

The best parameteres for KNN are:{'algorithm': 'auto', 'n_neighbors': 8, 'p': 1} and best accuracy score is 0.8321428571428571


In [None]:
#create a knn classifier object with best parameters
knn_classifier = KNeighborsClassifier(algorithm='auto',n_neighbors=8,p=1)
#train 