In [23]:
# Setting up methods and packages
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
import pandas as pd
from distutils.version import LooseVersion
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [24]:
df = pd.read_csv('/kaggle/input/mini-kaggle2-dataset/train.csv')

In [25]:
# Dropping non-informative and target columns from X and assigning target column to Y
X = df.drop(columns=["id", "label"])
y = df['label']

In [26]:
# Splitting the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

In [34]:
# Evaluating type of data for each column
print(X_train.dtypes)
print(y_train.dtypes)

# Gathering brief summary of dataset
df.head

radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst      

<bound method NDFrame.head of            id label  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    90524101     M        17.99         20.66          117.80      991.7   
1    84358402     M        20.29         14.34          135.10     1297.0   
2       89346     B         9.00         14.40           56.36      246.3   
3      902975     B        12.21         14.09           78.78      462.0   
4      904969     B        12.34         14.95           78.29      469.1   
..        ...   ...          ...           ...             ...        ...   
450    866674     M        19.79         25.12          130.40     1192.0   
451    869254     B        10.75         14.97           68.26      355.3   
452    859717     M        17.20         24.52          114.20      929.4   
453  88249602     B        14.03         21.25           89.79      603.4   
454    854941     B        13.03         18.42           82.61      523.8   

     smoothness_mean  compactness_mean  conca

In [28]:
# Setting up the perceptron model
ppn = Perceptron(eta0=0.1, random_state=1)

# Fitting the perceptron model via scikit-learn
ppn.fit(X_train, y_train)

# Making predictions
y_pred = ppn.predict(X_test)

# Setting up accuracy score 
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.679


In [29]:
# Setting up the logistic regression model 
# Set max iterations to 3000 to prevent convergence warnings
lrmodel = LogisticRegression(max_iter=3000)

# Fitting the logistic regression model via scikit-learn
lrmodel.fit(X_train, y_train)

# Making predictions
lr_pred = lrmodel.predict(X_test)

# Setting up accuracy score
lraccuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy ", {lraccuracy})

Logistic Regression Accuracy  {0.9635036496350365}


In [30]:
# Setting up the SVM model
# Additionally setting regularization parameters gamma and C for decision boundaries
svm = SVC(kernel='rbf', random_state=1, gamma=100.0, C=1.0)

# Fitting the SVM model via scikit-learn
svm.fit(X_train, y_train)

# Making predictions
svm_pred = svm.predict(X_test)

# Setting up accuracy score
svmaccuracy = accuracy_score(y_test, svm_pred)
print("SVM Accuracy ", {svmaccuracy})

SVM Accuracy  {0.6496350364963503}


In [31]:
# Setting up the Decision Tree model
# Additionally setting parameters to check for gini impurities and setting max depth of decision tree
tree_model = DecisionTreeClassifier(criterion='gini', 
                                    max_depth=4, 
                                    random_state=1)

# Fitting the Decision Tree model via scikit-learn
tree_model.fit(X_train, y_train)

# Making predictions
tree_pred = tree_model.predict(X_test)

# Setting up accuracy score
treeaccuracy = accuracy_score(y_test, tree_pred)
print("Tree Accuracy ", {treeaccuracy})

Tree Accuracy  {0.9343065693430657}


In [32]:
# Setting up the KNN model
# Additionally setting parameters of Minkowski distance and specifying number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=5, 
                           p=2, 
                           metric='minkowski')

# Fitting the KNN model via scikit-learn
knn.fit(X_train, y_train)

# Making predictions
knn_pred = knn.predict(X_test)

# Setting up accuracy score
knnaccuracy = accuracy_score(y_test, knn_pred)
print("KNN Accuracy ", {knnaccuracy})

KNN Accuracy  {0.9124087591240876}


In [33]:
# Setting up the Random Forest model
# Additionally setting parameters for number of estimators and jobs
forest = RandomForestClassifier(n_estimators=25, 
                                random_state=1,
                                n_jobs=2)

# Fitting the Random Forest model via scikit-learn
forest.fit(X_train, y_train)

# Making predictions
forest_pred = forest.predict(X_test)

# Setting up accuracy score
forestaccuracy = accuracy_score(y_test, forest_pred)
print("Forest Accuracy ", {forestaccuracy})

Forest Accuracy  {0.9562043795620438}


# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session