In [3]:
# Setting up methods and packages
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from distutils.version import LooseVersion
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Setting up dataframe from directory
df = pd.read_csv('/kaggle/input/mini-kaggle2-dataset/train.csv')

In [5]:
# Dropping non-informative and target columns from X and assigning target column to Y
X = df.drop(columns=["id", "label"])
y = df['label']

# Splitting the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

In [5]:
# Gathering brief summary of dataset
df.head

<bound method NDFrame.head of            id label  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    90524101     M        17.99         20.66          117.80      991.7   
1    84358402     M        20.29         14.34          135.10     1297.0   
2       89346     B         9.00         14.40           56.36      246.3   
3      902975     B        12.21         14.09           78.78      462.0   
4      904969     B        12.34         14.95           78.29      469.1   
..        ...   ...          ...           ...             ...        ...   
450    866674     M        19.79         25.12          130.40     1192.0   
451    869254     B        10.75         14.97           68.26      355.3   
452    859717     M        17.20         24.52          114.20      929.4   
453  88249602     B        14.03         21.25           89.79      603.4   
454    854941     B        13.03         18.42           82.61      523.8   

     smoothness_mean  compactness_mean  conca

In [6]:
# Setting up the perceptron model
ppn = Perceptron(eta0=0.1, random_state=1)

# Fitting the perceptron model via scikit-learn
ppn.fit(X_train, y_train)

# Making predictions
y_pred = ppn.predict(X_test)

# Setting up accuracy score output
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

# While still producing a somewhat feasible accuracy score, the perceptron 
# model suffers from its simplistic nature. Sensitivity to feature scaling, 
# inability to approximate non-linear functions, and convergence issues
# limit the model to create an adequate accuracy score. 

Accuracy: 0.679


In [22]:
#Train logistic regression model. Set Max iterations to 3000 to prevent convergence warning
lrmodel = LogisticRegression(max_iter=3000)
lrmodel.fit(X_train, y_train)

# Making predictions of the model
lr_pred = lrmodel.predict(X_test)

# Setting up accuracy score
lraccuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy ", {lraccuracy})

# Load the test data from test CSV file
test_data = pd.read_csv('/kaggle/input/mini-kaggle2-dataset/test.csv')

# Drop the same columns from the test data
test_data = test_data.drop('id', axis=1)

# Make predictions on the test data
predictions = lrmodel.predict(test_data)

# Create a new DataFrame from predictions conducted
new_predictions_df = pd.DataFrame({'Predictions': predictions})

# Reload the test.csv DataFrame to add back id column
test_data2 = pd.read_csv('/kaggle/input/mini-kaggle2-dataset/test.csv')

# Add the id column from test_data2 to the new DataFrame
new_predictions_df.insert(0, 'id', test_data2['id'])

# Save the predictions to a new CSV file
new_predictions_df.to_csv('prediction.csv', index=False)

# Print new DataFrame
print(new_predictions_df)

# Logistic Regression makes for a strong contender in this evaluation, 
# as it able to work well with large datasets, can be regularized to avoid 
# overfitting, and is simple to interpret. However, in this case, Logistic 
# Regression is only slightly edged out by Random Forest in terms of 
# accuracy score when performing the final evaluation. This is further 
# explained in Random Forest's section. 

Logistic Regression Accuracy  {0.9635036496350365}
           id Predictions
0      906564           M
1       85715           B
2      891670           B
3      874217           M
4      905680           M
..        ...         ...
109     87164           M
110  84348301           M
111    859471           B
112    911150           B
113  90944601           B

[114 rows x 2 columns]


In [29]:
# Setting up the SVM model
# Additionally setting regularization parameters gamma and C for decision boundaries
svm = SVC(kernel='rbf', random_state=1, gamma=100.0, C=1.0)

# Fitting the SVM model via scikit-learn
svm.fit(X_train, y_train)

# Making predictions
svm_pred = svm.predict(X_test)

# Setting up accuracy score
svmaccuracy = accuracy_score(y_test, svm_pred)
print("SVM Accuracy ", {svmaccuracy})

# SVM's accuracy score is similar to Perceptron's, which was surprising. However, 
# some reasons for this could be due to feature scaling. If the scaling was more 
# optimally tuned, perhaps the accuracy could improve. Additionally, overfitting 
# may occur, especially with smaller datasets with less observations. These reasons
# could explain the lower accuracy score when compared with the other models. 

SVM Accuracy  {0.6496350364963503}


In [30]:
# Setting up the Decision Tree model
# Additionally setting parameters to check for gini impurities and setting max depth of decision tree
tree_model = DecisionTreeClassifier(criterion='gini', 
                                    max_depth=4, 
                                    random_state=1)

# Fitting the Decision Tree model via scikit-learn
tree_model.fit(X_train, y_train)

# Making predictions
tree_pred = tree_model.predict(X_test)

# Setting up accuracy score 
treeaccuracy = accuracy_score(y_test, tree_pred)
print("Tree Accuracy ", {treeaccuracy})

# While the Decision Tree's accuracy score is a markedly larger improvement over SVM's, 
# it is still not quite enough to be considered the best model for the final evaluation. 
# One reason for its lower score could be due to it being impaired by overfitting. 

Tree Accuracy  {0.9343065693430657}


In [31]:
# Setting up the KNN model
# Additionally setting parameters of Minkowski distance and specifying number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=5, 
                           p=2, 
                           metric='minkowski')

# Fitting the KNN model via scikit-learn
knn.fit(X_train, y_train)

# Making predictions
knn_pred = knn.predict(X_test)

# Setting up accuracy score
knnaccuracy = accuracy_score(y_test, knn_pred)
print("KNN Accuracy ", {knnaccuracy})

# Like the Decision Tree model, KNN also produces serviceable results, albeit lower than desired. 
# The reasons for this remain the same as before - overfitting and a lack of feature scability. 

KNN Accuracy  {0.9124087591240876}


In [6]:
#### This model was selected as the final choice for predicting the test dataset ####

# Setting up the Random Forest model
# Additionally setting parameters for number of estimators and jobs
forest = RandomForestClassifier(n_estimators=25, 
                                random_state=1,
                                n_jobs=2)

# Fitting the Random Forest model via scikit-learn
forest.fit(X_train, y_train)

# Making predictions
forest_pred = forest.predict(X_test)

# Setting up accuracy score
forestaccuracy = accuracy_score(y_test, forest_pred)
print("Forest Accuracy ", {forestaccuracy})

# Load the test data from test CSV file
test_data = pd.read_csv('/kaggle/input/mini-kaggle2-dataset/test.csv')

# Drop the same columns from the test data
test_data = test_data.drop('id', axis=1)

# Make predictions on the test data
predictions = forest.predict(test_data)

# Create a new DataFrame from predictions conducted
new_predictions_df = pd.DataFrame({'Predictions': predictions})

# Reload the test.csv DataFrame to add back id column
test_data2 = pd.read_csv('/kaggle/input/mini-kaggle2-dataset/test.csv')

# Add the id column from test_data2 to the new DataFrame
new_predictions_df.insert(0, 'id', test_data2['id'])

# Save the predictions to a new CSV file
new_predictions_df.to_csv('prediction.csv', index=False)

# Print new DataFrame
print(new_predictions_df)

# While the logistic regression performed better in the training portion of the evaluation, 
# the results show that Random Forest fared better overall in terms of accuracy. Some possible 
# reasons for this could be the Random Forest classification's ability to handle non-linearity 
# more efficiently compared to Logistic Regression, which assumes a linear relationship between 
# the features. It can also handle outliers and overfitting more efficiently when compared to 
# Logistic Regression, which could also explain its better accuracy. 

Forest Accuracy  {0.9562043795620438}
           id Predictions
0      906564           B
1       85715           M
2      891670           B
3      874217           M
4      905680           B
..        ...         ...
109     87164           M
110  84348301           B
111    859471           B
112    911150           B
113  90944601           B

[114 rows x 2 columns]


# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session