In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [4]:
data = pd.read_csv('./Prostate_Cancer.csv')

# print(data.shape)

# data.head()
# data.describe()
# data.info()

In [5]:
# drop column 'id' from the dataset to avoid overfitting
from sklearn.model_selection import train_test_split
data.drop(columns=['id'], axis=1, inplace=True)
train, test = train_test_split(data, test_size=0.2, random_state=122)
print('Training data: ', train.shape)
print('Test data: ', test.shape)

# Splitting the data into train and test
Xtrain = train.drop(columns=['diagnosis_result'], axis=1)
ytrain = train['diagnosis_result']

Xtest = test.drop(columns=['diagnosis_result'], axis=1)
ytest = test['diagnosis_result']

Training data:  (80, 9)
Test data:  (20, 9)


In [7]:
# data preprocessing
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)



In [8]:
# feature Engineering
from sklearn.decomposition import PCA
pca = PCA()
Xtrain_pca = pca.fit_transform(Xtrain_scaled)
Xtest_pca = pca.transform(Xtest_scaled)

In [9]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
Xtrain_poly = poly.fit_transform(Xtrain_scaled)
Xtest_poly = poly.transform(Xtest_scaled)

In [10]:
! pip install imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)

Xtrain_smote, ytrain_smote = smote.fit_resample(Xtrain_scaled, ytrain)
Xtrain_pca_smote, ytrain_pca_smote = smote.fit_resample(Xtrain_pca, ytrain)

Defaulting to user installation because normal site-packages is not writeable
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [15]:
# model training
from sklearn.model_selection import cross_val_score
# imort Logistic Regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
scores = cross_val_score(model, Xtrain_scaled, ytrain, cv=5, scoring='accuracy')
print('Logistic Regression: ', scores.mean())
# print the accuracy of the model
model.fit(Xtrain_scaled, ytrain)
print('Logistic Regression: ', model.score(Xtest_scaled, ytest))

# PCA
model = LogisticRegression()
scores = cross_val_score(model, Xtrain_pca, ytrain, cv=5, scoring='accuracy')

print('PCA: ', scores.mean())
model.fit(Xtrain_pca, ytrain)
print('PCA: ', model.score(Xtest_pca, ytest))

# Polynomial
model = LogisticRegression()
scores = cross_val_score(model, Xtrain_poly, ytrain, cv=5, scoring='accuracy')
print('Polynomial: ', scores.mean())
model.fit(Xtrain_poly, ytrain)

print('Polynomial: ', model.score(Xtest_poly, ytest))

# pca and scaling
model = LogisticRegression()
scores = cross_val_score(model, Xtrain_pca_smote, ytrain_pca_smote, cv=5, scoring='accuracy')

print('PCA and SMOTE: ', scores.mean())
model.fit(Xtrain_pca_smote, ytrain_pca_smote)
print('PCA and SMOTE: ', model.score(Xtest_pca, ytest))




Logistic Regression:  0.775
Logistic Regression:  0.95
PCA:  0.775
PCA:  0.95
Polynomial:  0.7375
Polynomial:  0.8
PCA and SMOTE:  0.8363157894736842
PCA and SMOTE:  0.95
