In [None]:
%matplotlib inline
from IPython.display import clear_output
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import time

from sklearn.datasets import make_blobs, make_moons
from google.colab import files

# Part 2. Classification


Predict breast cancer based on features. 

### Data investigation

- Analyse the distribution of the features and the target variable. Have a look at the summary statistics. For visualization, you can use histograms and box plots for continuous features and bar charts for categorical ones.
- Investigate outliers. 
- Analyze correlations between features and target.

In [None]:
# Load the dataset and discard the "Unnamed: 0" and "Id" columns in one chain.
# NOTE(review): these look like a saved row index and a record identifier,
# i.e. not predictive features — confirm against the CSV.
df = pd.read_csv("cancer-classification.csv").drop(columns=["Unnamed: 0", "Id"])

In [None]:
# Show the loaded DataFrame as this cell's output.
df

In [None]:

# One histogram per column of df. The trailing semicolon keeps the notebook
# from echoing the returned array of Axes objects above the figure.
df.hist(figsize=(20, 20));

In [None]:
# Summary statistics of df's columns (pandas' default `describe()` output).
df.describe()

In [None]:
# Print the column dtypes explicitly: a bare `df.dtypes` that is not the last
# expression of a cell is evaluated but never rendered.
print(df.dtypes)

# Remove the "Bare Nuclei" column.
# NOTE(review): presumably dropped because it is not numeric — consider
# pd.to_numeric(..., errors="coerce") plus imputation instead of losing the feature.
# errors="ignore" keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=["Bare Nuclei"], errors="ignore")

In [None]:
# Pairwise correlation between the columns of df (pandas' default method).
corr = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))

# Draw on the axes created above and let seaborn label the ticks from the
# DataFrame's index/columns. Heatmap cells are centred at i + 0.5, so setting
# ticks manually at range(n) would put every label on a cell edge and make the
# labels appear shifted by half a cell. xticklabels/yticklabels=True forces a
# label for every row and column.
sns.heatmap(
    corr,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    xticklabels=True,
    yticklabels=True,
    ax=ax,
)
ax.set_title("Correlation matrix")

plt.show()

### Data preprocessing
- Convert non-numerical features to dummy variables. 
- Scale features or target if necessary.
- Split dataset into train, test and cross validation sets.

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target 'Class' and
# 'Uniformity of Cell Shape'.
# NOTE(review): the latter is presumably excluded because it is strongly
# correlated with another feature — confirm against the heatmap.
X = df.drop(['Class', 'Uniformity of Cell Shape'], axis = 1).values

# Keep the target one-dimensional, shape (n_samples,). scikit-learn estimators
# expect a 1-D y and emit DataConversionWarning when given a column vector.
y = df['Class'].values

# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

### Training
- Train logistic regression, evaluate on validation set, play with hyper parameters.


In [None]:
from copy import deepcopy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score

# Work on independent copies of the split so this experiment cannot alter
# the arrays shared with the other model cells.
X_train1, X_test1, y_train1, y_test1 = map(deepcopy, (X_train, X_test, y_train, y_test))

# Logistic regression with scikit-learn's default hyper-parameters.
clf1 = LogisticRegression()
clf1.fit(X_train1, y_train1)
y_pred_LR = clf1.predict(X_test1)

# Report macro-averaged precision / recall / F1 on the held-out split.
for _label, _metric in (
    ("precision_score:\t", precision_score),
    ("recall_score:\t\t", recall_score),
    ("f1_score:\t\t", f1_score),
):
    print(f"{_label}{_metric(y_test1, y_pred_LR, average='macro')}")



- Train SVM, evaluate on validation set, play with hyper parameters.

In [None]:
from copy import deepcopy
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Independent copies of the split for this experiment.
X_train1, X_test1, y_train1, y_test1 = deepcopy(X_train), deepcopy(X_test), deepcopy(y_train), deepcopy(y_test)

# Standardise the features, then fit an SVM (default RBF kernel, gamma='auto').
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))

# Fit on the TRAINING split only. Fitting on the test split and then scoring
# on that same split leaks the labels and reports inflated metrics.
clf.fit(X_train1, y_train1)

# Store the SVM predictions under their own name so they do not overwrite
# the logistic-regression predictions (y_pred_LR).
y_pred_SVM = clf.predict(X_test1)

# Macro-averaged scores on the held-out split.
print(f"precision_score:\t{precision_score(y_test1, y_pred_SVM, average='macro')}")
print(f"recall_score:\t\t{recall_score(y_test1, y_pred_SVM, average='macro')}")
print(f"f1_score:\t\t{f1_score(y_test1, y_pred_SVM, average='macro')}")


- Train Gradient Boosting Classifier, evaluate on validation set, play with hyper parameters.

In [None]:
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# XGBClassifier expects class labels encoded as 0..K-1, so map the raw
# 'Class' values to consecutive integers and map predictions back afterwards.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(np.ravel(y_train))

# This is a classification task, so use the classifier rather than a regressor
# (and "reg:linear" is a deprecated alias of "reg:squarederror" anyway).
# The objective is left at the classifier's default.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train_enc)

# Predict on the held-out split and convert back to the original labels.
y_pred_XGB = label_encoder.inverse_transform(xgb_model.predict(X_test))
y_test_flat = np.ravel(y_test)

# Same macro-averaged metrics as the other models, for a like-for-like comparison.
print(f"precision_score:\t{precision_score(y_test_flat, y_pred_XGB, average='macro')}")
print(f"recall_score:\t\t{recall_score(y_test_flat, y_pred_XGB, average='macro')}")
print(f"f1_score:\t\t{f1_score(y_test_flat, y_pred_XGB, average='macro')}")


- Check the precision, recall and F1-score metrics. Which of them is most important for you?

- Choose the best model based on the validation and test sets.

In [None]:
# The best model is the SVM.

- Write which experiments give you the best results. 

The experiment with the SVM gives us the best result :-) and logistic regression gives the worst one.