In [2]:
import numpy as np
import pandas as pd
import sklearn
import datetime
from sklearn import datasets, linear_model, metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, scale
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
%matplotlib inline

In [None]:
# Part 1

# Accuracy means how close our full set of guesses are to being correct. If our guesses are primarily correct, that means we have a high accuracy.
# Precision does not necessitate that our guesses are correct, but that they're closely clustered. High-precision guesses may have low accuracy, but be closely-packed, giving high precision.

In [None]:
# The difference between precision and recall is item relevancy when faced with selecting instances from a larger set.
# Precision measures how many of the selected instances are relevant, whereas Recall measures how many relevant instances are selected, compared to the full set of relevant instances available.

In [4]:
df = pd.read_csv('cancer.csv', sep=',')

df.head(10)

Unnamed: 0,ID,Diagnosis,Radius1,Texture1,Perimeter1,Area1,Smoothness1,Compactness1,Concavity1,Concave Points1,...,Radius3,Texture3,Perimeter3,Area3,Smoothness3,Compactness3,Concavity3,Concave Points3,Symmetry3,Fractal Dimension3
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [13]:
logreg = linear_model.LogisticRegression()

descriptors = ['Perimeter1','Area1','Compactness1','Concavity1','Texture1','Symmetry1']

X = df[descriptors].values.reshape(-1,len(descriptors))
Y = df['Diagnosis']

folds = KFold(n_splits=10)

accuracies = []
    
print()

for train_idx, test_idx in folds.split(X, Y):
    X_train, X_test = X[train_idx], X[test_idx]
    Y_train, Y_test = Y[train_idx], Y[test_idx]

    logreg.fit(X_train, Y_train)

    pred = cross_val_predict(logreg,X_test,Y_test, cv=10)

    accuracy = metrics.classification_report(Y_test, pred, target_names=["Benign","Malignant"])

    accuracies.append(accuracy)

    print(accuracy)
        


             precision    recall  f1-score   support

     Benign       0.50      0.27      0.35        11
  Malignant       0.84      0.93      0.89        46

avg / total       0.78      0.81      0.78        57

             precision    recall  f1-score   support

     Benign       0.92      0.94      0.93        35
  Malignant       0.90      0.86      0.88        22

avg / total       0.91      0.91      0.91        57

             precision    recall  f1-score   support

     Benign       0.87      0.94      0.91        36
  Malignant       0.89      0.76      0.82        21

avg / total       0.88      0.88      0.87        57

             precision    recall  f1-score   support

     Benign       0.93      0.90      0.91        29
  Malignant       0.90      0.93      0.91        28

avg / total       0.91      0.91      0.91        57

             precision    recall  f1-score   support

     Benign       0.90      0.93      0.92        29
  Malignant       0.93      0.89

In [None]:
# With one exception, our overall scores hover around 90% accuracy, meaning the testing data both accurately predicts malignancy, and does so nearly unanimously for the most part.
# Also, the support numbers show us that the size of the sample is not an accurate determinator of F1-score, as malignancy accuracy varies quite heavily in the two last reports, despite the same number of points.

In [17]:
# Part 2

brain_df = pd.read_csv('brain_size.csv', delimiter=';')

brain_df

Unnamed: 0.1,Unnamed: 0,Gender,FSIQ,VIQ,PIQ,Weight,Height,MRI_Count
0,1,Female,133,132,124,118,64.5,816932
1,2,Male,140,150,124,.,72.5,1001121
2,3,Male,139,123,150,143,73.3,1038437
3,4,Male,133,129,128,172,68.8,965353
4,5,Female,137,132,134,147,65.0,951545
5,6,Female,99,90,110,146,69.0,928799
6,7,Female,138,136,131,138,64.5,991305
7,8,Female,92,90,98,175,66.0,854258
8,9,Male,89,93,84,134,66.3,904858
9,10,Male,133,114,147,172,68.8,955466
