# Part 1： Modelling

In [82]:
import requests
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier 

# Breast cancer header and data file links
header_url = "https://gist.githubusercontent.com/jeff-boykin/b5c536467c30d66ab97cd1f5c9a3497d/raw/5233c792af49c9b78f20c35d5cd729e1307a7df7/field_names.txt"
data_url = "https://gist.githubusercontent.com/jeff-boykin/b5c536467c30d66ab97cd1f5c9a3497d/raw/5233c792af49c9b78f20c35d5cd729e1307a7df7/breast-cancer.csv"

Retrieve header data

In [3]:
header_file = requests.get(header_url)
header_file_data = header_file.text
headers = header_file_data.split("\n")

In [None]:
headers

Load data and attach headers

In [4]:
df = pd.read_csv(data_url, header=None)
df.columns = headers

In [None]:
df

In [None]:
df.isna().any()

**Compute mean and median smoothness and compactness for benign and malignant tumors**

In [5]:
diagnosis_mean_median = df.groupby(['diagnosis']).agg({'smoothness_mean':['mean', 'median'],
                                                       'compactness_mean':['mean', 'median']})

In [None]:
diagnosis_mean_median

**Generate bootstrap samples of data**

In [6]:
def generateBootstrapSamples(df, size):
    sample = df.sample(replace=True, n=size)
    return sample

Unnamed: 0,ID,diagnosis,radius_mean,radius_sd_error,radius_worst,texture_mean,texture_sd_error,texture_worst,perimeter_mean,perimeter_sd_error,...,concavity_worst,concave_points_mean,concave_points_sd_error,concave_points_worst,symmetry_mean,symmetry_sd_error,symmetry_worst,fractal_dimension_mean,fractal_dimension_sd_error,fractal_dimension_worst
475,911654,B,14.2,20.53,92.41,618.4,0.08931,0.1108,0.05063,0.03058,...,16.45,27.26,112.1,828.5,0.1153,0.3429,0.2512,0.1339,0.2534,0.07858
36,854941,B,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,...,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169
29,853401,M,18.63,25.11,124.8,1088.0,0.1064,0.1887,0.2319,0.1244,...,23.15,34.01,160.5,1670.0,0.1491,0.4257,0.6133,0.1848,0.3444,0.09782
140,869104,M,16.11,18.05,105.1,813.0,0.09721,0.1137,0.09447,0.05943,...,19.92,25.27,129.0,1233.0,0.1314,0.2236,0.2802,0.1216,0.2792,0.08158
488,913535,M,16.69,20.2,107.1,857.6,0.07497,0.07112,0.03649,0.02307,...,19.18,26.56,127.3,1084.0,0.1009,0.292,0.2477,0.08737,0.4677,0.07623
37,855133,M,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,...,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504
242,88411702,B,13.75,23.77,88.54,590.0,0.08043,0.06807,0.04697,0.02344,...,15.01,26.34,98.0,706.0,0.09368,0.1442,0.1359,0.06106,0.2663,0.06321
527,918192,B,13.94,13.17,90.31,594.2,0.1248,0.09755,0.101,0.06615,...,14.62,15.38,94.52,653.3,0.1394,0.1364,0.1559,0.1015,0.216,0.07253
545,922577,B,10.32,16.35,65.31,324.9,0.09434,0.04994,0.01012,0.005495,...,11.25,21.77,71.12,384.9,0.1285,0.08842,0.04384,0.02381,0.2681,0.07399
417,906024,B,12.7,12.17,80.88,495.0,0.08785,0.05794,0.0236,0.02402,...,13.65,16.92,88.12,566.9,0.1314,0.1607,0.09385,0.08224,0.2775,0.09464


In [None]:
generateBootstrapSamples(df, 5)

In [7]:
df.loc[df["diagnosis"]=='M', "tumor"] = 1
df.loc[df["diagnosis"]=='B', "tumor"] = 0

**Calculate correlations**

In [8]:
breast_corr = df.loc[:, df.columns.difference(['ID', 'diagnosis'])]

In [None]:
breast_corr

In [66]:
breat_corr_sorted = breast_corr.corr(method='pearson')['tumor'].sort_values(ascending=False).head(10)

In [67]:
breat_corr_sorted

tumor                      1.000000
fractal_dimension_mean     0.793181
concave_points_sd_error    0.782470
perimeter_sd_error         0.776369
concavity_worst            0.775725
radius_worst               0.741887
concave_points_worst       0.732925
radius_mean                0.729376
texture_mean               0.708389
perimeter_mean             0.695648
Name: tumor, dtype: float64

Splitting dataset

In [91]:
X = breast_corr[["fractal_dimension_mean", "concave_points_sd_error", "perimeter_sd_error"]]

In [92]:
Y = breast_corr["tumor"]

In [93]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

Feature Scaling

In [94]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

**Logistic Regression**

In [95]:
logisticReg = LogisticRegression(random_state=0)
logisticReg.fit(X_train, Y_train)

LogisticRegression(random_state=0)

In [96]:
Y_pred = logisticReg.predict(X_test)

In [79]:
Y_pred

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0.,
       1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1.])

In [97]:
cm = confusion_matrix(Y_test, Y_pred)
cm

array([[88,  4],
       [ 7, 43]], dtype=int64)

In [98]:
score = metrics.accuracy_score(Y_test, Y_pred)
score

0.9225352112676056

**Random Forest Classifier**

In [99]:
rfc = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
rfc.fit(X_train, Y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [100]:
Y_pred = rfc.predict(X_test)

In [101]:
cm = confusion_matrix(Y_test, Y_pred)
cm

array([[88,  4],
       [ 8, 42]], dtype=int64)

In [102]:
score = metrics.accuracy_score(Y_test, Y_pred)
score

0.9154929577464789

### Advantages vs. Disadvantages 

**Logistic Regression**

Advantages
1. Simple to implement
2. Effective

Disadvantages
1. Poor performance on non-linear data (eg: image data)
2. Poor performance with irrelevant and highly correlated features


**Random Forest Classifier**

Advantages
1. Able to handle huge amount of data with higher dimesionality of variables
2. Reduced error. Takes inputs from all the trees and then predicts the outcome, which ensures that the individual errors of trees are minimized and overall variance and error is reduced

Disadvantages
1. Predictions of the trees need to be uncorrelated
2. Features need to have some predictive power else they don't work 




# Part 2: Feedback

**Student Sample 1 (Before)**

In [104]:
#!/usr/bin/env python

import pandas as pd
import numpy as np
from sklearn import LinearRegression
from sklearn.cross_validation import cross_val_score

# Load data
d = pd.read_csv('../data/train.csv')


# Setup data for prediction
x1 = data.SalaryNormalized
x2 = pd.get_dummies(data.ContractType)

# Setup model
model = LinearRegression()

# Evaluate model
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
scores = cross_val_score(model, x2, x1, cv=1, scoring='mean_absolute_error')
print(scores.mean())

ImportError: cannot import name 'LinearRegression' from 'sklearn' (C:\Users\yapmi\anaconda3\lib\site-packages\sklearn\__init__.py)

Debug: LinearRegression import error

In [None]:
from sklearn.linear_model import LinearRegression

Debug: cross_val_score import error 

In [None]:
from sklearn.model_selection import cross_val_score

Debug: 'data' not defined

In [None]:
data = pd.read_csv('../data/train.csv') # Rename 'd' to 'data'

Debug: train_test_split import error

In [None]:
from sklearn.model_selection import train_test_split

Debug: 'mean_absolute_error' is not a valid scoring value

In [None]:
import sklearn
sklearn.metrics.SCORERS.keys()

In [None]:
# Change to 'neg_mean_absolute_error'
scores = cross_val_score(model, x2, x1, cv=1, scoring='neg_mean_absolute_error')

Debug: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=1

In [None]:
# Change 'cv' value to 2
scores = cross_val_score(model, x2, x1, cv=2, scoring='neg_mean_absolute_error')

**Student Sample 1 (After)**

In [118]:
#!/usr/bin/env python

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Load data
data = pd.read_csv('../data/train.csv')


# Setup data for prediction
x1 = data.SalaryNormalized
x2 = pd.get_dummies(data.ContractType)

# Setup model
model = LinearRegression()

# Evaluate model
scores = cross_val_score(model, x2, x1, cv=2, scoring='neg_mean_absolute_error')
print(scores.mean())

-11733.827883047155



**Student Sample 2 (Before)**

In [119]:
#!/usr/bin/env python

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

# Load data
data = pd.read_csv('../data/train.csv')


# Setup data for prediction
y = data.SalaryNormalized
X = pd.get_dummies(data.ContractType)

# Setup model
model = LinearRegression()

# Evaluate model
scores = cross_val_score(model, X, y, cv=5, scoring='mean_absolute_error')
print(scores.mean())

ModuleNotFoundError: No module named 'sklearn.cross_validation'

Debug: cross_val_score import error 

In [None]:
from sklearn.model_selection import cross_val_score

Debug: 'mean_absolute_error' is not a valid scoring value

In [None]:
import sklearn
sklearn.metrics.SCORERS.keys()

In [None]:
# Change to 'neg_mean_absolute_error'
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')


**Student Sample 2 (After)**

In [121]:
#!/usr/bin/env python

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Load data
data = pd.read_csv('../data/train.csv')


# Setup data for prediction
y = data.SalaryNormalized
X = pd.get_dummies(data.ContractType)

# Setup model
model = LinearRegression()

# Evaluate model
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
print(scores.mean())

-11822.140231295069
