## Import

In [None]:
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Regression models
from sklearn.linear_model import LinearRegression # OLS
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Evaluation : Classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report

# Evaluation : Regression

# Hyperparams tuning
from sklearn.model_selection import GridSearchCV

# Pipeline

# Classification

### Data understanding

#### Data loading

In [None]:
# The UCI file has no header row, so the column names are supplied here;
# the target ('Class') is the first column.
column_name = [
    'Class', 'age', 'menopause', 'tumor-size', 'inv-nodes',
    'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat',
]
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data',
    names=column_name,
    header=None,
)

#### Looking at the data

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [None]:
# Draw five random rows; no random_state is given, so the rows differ per run.
df.sample(5)

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
50,no-recurrence-events,50-59,lt40,15-19,0-2,no,2,left,left_low,no
270,recurrence-events,50-59,ge40,30-34,6-8,yes,3,left,right_low,no
121,no-recurrence-events,50-59,ge40,15-19,0-2,no,2,right,right_up,no
203,recurrence-events,50-59,ge40,35-39,0-2,no,2,left,left_low,no
230,recurrence-events,50-59,premeno,50-54,9-11,yes,2,right,left_up,no


#### Dataset information

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumor-size   286 non-null    object
 4   inv-nodes    286 non-null    object
 5   node-caps    286 non-null    object
 6   deg-malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast-quad  286 non-null    object
 9   irradiat     286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [None]:
# Count NaN entries per column.
# NOTE(review): the UCI source file reportedly marks missing entries with the
# string '?', which isnull() does not count — confirm before trusting the zeros.
df.isnull().sum()

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

#### Descriptive statistics

In [None]:
# describe() summarises only the numeric columns by default (here: deg-malig).
df.describe()

Unnamed: 0,deg-malig
count,286.0
mean,2.048951
std,0.738217
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,3.0


In [None]:
# Restrict the summary to object-dtype columns: count, unique, top and freq.
df.describe(include=['O'])

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,breast,breast-quad,irradiat
count,286,286,286,286,286,286,286,286,286
unique,2,6,3,11,7,3,2,6,2
top,no-recurrence-events,50-59,premeno,30-34,0-2,no,left,left_low,no
freq,201,96,150,60,213,222,152,110,218


## Train test split

In [None]:
# 'Class' (the first column) is the target; every other column is a feature.
y = df['Class']
X = df.drop(columns='Class')

In [None]:
# Sanity check: the feature frame should no longer contain the 'Class' column.
X.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [None]:
# Sanity check: the target series holds the 'Class' labels.
y.head()

0    no-recurrence-events
1    no-recurrence-events
2    no-recurrence-events
3    no-recurrence-events
4    no-recurrence-events
Name: Class, dtype: object

In [None]:
# Hold out a test split (default size). A fixed seed makes every number below
# reproducible across runs, and stratifying on y keeps the class ratio of the
# imbalanced target (201 of 286 rows are 'no-recurrence-events') the same in
# both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

## Data preparation

### Missing value

In [None]:
# The UCI file marks missing entries with the string '?', not NaN (which is
# why df.isnull() reports zero everywhere), so the imputer has to be told to
# look for '?'; with missing_values=np.nan it would leave the data untouched.
# The most frequent value is learned on the training split only, so no
# test-set statistics leak into the model.
imputer = SimpleImputer(missing_values='?', strategy='most_frequent')

X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

### Feature engineering

In [None]:
# One-hot encode every column. Categories are learned from the training split
# only; handle_unknown='ignore' keeps transform() from failing on categories
# that appear only in the test split.
X_encoder = OneHotEncoder(handle_unknown='ignore')
X_train = X_encoder.fit_transform(X_train)
X_test = X_encoder.transform(X_test)

### Feature selection

In [None]:
#var_sel = VarianceThreshold()
#var_sel.fit(X_train)

#X_train = var_sel.transform(X_train)
#X_test = var_sel.transform(X_test)

In [None]:
# Keep the 10 encoded columns with the highest chi-squared score against the
# target; the selection is learned on the training split and reused for test.
kbest_sel = SelectKBest(score_func=chi2, k=10)
X_train = kbest_sel.fit_transform(X_train, y_train)
X_test = kbest_sel.transform(X_test)

## Train models

In [None]:
# Three classifiers, all with default hyper-parameters.
logreg = LogisticRegression()
svc = SVC()
forest = RandomForestClassifier()

# Fit each model on the prepared training split.
for model in (logreg, svc, forest):
    model.fit(X_train, y_train)

# Predict the held-out split with each fitted model.
y_logreg, y_svc, y_forest = (
    fitted.predict(X_test) for fitted in (logreg, svc, forest)
)

## Model evaluation

In [None]:
# Accuracy on the training split — an optimistic figure, not a held-out score.
logreg.score(X_train, y_train)

0.8037383177570093

In [None]:
# Accuracy on the training split — an optimistic figure, not a held-out score.
svc.score(X_train, y_train)

0.8177570093457944

In [None]:
# Accuracy on the training split — an optimistic figure, not a held-out score.
forest.score(X_train, y_train)

0.8457943925233645

In [None]:
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports support for the predicted
# labels instead of the true ones.
print(classification_report(y_test, y_logreg))

                      precision    recall  f1-score   support

no-recurrence-events       0.94      0.68      0.79        68
   recurrence-events       0.04      0.25      0.07         4

            accuracy                           0.65        72
           macro avg       0.49      0.46      0.43        72
        weighted avg       0.89      0.65      0.75        72



In [None]:
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports support for the predicted
# labels instead of the true ones.
print(classification_report(y_test, y_svc))

                      precision    recall  f1-score   support

no-recurrence-events       0.94      0.69      0.79        67
   recurrence-events       0.09      0.40      0.14         5

            accuracy                           0.67        72
           macro avg       0.51      0.54      0.47        72
        weighted avg       0.88      0.67      0.75        72



In [None]:
# Report for the random forest predictions (y_forest); the ground truth goes
# first because classification_report expects (y_true, y_pred).
print(classification_report(y_test, y_forest))

                      precision    recall  f1-score   support

no-recurrence-events       0.94      0.69      0.79        67
   recurrence-events       0.09      0.40      0.14         5

            accuracy                           0.67        72
           macro avg       0.51      0.54      0.47        72
        weighted avg       0.88      0.67      0.75        72



## Cross validation

In [None]:
# Cross-validate the SVC with 10 folds on the training split; returns one
# score per fold.
cross_val_score(svc,X_train,y_train,cv=10)

array([0.77272727, 0.81818182, 0.81818182, 0.63636364, 0.71428571,
       0.76190476, 0.80952381, 0.85714286, 0.71428571, 0.80952381])

In [None]:
# Cross-validate the SVC with 3 folds on the training split; the result dict
# holds fit times and score times alongside the per-fold test scores.
cross_validate(svc,X_train,y_train,cv=3)

{'fit_time': array([0.00484657, 0.00381875, 0.00379944]),
 'score_time': array([0.00110841, 0.00125337, 0.00108695]),
 'test_score': array([0.76388889, 0.74647887, 0.73239437])}

## Parameter Tuning

In [None]:
# Baseline: training-split accuracy of the SVC fitted with default
# hyper-parameters. Being a training score, it is not directly comparable to a
# cross-validated score.
svc.score(X_train, y_train)

0.8177570093457944

In [None]:
# Candidate values for the exhaustive search: every C is tried with every kernel.
parameters = {
    'C': (1.0, 10.0),
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
}

# Score each combination with 5-fold cross-validation on the training split.
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
clf.fit(X_train, y_train)

# Best mean cross-validated score, then the combination that achieved it.
print(clf.best_score_, clf.best_params_, sep='\n')

0.7894795127353268
{'C': 10.0, 'kernel': 'poly'}


# Regression

In [None]:
import io
import zipfile

import pandas as pd
import requests

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip'
filename = 'student-mat.csv'

# Fail fast with a clear HTTP error (rather than a confusing BadZipFile) when
# the download does not succeed, and do not hang forever on a stalled
# connection.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Unpack the archive into the working directory; the context manager closes
# the archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()

# The CSV uses ';' as its field separator.
df = pd.read_csv(filename, sep=';')
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


## Dataset info

In [None]:
# Column dtypes, non-null counts and memory usage of the student data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [None]:
# Summary statistics for the numeric columns only (the default).
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [None]:
# Summary of the object-dtype columns: count, unique, top and freq.
df.describe(include='object')

Unnamed: 0,school,sex,address,famsize,Pstatus,Mjob,Fjob,reason,guardian,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
count,395,395,395,395,395,395,395,395,395,395,395,395,395,395,395,395,395
unique,2,2,2,2,2,5,5,4,3,2,2,2,2,2,2,2,2
top,GP,F,U,GT3,T,other,other,course,mother,no,yes,no,yes,yes,yes,yes,no
freq,349,208,307,281,354,141,217,145,273,344,242,214,201,314,375,329,263


## Split dataset

In [None]:
# Remove G1 and G2 so that G3 is the only grade column left; it is used as the
# regression target further down.
# The drop is in place, so re-running this cell raises KeyError because the
# columns are already gone.
df.drop(columns=['G1','G2'], inplace=True)

In [None]:
# Check that G1 and G2 are gone and G3 is now the last column.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,10


In [None]:
# Feature matrix: every column except the last one, which holds the G3 target.
X = df.drop(columns=df.columns[-1])
X.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,yes,no,no,4,3,4,1,1,3,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,yes,no,5,3,3,1,1,3,4
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,yes,no,4,3,2,2,3,3,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,yes,3,2,2,1,1,5,2
4,GP,F,16,U,GT3,T,3,3,other,other,...,yes,no,no,4,3,2,1,2,5,4


In [None]:
# Target vector: the last column of the frame (G3).
y = df[df.columns[-1]]
y.head()

0     6
1     6
2    10
3    15
4    10
Name: G3, dtype: int64

## Data preparation

In [None]:
# Hold out a test split. The fixed seed makes the split — and every score
# derived from it — reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector

# One-hot encode only the non-numeric columns and pass the numeric ones (age,
# absences, ...) through unchanged. Encoding numeric columns as categories
# throws away their ordering, and with handle_unknown='ignore' any numeric
# value not seen during fit would be encoded as all zeros.
onehot = ColumnTransformer(
    transformers=[
        (
            'categorical',
            OneHotEncoder(handle_unknown='ignore'),
            make_column_selector(dtype_exclude=np.number),
        ),
    ],
    remainder='passthrough',
)

# Learn the categories on the training split only, then reuse them for test.
X_train = onehot.fit_transform(X_train)
X_test = onehot.transform(X_test)

## Build models

### OLS

In [None]:
# Ordinary least-squares linear regression with default settings.
linreg = LinearRegression()

In [None]:
# Fit the model on the encoded training split.
linreg.fit(X_train,y_train)

In [None]:
# R^2 on the training split — an optimistic figure, not a held-out score.
linreg.score(X_train,y_train)

0.5047518707855236

### Ridge

In [None]:
# Ridge regression with default settings; fit() returns the estimator itself,
# so construction and fitting are chained.
rigreg = Ridge().fit(X_train, y_train)

# R^2 on the training split.
rigreg.score(X_train, y_train)

0.4928772104763315

### Random Forest

In [None]:
# Random forest regressor with default settings; fit() returns the estimator
# itself, so construction and fitting are chained.
forest_reg = RandomForestRegressor().fit(X_train, y_train)

# R^2 on the training split.
forest_reg.score(X_train, y_train)

0.8793746292531975

## Lazy