## ☯ Step1 : Load & prepare the dataset (e.g., feature scaling, categorical features, etc.) 

### 1. Read file

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

df = pd.read_csv ('hw1_std.csv')

df

Unnamed: 0,target,age,sex,num1,num2,num3,num4,num5,ord1,ord2,ord3,ord4,ord5,ord6
0,-1,69,1,160.0,286.0,108.0,1.5,2.0,0.0,0.0,0.0,1.0,1,3
1,-1,69,1,120.0,229.0,129.0,2.6,3.0,,,,1.0,1,2
2,-1,64,0,140.0,268.0,160.0,3.6,2.0,0.0,0.0,0.0,0.0,0,2
3,-1,65,1,130.0,254.0,147.0,1.4,3.0,0.0,0.0,0.0,0.0,1,1
4,-1,53,M,140.0,203.0,155.0,3.1,3.0,0.0,1.0,0.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,1,56,M,120.0,240.0,169.0,0.0,2.0,1.0,,1.0,,0,0
299,1,55,M,132.0,342.0,166.0,1.2,2.0,1.0,,1.0,,2,0
300,1,41,M,120.0,157.0,182.0,0.0,2.0,1.0,,1.0,,2,0
301,1,38,M,138.0,175.0,173.0,0.0,2.0,2.0,,1.0,,2,4


In [17]:
df.var()

target       0.995345
age         95.971368
num1       304.037860
num2      2673.904337
num3       517.635440
num4         1.356066
num5         0.371274
ord1         1.065554
ord2         0.131177
ord3         0.276518
ord4         0.224862
ord5         0.379735
ord6         1.045724
dtype: float64

### 2. Clean Data 

#### 2.1. Check null value in each column

In [18]:
df.isnull().sum()

target     0
age        0
sex        0
num1       3
num2       3
num3       3
num4       3
num5       2
ord1       1
ord2      12
ord3       1
ord4      11
ord5       0
ord6       0
dtype: int64

#### 2.2. Replace all missing values ' Nan ' with ' 0 '

In [19]:
df.fillna(value=0, inplace=True)

df

Unnamed: 0,target,age,sex,num1,num2,num3,num4,num5,ord1,ord2,ord3,ord4,ord5,ord6
0,-1,69,1,160.0,286.0,108.0,1.5,2.0,0.0,0.0,0.0,1.0,1,3
1,-1,69,1,120.0,229.0,129.0,2.6,3.0,0.0,0.0,0.0,1.0,1,2
2,-1,64,0,140.0,268.0,160.0,3.6,2.0,0.0,0.0,0.0,0.0,0,2
3,-1,65,1,130.0,254.0,147.0,1.4,3.0,0.0,0.0,0.0,0.0,1,1
4,-1,53,M,140.0,203.0,155.0,3.1,3.0,0.0,1.0,0.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,1,56,M,120.0,240.0,169.0,0.0,2.0,1.0,0.0,1.0,0.0,0,0
299,1,55,M,132.0,342.0,166.0,1.2,2.0,1.0,0.0,1.0,0.0,2,0
300,1,41,M,120.0,157.0,182.0,0.0,2.0,1.0,0.0,1.0,0.0,2,0
301,1,38,M,138.0,175.0,173.0,0.0,2.0,2.0,0.0,1.0,0.0,2,4


#### 2.3. Re-check null value in each column

In [20]:
df.isnull().sum()

target    0
age       0
sex       0
num1      0
num2      0
num3      0
num4      0
num5      0
ord1      0
ord2      0
ord3      0
ord4      0
ord5      0
ord6      0
dtype: int64

#### 2.4. Find duplicate rows and eliminate duplicate row

In [21]:
df[df.duplicated(keep=False)==True]

df.drop_duplicates(keep='first')

Unnamed: 0,target,age,sex,num1,num2,num3,num4,num5,ord1,ord2,ord3,ord4,ord5,ord6
0,-1,69,1,160.0,286.0,108.0,1.5,2.0,0.0,0.0,0.0,1.0,1,3
1,-1,69,1,120.0,229.0,129.0,2.6,3.0,0.0,0.0,0.0,1.0,1,2
2,-1,64,0,140.0,268.0,160.0,3.6,2.0,0.0,0.0,0.0,0.0,0,2
3,-1,65,1,130.0,254.0,147.0,1.4,3.0,0.0,0.0,0.0,0.0,1,1
4,-1,53,M,140.0,203.0,155.0,3.1,3.0,0.0,1.0,0.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,1,56,M,130.0,221.0,163.0,0.0,3.0,1.0,0.0,0.0,0.0,2,0
298,1,56,M,120.0,240.0,169.0,0.0,2.0,1.0,0.0,1.0,0.0,0,0
299,1,55,M,132.0,342.0,166.0,1.2,2.0,1.0,0.0,1.0,0.0,2,0
300,1,41,M,120.0,157.0,182.0,0.0,2.0,1.0,0.0,1.0,0.0,2,0


### 3. Data Preprocessing
#### 3.1 convert all variable in 'sex' column to become type of number (F,M --> 0,1)

In [22]:
df['sex'] = df['sex'].replace({'F': '0'}) 
df['sex'] = df['sex'].replace({'M': '1'}) 

print(df)

     target  age sex   num1   num2   num3  num4  num5  ord1  ord2  ord3  ord4  \
0        -1   69   1  160.0  286.0  108.0   1.5   2.0   0.0   0.0   0.0   1.0   
1        -1   69   1  120.0  229.0  129.0   2.6   3.0   0.0   0.0   0.0   1.0   
2        -1   64   0  140.0  268.0  160.0   3.6   2.0   0.0   0.0   0.0   0.0   
3        -1   65   1  130.0  254.0  147.0   1.4   3.0   0.0   0.0   0.0   0.0   
4        -1   53   1  140.0  203.0  155.0   3.1   3.0   0.0   1.0   0.0   1.0   
..      ...  ...  ..    ...    ...    ...   ...   ...   ...   ...   ...   ...   
298       1   56   1  120.0  240.0  169.0   0.0   2.0   1.0   0.0   1.0   0.0   
299       1   55   1  132.0  342.0  166.0   1.2   2.0   1.0   0.0   1.0   0.0   
300       1   41   1  120.0  157.0  182.0   0.0   2.0   1.0   0.0   1.0   0.0   
301       1   38   1  138.0  175.0  173.0   0.0   2.0   2.0   0.0   1.0   0.0   
302       1   38   1  138.0  175.0  173.0   0.0   2.0   2.0   0.0   1.0   0.0   

     ord5  ord6  
0       1

### 4. Feature scaling 

#### scaled on 'target'

In [23]:
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
df['label'] = label_enc.fit_transform(df['target'])

df

Unnamed: 0,target,age,sex,num1,num2,num3,num4,num5,ord1,ord2,ord3,ord4,ord5,ord6,label
0,-1,69,1,160.0,286.0,108.0,1.5,2.0,0.0,0.0,0.0,1.0,1,3,0
1,-1,69,1,120.0,229.0,129.0,2.6,3.0,0.0,0.0,0.0,1.0,1,2,0
2,-1,64,0,140.0,268.0,160.0,3.6,2.0,0.0,0.0,0.0,0.0,0,2,0
3,-1,65,1,130.0,254.0,147.0,1.4,3.0,0.0,0.0,0.0,0.0,1,1,0
4,-1,53,1,140.0,203.0,155.0,3.1,3.0,0.0,1.0,0.0,1.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,1,56,1,120.0,240.0,169.0,0.0,2.0,1.0,0.0,1.0,0.0,0,0,1
299,1,55,1,132.0,342.0,166.0,1.2,2.0,1.0,0.0,1.0,0.0,2,0,1
300,1,41,1,120.0,157.0,182.0,0.0,2.0,1.0,0.0,1.0,0.0,2,0,1
301,1,38,1,138.0,175.0,173.0,0.0,2.0,2.0,0.0,1.0,0.0,2,4,1


In [24]:
df = df.drop(columns=['target'])

df

Unnamed: 0,age,sex,num1,num2,num3,num4,num5,ord1,ord2,ord3,ord4,ord5,ord6,label
0,69,1,160.0,286.0,108.0,1.5,2.0,0.0,0.0,0.0,1.0,1,3,0
1,69,1,120.0,229.0,129.0,2.6,3.0,0.0,0.0,0.0,1.0,1,2,0
2,64,0,140.0,268.0,160.0,3.6,2.0,0.0,0.0,0.0,0.0,0,2,0
3,65,1,130.0,254.0,147.0,1.4,3.0,0.0,0.0,0.0,0.0,1,1,0
4,53,1,140.0,203.0,155.0,3.1,3.0,0.0,1.0,0.0,1.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,56,1,120.0,240.0,169.0,0.0,2.0,1.0,0.0,1.0,0.0,0,0,1
299,55,1,132.0,342.0,166.0,1.2,2.0,1.0,0.0,1.0,0.0,2,0,1
300,41,1,120.0,157.0,182.0,0.0,2.0,1.0,0.0,1.0,0.0,2,0,1
301,38,1,138.0,175.0,173.0,0.0,2.0,2.0,0.0,1.0,0.0,2,4,1


In [25]:
X = df.drop(columns=['label']).values
y = df['label'].values

print(X.shape, y.shape)

(303, 13) (303,)


## ☯ Step2 : Train/Valid/Test split or Cross-validation 
### Train/Valid/Test split

In [26]:
from sklearn.model_selection import train_test_split

# Split the data into the training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    random_state = 42,
    test_size = 0.2)  # try to change this number 80:20

print(f'Training set: {X_train.shape}, {y_train.shape}')
print(f'Test set: {X_test.shape}, {y_test.shape}')

Training set: (242, 13), (242,)
Test set: (61, 13), (61,)


## ☯ Step3 : Feature selection ☯

In [27]:
#I selected 'target' to be my feature

## ☯ Step4 : Model Selection ☯
### 1. Random Forest Model

In [28]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

#print y_prediction
print(y_pred)

[1 1 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1
 0 1 1 0 0 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1]


### 2. Visualization Random Forest Model

In [29]:
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################
#########################################################

##  ☯ Step5 : Evaluation (e.g., accuracy, precision, etc.)
### Accuracy

In [30]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8032786885245902


# ʕ•́ᴥ•̀ʔっ  QUESTION & ANSWER  ʕ•́ᴥ•̀ʔっ

### 1. Why do you choose these features? Any preprocessing techniques you apply? If so, why?

I choose 'target' to be my features because it's the most clearly separate a types of people. I apply preprocessing technique by convert all variable in 'sex' column to become a same type of data (F,M --> 0,1) in step 3.1. Because number type is mostly easy to use in the future. 

### 2. Which approach do you choose to evaluate your model: (1) Train/Valid/Test split or (2) cross-validation? Why?

I choose Train/Valid/Test split to evaluate my model because it's easy for me to understand and use to evaluate a model

### 3. Why do you choose this model (e.g., decision tree, logistic regression, etc.)? How do you come up with the hyperparameters for the model (e.g., tree depth, etc.)?

### 4. What are the performance metrics that you use to evaluate your model? Why?