In [22]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
from matplotlib.ticker import PercentFormatter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier,VotingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import os

#### Đọc dữ liệu 

In [23]:
# Load the training CSV. reset_index() turns the default row numbers into an
# extra leading "index" column, which is why the frame reports 22 columns.
raw_frame = pd.read_csv("train.csv")
data = raw_frame.reset_index()
data.shape

(2000, 22)

 
- battery_power : tổng năng lượng mà pin có thể lưu trữ (mAh)
- blue : hỗ trợ bluetooth (0-1)
- clock_speed : tốc độ mà bộ vi xử lý thực hiện các lệnh
- dual_sim : hỗ trợ hai sim (0-1)
- fc : camera trước (mega pixel)
- four_g : hỗ trợ 4G (0-1)
- int_memory : bộ nhớ trong (GB)
- m_dep : chiều sâu của máy (cm)
- mobile_wt : trọng lượng của máy (g)

In [24]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,index,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,0,842,0,2.2,0,1,0,7,0.6,188,...,20,756,2549,9,7,19,0,0,1,1
1,1,1021,1,0.5,1,0,1,53,0.7,136,...,905,1988,2631,17,3,7,1,1,0,2
2,2,563,1,0.5,1,2,1,41,0.9,145,...,1263,1716,2603,11,2,9,1,1,0,2
3,3,615,1,2.5,0,0,0,10,0.8,131,...,1216,1786,2769,16,8,11,1,0,0,2
4,4,1821,1,1.2,0,13,1,44,0.6,141,...,1208,1212,1411,8,2,15,1,1,0,1


In [25]:
# Columns treated as categorical features.
cat_vals = [
    "blue",
    "dual_sim",
    "four_g",
    "wifi",
    "touch_screen",
    "three_g",
]

# Columns treated as numerical features.
num_vals = [
    "battery_power",
    "clock_speed",
    "fc",
    "int_memory",
    "m_dep",
    "mobile_wt",
    "n_cores",
    "pc",
    "px_height",
    "px_width",
    "ram",
    "sc_h",
    "sc_w",
    "talk_time",
]

In [26]:
# features
X = data[data.columns[1:-1]]
# target
Y = data[["price_range"]]

In [27]:
# Categorical features
X_cat = X[cat_vals]
# Numerical features
X_num = X[num_vals]

## Principal component analysis (Phân tích thành phần chính)

#### PCA - Categorical


In [28]:
# Fit a 6-component PCA on the categorical columns, then project those same
# rows onto the fitted components.
pca = PCA(n_components=6).fit(X_cat)
X_transf = pca.transform(X_cat)

#### PCA - Numerical

In [29]:
# Min-max scale the numerical columns before decomposing them.
scaler = MinMaxScaler().fit(X_num)
X_num_scaled = scaler.transform(X_num)
# PCA keeping 14 components, one per numerical column.
pca = PCA(n_components=14)
X_transf = pca.fit_transform(X_num_scaled)

In [30]:
# 6-component PCA fitted directly on the numerical columns (X_num, not the
# min-max scaled copy).
pca = PCA(n_components=6)
X_transf = pca.fit_transform(X=X_num)

In [31]:
# Reduce the numerical columns to 4 principal components and store each one on
# `data` as X_PCA1 .. X_PCA4.
# NOTE(review): this PCA is fitted on the unscaled X_num and on all rows, i.e.
# before any train/test split — confirm both choices are intended.
pca = PCA(n_components=4)
X_transf = pca.fit_transform(X_num)
for component_idx in range(X_transf.shape[1]):
    data[f"X_PCA{component_idx + 1}"] = X_transf[:, component_idx]

In [32]:
# Training features: the four PCA components followed by the categorical columns.
features = [f"X_PCA{k}" for k in range(1, 5)] + cat_vals
X_training = data.loc[:, features]
X_training.head()

Unnamed: 0,X_PCA1,X_PCA2,X_PCA3,X_PCA4,blue,dual_sim,four_g,wifi,touch_screen,three_g
0,430.597105,-795.788278,-390.070331,55.636198,0,0,0,1,0,0
1,504.984729,696.622407,-235.629077,343.925952,1,1,1,0,1,1
2,473.329822,763.942174,-680.059464,-113.916908,1,1,1,0,1,1
3,639.822326,779.691218,-630.783687,-30.402214,1,0,0,0,0,1
4,-718.985171,382.304561,591.04032,-392.357215,1,0,1,0,1,1


In [33]:
# Chia dữ liệu ra Train - Test 67% 33%
# Hold out 33% of the rows for testing (67% train); the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_training,
    Y,
    test_size=0.33,
    random_state=0,
)

In [34]:
# Change y shape for RandomForestClassifier
# ravel dàn phẳng từ mảng hai chiều sang mảng một chiều
# Reshape the target for RandomForestClassifier: flatten the single-column
# (2-D) frame into a 1-D array.
y_train_fit = np.ravel(y_train.to_numpy())
print(y_train)
print(y_train_fit)

      price_range
295             1
364             0
862             2
1751            1
1403            2
...           ...
835             3
1216            1
1653            3
559             0
684             1

[1340 rows x 1 columns]
[1 0 2 ... 3 0 1]


## Random Forest (với tham số mặc định)

In [35]:
# Baseline forest with default hyper-parameters and a fixed seed.
clf_RDF = RandomForestClassifier(random_state=0)
# fit() hands back the RandomForestClassifier object, which the cell displays.
clf_RDF.fit(X=X_train, y=y_train_fit)

RandomForestClassifier(random_state=0)

In [36]:
# Accuracy of the baseline forest on the held-out rows.
y_pred = clf_RDF.predict(X_test)
rs = accuracy_score(y_true=y_test, y_pred=y_pred)
print(rs)

0.9045454545454545


## Grid Search (đi tìm siêu tham số)


In [37]:
# Hyper-parameter grid explored by the search.
parameters = {
    'criterion': ('gini', 'entropy'),
    'max_depth': [None, 2, 5, 10, 15],
    "min_samples_split": (2, 3, 4),
}

# Same arguments and seed as the earlier split, so the same rows end up in
# the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_training, Y, test_size=0.33, random_state=0
)
y_train_fit = np.ravel(y_train.to_numpy())

# Cross-validated search over the grid, fitted on the training part only.
model = RandomForestClassifier(random_state=0)
clf = GridSearchCV(estimator=model, param_grid=parameters)
clf.fit(X_train, y_train_fit)

print(clf.best_params_)
print(clf.best_score_)

{'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 2}
0.8843283582089553


## Random Forest với siêu tham số

In [38]:
# Rebuild the forest with the hyper-parameters selected by the grid search.
best_params = clf.best_params_
model = RandomForestClassifier(
    random_state=0,
    criterion=best_params['criterion'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
)

# Same arguments and seed as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_training, Y, test_size=0.33, random_state=0
)
y_train_fit = np.ravel(y_train.to_numpy())

# Train on the training part and report accuracy on the held-out rows.
model.fit(X_train, y_train_fit)
y_pred = model.predict(X_test)
rs = accuracy_score(y_true=y_test, y_pred=y_pred)
print(rs)

0.9045454545454545


## Chuẩn hóa dữ liệu 

In [39]:
# Same arguments and seed as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_training, Y, test_size=0.33, random_state=0
)
y_train_fit = np.ravel(y_train.to_numpy())

# Learn the min-max ranges from the training rows only, then apply the same
# transformation to both parts.
# NOTE(review): X_train / X_test are rebound to the scaled arrays here, so any
# cell run afterwards sees scaled data under the same names.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Forest with the grid-search hyper-parameters, trained on the scaled features.
tuned_params = clf.best_params_
model = RandomForestClassifier(
    random_state=0,
    criterion=tuned_params['criterion'],
    max_depth=tuned_params['max_depth'],
    min_samples_split=tuned_params['min_samples_split'],
)
model.fit(X_train, y_train_fit)
print(model.score(X_test, y_test))

0.906060606060606
