In [29]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from matplotlib import pyplot as plt

In [2]:
mobile_train = pd.read_csv('/kaggle/input/mobile-price-classification/train.csv')

# Without using Pipeline

In [3]:
mobile_train.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
mobile_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [5]:
mobile_train['price_range'].unique()

array([1, 2, 3, 0])

In [6]:
mobile_train.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

# No null values

In [7]:
mobile_train.duplicated().sum()
# No duplicate values

0

In [8]:
X = mobile_train.iloc[:,:20]
Y = mobile_train.iloc[:,20]
print(X.head())
print(Y.head())

   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  pc  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2   2         20       756  2549     9     7         19   
1        136        3   6        905      1988  2631    17     3          7   
2        145        5   6       1263      1716  2603    11     2          9   
3        131        6   9       1216      1786  2769    16     8         11   
4        141        2  14       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi  
0        0 

In [12]:
X.info()
print(Y)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)

In [14]:
model = DecisionTreeClassifier(criterion = 'gini',max_depth = 10)
model.fit(X_train,Y_train)

DecisionTreeClassifier(max_depth=10)

In [15]:
Y_pred_dt = model.predict(X_test)

In [17]:
print('Accuracy score: ', metrics.accuracy_score(Y_test,Y_pred_dt))
print('Confusion matrix: ', metrics.confusion_matrix(Y_test,Y_pred_dt))

Accuracy score:  0.81
Confusion matrix:  [[133  18   0   0]
 [ 17 114  14   1]
 [  1  20 109  18]
 [  0   0  25 130]]


In [22]:
params = {'max_depth': [2,4,6,8,10,12],
         'min_samples_split': [2,3,4],
         'min_samples_leaf': [1,2]}

clf = tree.DecisionTreeClassifier()
gcv = GridSearchCV(estimator=clf,param_grid=params)
gcv.fit(X_train,Y_train)
model = gcv.best_estimator_
model.fit(X_train,Y_train)
y_test_pred_preprun = model.predict(X_test)
print(f'Test score {metrics.accuracy_score(y_test_pred_preprun,Y_test)}')
print(f'Test Confusion matrix{metrics.confusion_matrix(y_test_pred_preprun,Y_test)}')

Test score 0.8283333333333334
Test Confusion matrix[[135  12   0   0]
 [ 16 122  20   0]
 [  0  11 111  26]
 [  0   1  17 129]]


In [23]:
print(gcv.best_estimator_)

DecisionTreeClassifier(max_depth=8, min_samples_leaf=2)


In [53]:
AdaBoost = AdaBoostClassifier(base_estimator = model, n_estimators = 200, learning_rate = 1)

In [54]:
AdaBoost.fit(X_train,Y_train)
Y_pred_ada = AdaBoost.predict(X_test)

In [55]:
print('Accuracy score: ', metrics.accuracy_score(Y_test,Y_pred_ada))

Accuracy score:  0.8933333333333333


# Using Pipeline

In [56]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

In [57]:
models = list()
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [58]:
lr = Pipeline([('m', LogisticRegression())])
models.append(('Logistic Regression', lr))


In [59]:
lr.fit(X_train,Y_train)
y_pred_lr = lr.predict(X_test)
metrics.accuracy_score(Y_test,y_pred_lr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.635

In [60]:
svc = Pipeline([('m', SVC())])
models.append(('SVC', svc))

In [61]:
svc.fit(X_train,Y_train)
y_pred_svc = svc.predict(X_test)
metrics.accuracy_score(Y_test,y_pred_svc)

0.9533333333333334

In [62]:
from sklearn.neighbors import KNeighborsClassifier 

In [63]:
knnclassifier = Pipeline([('m', KNeighborsClassifier(n_neighbors=6, metric='minkowski', p=2 ))])
models.append(('KNNClassifier', knnclassifier))


In [64]:
knnclassifier.fit(X_train,Y_train)
y_pred_knn = knnclassifier.predict(X_test)
metrics.accuracy_score(Y_test,y_pred_knn)

0.9266666666666666

In [67]:
ensemble = VotingClassifier(estimators = models, voting = 'hard')

In [68]:
models

[('Logistic Regression', Pipeline(steps=[('m', LogisticRegression())])),
 ('SVC', Pipeline(steps=[('m', SVC())])),
 ('KNNClassifier',
  Pipeline(steps=[('m', KNeighborsClassifier(n_neighbors=6))]))]

In [69]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 1, random_state = 1)
n_scores = cross_val_score(ensemble, X_train, Y_train, scoring ='accuracy', cv = cv, n_jobs = -1)
n_scores

array([0.96428571, 0.925     , 0.93571429, 0.93928571, 0.93928571])

In [None]:
# For each split of training data into training and validation the accuracy is computed by taking the highest accuracy of one of the 3 models in ensemble pipeline

In [70]:
ensemble.fit(X_train,Y_train)
y_pred_ensemble = ensemble.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [71]:
metrics.accuracy_score(Y_test,y_pred_ensemble)

0.9333333333333333

In [73]:
mobile_test = pd.read_csv('/kaggle/input/mobile-price-classification/test.csv')

In [74]:
mobile_test.drop(['id'], axis = 1, inplace = True)
print(mobile_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  1000 non-null   int64  
 1   blue           1000 non-null   int64  
 2   clock_speed    1000 non-null   float64
 3   dual_sim       1000 non-null   int64  
 4   fc             1000 non-null   int64  
 5   four_g         1000 non-null   int64  
 6   int_memory     1000 non-null   int64  
 7   m_dep          1000 non-null   float64
 8   mobile_wt      1000 non-null   int64  
 9   n_cores        1000 non-null   int64  
 10  pc             1000 non-null   int64  
 11  px_height      1000 non-null   int64  
 12  px_width       1000 non-null   int64  
 13  ram            1000 non-null   int64  
 14  sc_h           1000 non-null   int64  
 15  sc_w           1000 non-null   int64  
 16  talk_time      1000 non-null   int64  
 17  three_g        1000 non-null   int64  
 18  touch_scr

In [75]:
price_range = ensemble.predict(mobile_test)

In [76]:
# The final prediction is 
print(price_range)

[3 3 2 3 1 3 3 1 3 0 3 3 0 0 2 0 2 1 3 2 1 3 1 1 3 0 2 0 3 0 2 0 3 0 0 1 3
 1 2 1 1 2 0 0 0 1 0 3 1 2 1 0 3 0 3 1 3 1 1 3 3 3 0 1 1 1 1 3 1 2 1 2 2 3
 3 0 2 0 2 3 0 3 3 0 3 0 3 1 3 0 1 2 2 0 2 2 0 2 1 2 1 0 0 3 0 2 0 1 2 3 3
 3 1 3 3 3 3 1 3 0 0 3 2 1 2 0 3 2 3 1 0 2 1 1 3 1 1 0 3 2 1 2 1 2 2 3 3 3
 2 3 2 3 1 0 3 2 3 3 3 3 2 2 3 3 3 3 1 0 3 0 0 0 2 1 0 1 0 0 1 2 1 0 0 1 1
 2 2 1 0 0 0 1 0 3 1 0 2 2 3 3 1 2 2 2 3 2 2 1 0 0 1 2 0 2 3 3 0 2 0 3 2 3
 3 1 0 1 0 3 0 1 0 2 2 1 2 0 3 0 3 1 2 0 0 2 1 3 2 3 1 1 3 0 0 2 3 3 1 3 1
 1 3 2 1 2 3 3 3 1 0 0 2 3 2 1 3 2 0 3 0 0 2 0 0 3 2 3 3 2 1 3 3 2 3 1 2 1
 2 0 2 3 1 0 0 3 0 3 0 1 2 0 2 3 1 3 2 2 0 2 0 0 0 1 3 2 0 0 0 3 2 0 2 3 1
 2 2 2 3 1 3 3 2 2 2 3 3 0 3 0 3 1 3 1 3 3 0 1 0 3 1 3 2 3 0 0 0 0 2 0 0 2
 2 1 2 2 2 0 1 0 0 3 2 0 3 1 2 2 1 2 3 1 1 2 2 1 2 0 1 1 0 3 2 1 0 1 0 0 1
 1 0 1 0 2 2 3 2 3 0 3 0 3 0 1 1 1 2 0 3 2 3 3 1 3 1 3 1 3 2 0 1 2 1 1 0 0
 0 1 2 1 0 3 2 0 2 3 0 0 3 1 1 0 3 2 3 0 3 0 2 3 3 3 0 2 0 2 2 0 1 1 0 0 1
 1 1 3 3 3 2 3 1 1 2 3 3 

In [77]:
breastcancer = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

In [78]:
breastcancer.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [79]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(breastcancer['diagnosis'])
breastcancer['diagnosis_new'] = le.transform(breastcancer['diagnosis'])

In [80]:
breastcancer = breastcancer.drop(['diagnosis','Unnamed: 32','id'],axis = 1)

In [81]:
breastcancer.shape

(569, 31)

In [82]:
X = breastcancer.iloc[:,:30]
Y = breastcancer.iloc[:,30]

In [97]:
X_train_, X_test_, Y_train_, Y_test_ = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [102]:
model = KNeighborsClassifier()
model.fit(X_train_,Y_train_)
print(Y_test_.shape)
Y_pred_knn = model.predict(X_test_)
print(Y_pred_knn.shape)

(114,)
(114,)


In [103]:
metrics.accuracy_score(Y_test_,Y_pred_knn)

0.956140350877193