<a href="https://colab.research.google.com/github/Iftekhirul-kom/Data_Science_Projects/blob/main/typical_pipeline_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read Data

# The dataset crx.csv is available [here](https://drive.google.com/file/d/1St5_fkwdySSlNgeLthqeaOh1jATxnCj2/view?usp=sharing)
# The dataset post-operative.csv is available [here](https://drive.google.com/file/d/1yR8mLtACWjefoKam1XaCY1jVgA5tP0wT/view?usp=sharing)

In [None]:
import pandas as pd
from pandas.api.types import is_string_dtype

def _strip_text_columns(frame):
  """Trim surrounding whitespace from every string-typed column of `frame`.

  The frame is modified in place and also returned, so the call can wrap
  `pd.read_csv` directly.
  """
  for name in frame.columns:
    if is_string_dtype(frame[name].dtype):
      frame[name] = frame[name].str.strip()
  return frame


# '?' is parsed as NaN while loading both files.
data = _strip_text_columns(pd.read_csv('crx.csv', na_values='?'))
data1 = _strip_text_columns(pd.read_csv('post-operative.csv', na_values='?'))

# Split Dataset

In [None]:
from sklearn.model_selection import train_test_split

# crx data: 'Class' is the target, every other column is a feature.
y = data['Class']
x = data.drop(columns='Class')

# post-operative data: 'Label' is the target.
y1 = data1['Label']
x1 = data1.drop(columns='Label')

# Both datasets use the same stratified 90/10 split with a fixed seed.
split_options = {'test_size': 0.1, 'random_state': 1}
train_x, test_x, train_y, test_y = train_test_split(x, y, stratify=y, **split_options)
train_x1, test_x1, train_y1, test_y1 = train_test_split(x1, y1, stratify=y1, **split_options)

# Handle Missing Values (DROP)

In [None]:
# Re-attach the labels so that dropping an incomplete row removes its
# features and its label together.
train = pd.concat([train_x, train_y], axis=1).dropna()
test = pd.concat([test_x, test_y], axis=1).dropna()

# Sanity check: no missing value may remain in the training rows.
print(train.isnull().values.any())

# Build the feature mask from each frame's OWN columns. After the concat the
# 'Class' column sits last, so a mask derived from `data.columns` would pick
# the wrong columns whenever 'Class' is not also the last column of `data`.
drop_train_x = train.loc[:, train.columns != 'Class']
drop_train_y = train['Class']
drop_test_x = test.loc[:, test.columns != 'Class']
drop_test_y = test['Class']

False


# Handle Missing Values (Imputation)

In [None]:
from sklearn.impute import SimpleImputer

cont_col = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

# Impute one column at a time: string columns get their most frequent value,
# all other columns get the mean. Each imputer is fitted on the training
# column only and then applied to both the train and the test column.
imputed_train = {}
imputed_test = {}
for name in train_x.columns:
  strategy = 'most_frequent' if is_string_dtype(train_x[name].dtype) else 'mean'
  imputer = SimpleImputer(strategy=strategy)
  imputer.fit(train_x[[name]])
  imputed_train[name] = imputer.transform(train_x[[name]]).flatten()
  imputed_test[name] = imputer.transform(test_x[[name]]).flatten()

# Built from plain arrays, so both frames carry a fresh 0..n-1 index.
si_train_x = pd.DataFrame(imputed_train)
si_test_x = pd.DataFrame(imputed_test)

# Handle Continuous Features (Binarization)

In [None]:
from sklearn.preprocessing import Binarizer

def _binarize_continuous(train_frame, test_frame, continuous):
  """Binarize the continuous columns of a train/test pair.

  Each column listed in `continuous` is thresholded at its TRAINING mean
  (values above the threshold become 1, the rest 0); the same threshold is
  applied to the test column. All other columns are passed through unchanged.

  The outputs start as copies of the inputs, so row index and column order
  are preserved regardless of which column comes first. Growing an empty
  DataFrame column by column only aligns correctly when the first column
  assigned happens to carry the right index.

  Returns the pair (binarized_train, binarized_test).
  """
  train_out = train_frame.copy()
  test_out = test_frame.copy()
  for name in train_frame.columns:
    if name not in continuous:
      continue
    binarizer = Binarizer(threshold=train_frame[name].mean())
    train_out[name] = binarizer.transform(train_frame[[name]]).flatten()
    test_out[name] = binarizer.transform(test_frame[[name]]).flatten()
  return train_out, test_out


# Variant 1: rows with missing values dropped.
b_drop_train_x, b_drop_test_x = _binarize_continuous(drop_train_x, drop_test_x, cont_col)
# Variant 2: missing values imputed.
b_si_train_x, b_si_test_x = _binarize_continuous(si_train_x, si_test_x, cont_col)

print(b_drop_train_x.isnull().values.any())

False


# Handle Continuous Features (Quantization)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

def _quantize_continuous(train_frame, test_frame, continuous):
  """Discretize the continuous columns of a train/test pair.

  Each column listed in `continuous` is mapped to ordinal codes using 5
  uniform-strategy bins. The bin edges are fitted on the training column
  only and reused for the test column. All other columns are passed through
  unchanged.

  The outputs start as copies of the inputs, so row index and column order
  are preserved regardless of which column comes first.

  Returns the pair (quantized_train, quantized_test).
  """
  train_out = train_frame.copy()
  test_out = test_frame.copy()
  for name in train_frame.columns:
    if name not in continuous:
      continue
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    discretizer.fit(train_frame[[name]])
    train_out[name] = discretizer.transform(train_frame[[name]]).flatten()
    test_out[name] = discretizer.transform(test_frame[[name]]).flatten()
  return train_out, test_out


# Variant 1: rows with missing values dropped.
q_drop_train_x, q_drop_test_x = _quantize_continuous(drop_train_x, drop_test_x, cont_col)
# Variant 2: missing values imputed.
q_si_train_x, q_si_test_x = _quantize_continuous(si_train_x, si_test_x, cont_col)


# Handle Text Features

In [None]:
from sklearn.preprocessing import LabelEncoder

def _label_encode_columns(train_frame, test_frame, continuous):
  """Label-encode every non-continuous column of a train/test pair.

  A fresh LabelEncoder is fitted on each TRAINING column and applied to the
  matching train and test columns, so every frame is encoded with an encoder
  fitted on its own training data. Columns are passed to the encoder as 1-D
  Series, which avoids the column-vector DataConversionWarning. Columns
  listed in `continuous` are passed through unchanged.

  NOTE: `LabelEncoder.transform` raises ValueError if the test column holds
  a category that never appears in the training column.

  Returns the pair (encoded_train, encoded_test).
  """
  train_out = train_frame.copy()
  test_out = test_frame.copy()
  for name in train_frame.columns:
    if name in continuous:
      continue
    encoder = LabelEncoder().fit(train_frame[name])
    train_out[name] = encoder.transform(train_frame[name])
    test_out[name] = encoder.transform(test_frame[name])
  return train_out, test_out


# Encode the targets; each encoder is fitted on the training labels only.
le = LabelEncoder()

le.fit(train_y)
train_y = le.transform(train_y)
test_y = le.transform(test_y)

le.fit(drop_train_y)
drop_train_y = le.transform(drop_train_y)
drop_test_y = le.transform(drop_test_y)

cont_col = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']

# Give every feature frame a fresh 0..n-1 index. The names are rebound
# instead of using inplace=True, so the frames are not mutated behind the
# back of the cells that created them.
b_drop_train_x = b_drop_train_x.reset_index(drop=True)
q_drop_train_x = q_drop_train_x.reset_index(drop=True)
b_si_train_x = b_si_train_x.reset_index(drop=True)
q_si_train_x = q_si_train_x.reset_index(drop=True)

b_drop_test_x = b_drop_test_x.reset_index(drop=True)
q_drop_test_x = q_drop_test_x.reset_index(drop=True)
b_si_test_x = b_si_test_x.reset_index(drop=True)
q_si_test_x = q_si_test_x.reset_index(drop=True)

# One call per preprocessing variant. Previously the imputed + quantized
# variant was transformed with an encoder fitted on `q_drop_train_x`.
le_b_drop_train_x, le_b_drop_test_x = _label_encode_columns(b_drop_train_x, b_drop_test_x, cont_col)
le_q_drop_train_x, le_q_drop_test_x = _label_encode_columns(q_drop_train_x, q_drop_test_x, cont_col)
le_b_si_train_x, le_b_si_test_x = _label_encode_columns(b_si_train_x, b_si_test_x, cont_col)
le_q_si_train_x, le_q_si_test_x = _label_encode_columns(q_si_train_x, q_si_test_x, cont_col)

print(le_b_drop_train_x)

     A1   A2   A3  A4  A5  A6  A7   A8  A9  A10  A11  A12  A13  A14  A15
0     0  1.0  1.0   1   0   8   3  1.0   1    1    1    1    0  0.0    0
1     0  0.0  1.0   2   2  13   3  0.0   0    0    0    1    0  0.0    0
2     0  1.0  1.0   1   0  10   7  0.0   1    1    1    1    0  0.0    0
3     1  1.0  0.0   1   0  10   7  1.0   1    0    0    0    2  0.0    0
4     1  0.0  1.0   1   0   1   7  1.0   1    1    1    0    0  0.0    1
..   ..  ...  ...  ..  ..  ..  ..  ...  ..  ...  ...  ...  ...  ...  ...
582   1  1.0  1.0   1   0  12   7  0.0   1    1    1    0    0  0.0    0
583   0  1.0  0.0   1   0   0   7  0.0   0    0    0    0    0  1.0    0
584   1  1.0  0.0   1   0   2   3  1.0   1    1    1    0    0  1.0    0
585   1  1.0  1.0   2   2   8   7  0.0   0    0    0    0    0  0.0    0
586   1  0.0  1.0   1   0  10   7  0.0   1    1    0    0    0  0.0    0

[587 rows x 15 columns]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# Decision Tree (Train)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# One entropy-criterion tree per preprocessing variant. random_state is
# fixed to 1, matching the seed used by the split and the other classifiers
# in this notebook, so the reported accuracies are reproducible.
dt1 = DecisionTreeClassifier(criterion='entropy', random_state=1)
dt2 = DecisionTreeClassifier(criterion='entropy', random_state=1)
dt3 = DecisionTreeClassifier(criterion='entropy', random_state=1)
dt4 = DecisionTreeClassifier(criterion='entropy', random_state=1)

# The "drop" variants have fewer rows, so they use the matching drop labels.
dt1.fit(le_b_drop_train_x, drop_train_y)  # dropped rows + binarized
dt2.fit(le_q_drop_train_x, drop_train_y)  # dropped rows + quantized
dt3.fit(le_b_si_train_x, train_y)         # imputed + binarized
dt4.fit(le_q_si_train_x, train_y)         # imputed + quantized


     A1   A2   A3  A4  A5  A6  A7   A8  A9  A10  A11  A12  A13  A14  A15
0     0  1.0  1.0   1   0   8   3  1.0   1    1    1    1    0  0.0    0
1     0  0.0  1.0   2   2  13   3  0.0   0    0    0    1    0  0.0    0
2     0  1.0  1.0   1   0  10   7  0.0   1    1    1    1    0  0.0    0
3     1  1.0  0.0   1   0  10   7  1.0   1    0    0    0    2  0.0    0
4     1  0.0  1.0   1   0   1   7  1.0   1    1    1    0    0  0.0    1
..   ..  ...  ...  ..  ..  ..  ..  ...  ..  ...  ...  ...  ...  ...  ...
582   1  1.0  1.0   1   0  12   7  0.0   1    1    1    0    0  0.0    0
583   0  1.0  0.0   1   0   0   7  0.0   0    0    0    0    0  1.0    0
584   1  1.0  0.0   1   0   2   3  1.0   1    1    1    0    0  1.0    0
585   1  1.0  1.0   2   2   8   7  0.0   0    0    0    0    0  0.0    0
586   1  0.0  1.0   1   0  10   7  0.0   1    1    0    0    0  0.0    0

[587 rows x 15 columns]


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

# Decision Tree (Test)

In [None]:
from sklearn.metrics import accuracy_score

# (fitted tree, test features, true test labels) for each preprocessing variant.
evaluations = [
  (dt1, le_b_drop_test_x, drop_test_y),
  (dt2, le_q_drop_test_x, drop_test_y),
  (dt3, le_b_si_test_x, test_y),
  (dt4, le_q_si_test_x, test_y),
]

# Predict with all four trees first, then score each against its labels.
predictions = [model.predict(features) for model, features, _ in evaluations]
b_drop_predict, q_drop_predict, b_si_predict, q_si_predict = predictions

acc1, acc2, acc3, acc4 = [
  accuracy_score(truth, predicted)
  for (_, _, truth), predicted in zip(evaluations, predictions)
]

print(acc1, acc2, acc3, acc4)

0.8181818181818182 0.8333333333333334 0.8260869565217391 0.8260869565217391


In [None]:
from sklearn.svm import SVC

# Support-vector classifier on the imputed + quantized + label-encoded features.
svm = SVC(random_state=1, verbose=1)
svm.fit(le_q_si_train_x, train_y)
predict_y = svm.predict(le_q_si_test_x)

# Score the SVM's own predictions. The previous code scored `q_si_predict`,
# the decision tree's output, so it re-printed the tree's accuracy.
acc = accuracy_score(test_y, predict_y)
print(acc)

[LibSVM]0.8260869565217391


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble (1000 trees) on the imputed + quantized + label-encoded features.
et = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1, random_state=1, verbose=1)
et.fit(le_q_si_train_x, train_y)
predict_y = et.predict(le_q_si_test_x)

# Score the ensemble's own predictions. The previous code scored
# `q_si_predict`, the decision tree's output, so it re-printed the tree's accuracy.
acc = accuracy_score(test_y, predict_y)
print(acc)