In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame([[42, 'male', 12, 'reading', 'class2'],
[35, 'unknown', 3, 'cooking', 'class1'],
[1000, 'female', 7, 'cycling', 'class3'],
[1000, 'unknown', 21, 'unknown', 'unknown']
])

df.columns = ['age',
'gender',
'month_birth',
'hobby',
'target']

In [3]:
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42,male,12,reading,class2
1,35,unknown,3,cooking,class1
2,1000,female,7,cycling,class3
3,1000,unknown,21,unknown,unknown


In [4]:
df['age'].unique()


array([  42,   35, 1000], dtype=int64)

In [5]:
df['gender'].unique()


array(['male', 'unknown', 'female'], dtype=object)

In [6]:
df['month_birth'].unique()

array([12,  3,  7, 21], dtype=int64)

In [7]:
df['hobby'].unique()

array(['reading', 'cooking', 'cycling', 'unknown'], dtype=object)

In [8]:
df['target'].unique()

array(['class2', 'class1', 'class3', 'unknown'], dtype=object)

In [9]:
df.loc[df['age']>150, ['age']] = np.nan
df.loc[df['gender']=='unknown', ['gender']] = np.nan
df.loc[df['month_birth']>12, ['month_birth']] = np.nan
df.loc[df['hobby']=='unknown', ['hobby']] = np.nan
df.loc[df['target'] == 'unknown', ['target']] = np.nan

In [10]:
df

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3
3,,,,,


In [11]:
df.isnull().sum()

age            2
gender         2
month_birth    1
hobby          1
target         1
dtype: int64

In [12]:
df2 = df.dropna(axis = 0)
df2

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2


In [13]:
df3 = df.dropna(axis = 1)
df3

0
1
2
3


In [14]:
df4 = df.dropna(how='all')
df4

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [15]:
df5 = df.dropna(thresh=2)
df5

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,,3.0,cooking,class1
2,,female,7.0,cycling,class3


In [16]:
df6 = df.dropna(subset=['gender'])
df6

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
2,,female,7.0,cycling,class3


In [17]:
alter_values = {'age':0, 'gender':'U', 'month_birth':0, 'hobby':'U', 'target':'class4'}
df7 = df.fillna(value = alter_values)
df7

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


In [18]:
from sklearn.preprocessing import LabelEncoder
df8 = df7
class_label = LabelEncoder()
data_value = df8['target'].values
y_new = class_label.fit_transform(data_value)
y_new

array([1, 0, 2, 3])

In [19]:
df8['target'] = y_new
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,1
1,35.0,U,3.0,cooking,0
2,0.0,female,7.0,cycling,2
3,0.0,U,0.0,U,3


In [20]:
y_ori = class_label.inverse_transform(y_new)
y_ori

array(['class2', 'class1', 'class3', 'class4'], dtype=object)

In [21]:
df8['target'] = y_ori
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,class2
1,35.0,U,3.0,cooking,class1
2,0.0,female,7.0,cycling,class3
3,0.0,U,0.0,U,class4


In [22]:
y_arr = df8['target'].values
y_arr.sort()
y_arr

array(['class1', 'class2', 'class3', 'class4'], dtype=object)

In [23]:
num_y = 0
dic_y = {}
for ith_y in y_arr:
    dic_y[ith_y] = num_y
    num_y += 1

#del dic_y['class4']
dic_y

{'class1': 0, 'class2': 1, 'class3': 2, 'class4': 3}

In [24]:
df8 = df7
df8['target'] = df8['target'].replace(dic_y)
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,0
1,35.0,U,3.0,cooking,1
2,0.0,female,7.0,cycling,2
3,0.0,U,0.0,U,3


In [25]:
df8['target'] = [1, 0, 2, 3]
df8

Unnamed: 0,age,gender,month_birth,hobby,target
0,42.0,male,12.0,reading,1
1,35.0,U,3.0,cooking,0
2,0.0,female,7.0,cycling,2
3,0.0,U,0.0,U,3


In [26]:
df9 = df8
df9['target'] = df9['target'].astype(str)
df10 = pd.get_dummies(df9['target'])
print(df10)

   0  1  2  3
0  0  1  0  0
1  1  0  0  0
2  0  0  1  0
3  0  0  0  1


In [27]:
df9['target'] = df9['target'].astype(str)
df11 = pd.get_dummies(df9['target'], drop_first=True)
print(df11)

   1  2  3
0  1  0  0
1  0  0  0
2  0  1  0
3  0  0  1


In [28]:
df12 = df8
df13 = pd.get_dummies(df12)
df13

Unnamed: 0,age,month_birth,gender_U,gender_female,gender_male,hobby_U,hobby_cooking,hobby_cycling,hobby_reading,target_0,target_1,target_2,target_3
0,42.0,12.0,0,0,1,0,0,0,1,0,1,0,0
1,35.0,3.0,1,0,0,0,1,0,0,1,0,0,0
2,0.0,7.0,0,1,0,0,0,1,0,0,0,1,0
3,0.0,0.0,1,0,0,1,0,0,0,0,0,0,1


In [29]:
from sklearn.preprocessing import OneHotEncoder
hot_encoder = OneHotEncoder()
y = df7[['target']]
y_hot = hot_encoder.fit_transform(y)
print(y_hot.toarray())

[[0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [30]:
from tensorflow.keras.utils import to_categorical
y_hotec = to_categorical(y)
print(y_hotec)

[[0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [31]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std.fit(df8[['month_birth']])
#x_std = std.transform(df8[['month_birth']])
#x_std
np.set_printoptions(precision = 4)
x_std2 = std.fit_transform(df8[['month_birth']])
x_std2

array([[ 1.4444],
       [-0.5556],
       [ 0.3333],
       [-1.2222]])

In [32]:
np.set_printoptions(suppress=True)
np.mean(x_std2)
np.std(x_std2)


1.0

In [33]:
from sklearn.preprocessing import RobustScaler

robust = RobustScaler()
x_robust = robust.fit_transform(df8[['month_birth']])
x_robust

array([[ 1.1667],
       [-0.3333],
       [ 0.3333],
       [-0.8333]])

In [34]:
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
x_minmax = minmax.fit_transform(df8[['month_birth']])
x_minmax

array([[1.    ],
       [0.25  ],
       [0.5833],
       [0.    ]])

In [35]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
x_norm = normalizer.fit_transform(df8[['age', 'month_birth']])
x_norm

array([[0.9615, 0.2747],
       [0.9963, 0.0854],
       [0.    , 1.    ],
       [0.    , 0.    ]])

In [36]:
arr = np.array([0.9615, 0.2747])
np.linalg.norm(arr, 2)
np.linalg.norm(arr, 1)

1.2362

In [38]:
### non-Pipeline
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn .metrics import mean_squared_error

# Data loading
raw_boston = datasets.load_boston()

x = raw_boston.data
y = raw_boston.target

# Training / Test data division
X_tn, X_te, Y_tn, Y_te = train_test_split(x, y, random_state = 7)

# Standard scaling
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# learning
clf_linear = LinearRegression()
clf_linear.fit(X_tn_std, Y_tn)

# Prediction
pred_linear = clf_linear.predict(X_te_std)

# Evaluation
mean_squared_error(Y_te, pred_linear)



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

29.5151377901976

In [41]:
### Pipeline applied

from sklearn.pipeline import Pipeline
# Training / Test data division
X_tn, X_te, Y_tn, Y_te = train_test_split(x, y, random_state = 7)

linear_pipeline = Pipeline([('scaler', StandardScaler()), ('linear_regression', LinearRegression())])

# Learning
linear_pipeline.fit(X_tn, Y_tn)
pred_linear = linear_pipeline.predict(X_te)

# Evaluation
mean_squared_error(Y_te, pred_linear)


29.5151377901976

In [43]:
### Grid search

### New imports
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### New dataset
raw_iris = datasets.load_iris()

# loading feature and target
X = raw_iris.data
Y = raw_iris.target

# Dividing training / test data
X_tn, X_te, Y_tn, Y_te = train_test_split(X, Y, random_state=7)

std_scale = StandardScaler()
std_scale.fit(X_tn)

X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)


### K-NN neibors algorithm - K optimization
best_accuracy = 0

for k in range(1, 11):
    clf_knn = KNeighborsClassifier(n_neighbors=k)
    clf_knn.fit(X_tn, Y_tn)
    knn_pred = clf_knn.predict(X_te_std)
    accuracy = accuracy_score(Y_te, knn_pred)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        final_k = {'k':k}
        
        
        
print(final_k)
print(accuracy)

{'k': 1}
0.2894736842105263
