Для выполнения работы взят Gender Classification Dataset: https://www.kaggle.com/elakiricoder/gender-classification-dataset

Определена задача классификации - определение гендера человека по некоторым признакам.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
data = pd.read_csv('gender_classification.csv')
data

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female
...,...,...,...,...,...,...,...,...
4996,1,13.6,5.1,0,0,0,0,Female
4997,1,11.9,5.4,0,0,0,0,Female
4998,1,12.9,5.7,0,0,0,0,Female
4999,1,13.2,6.2,0,0,0,0,Female


In [2]:
from sklearn import preprocessing
number = preprocessing.LabelEncoder()
data['gender']= number.fit_transform(data.gender)
data

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,1
1,0,14.0,5.4,0,0,1,0,0
2,0,11.8,6.3,1,1,1,1,1
3,0,14.4,6.1,0,1,1,1,1
4,1,13.5,5.9,0,0,0,0,0
...,...,...,...,...,...,...,...,...
4996,1,13.6,5.1,0,0,0,0,0
4997,1,11.9,5.4,0,0,0,0,0
4998,1,12.9,5.7,0,0,0,0,0
4999,1,13.2,6.2,0,0,0,0,0


In [3]:
X = data.drop(columns=['gender'])
Y = data['gender']
Y = np.array(Y, dtype=np.float)
X = np.array(X, dtype=np.float)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=4)

# Линейная регрессия

In [4]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [5]:
class LinearRegression_(BaseEstimator, ClassifierMixin):
    def __init__(self, epochs, lr):
        self.lr = lr
        self.epochs = epochs
                
    def loss(self, y_pred, y):
        return np.sum(np.square(y_pred-y))/(2*self.N)
    
    def predict(self, X):
        return X.dot(self.w)+self.b
            
    def fit(self, X, y):
        self.w = np.zeros(X.shape[1])
        self.b = 0
        self.N = X.shape[0]   
        
        for i in range(self.epochs):
            y_pred = np.dot(X,self.w) + self.b
            dw = np.dot(X.T,(y_pred-y)) / self.N
            db = np.sum(y - y_pred)  / self.N
            self.w = self.w - self.lr*dw
            self.b = self.b - self.lr*db
            loss = self.loss(y_pred, y)

In [6]:
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps=[('lin', LinearRegression_(10, 0.01))])
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'lin', 'lin__epochs', 'lin__lr'])

In [7]:
parameters_grid = {
    'lin__epochs': [10, 15, 20, 50, 100],
    'lin__lr': [0.001, 0.01, 0.1, 1],
}

grid_cv = GridSearchCV(pipe, parameters_grid,scoring = 'neg_mean_squared_error')
grid_cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('lin',
                                        LinearRegression_(epochs=10,
                                                          lr=0.01))]),
             param_grid={'lin__epochs': [10, 15, 20, 50, 100],
                         'lin__lr': [0.001, 0.01, 0.1, 1]},
             scoring='neg_mean_squared_error')

In [8]:
grid_cv.best_params_

{'lin__epochs': 100, 'lin__lr': 0.001}

In [9]:
grid_cv.best_score_

-0.21262185971490447

In [10]:
import joblib
joblib.dump(grid_cv.best_estimator_, "lin.plk")

['lin.plk']

In [11]:
y_pred = grid_cv.predict(X_test)

In [12]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("RMS: ", mean_squared_error(y_test, y_pred, squared=False))

MSE:  0.2121535002004107
MAE:  0.4592512403577702
RMS:  0.46060123773217404


In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred.round())

array([[403,  77],
       [ 66, 455]])

In [14]:
from sklearn.linear_model import LinearRegression

lin = LinearRegression()
lin.fit(X_train, y_train)
print("MSE: ", mean_squared_error(y_test, lin.predict(X_test)))
print("MAE: ", mean_absolute_error(y_test, lin.predict(X_test)))
print("RMS: ", mean_squared_error(y_test, lin.predict(X_test), squared=False))

MSE:  0.03994022299855651
MAE:  0.14737544554039292
RMS:  0.1998505016219787


In [15]:

confusion_matrix(y_test, lin.predict(X_test).round())

array([[465,  15],
       [ 22, 499]])

# Метод опорных векторов

In [16]:
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

In [17]:
class SVMM(BaseEstimator, ClassifierMixin):
    def __init__(self, lr=1, epochs=1000):
        self.lr = lr        
        self.epochs = epochs
        
    def predict(self, X):
        y_pred = np.dot(X, self.w)
        return np.where(y_pred > 0, 1, 0)

    def fit(self, X, y):
        self.w = np.zeros(X.shape[1])
        y_ = np.where(y > 0, 1, -1)

        for e in range(self.epochs):
            for i, x in enumerate(X):
                a = y_[i] * (np.dot(x, self.w))
                if a < 1:
                    self.w += self.lr * (X[i]*y_[i] - 2/self.epochs*self.w)
                else:
                    self.w += self.lr * (-2/self.epochs*self.w)
    

In [18]:
pipe = Pipeline(steps=[('svm', SVMM())])
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'svm', 'svm__epochs', 'svm__lr'])

In [19]:
parameters_grid = {
    'svm__lr': [0.001, 0.01, 0.1],    
    'svm__epochs': [20, 50]    
}

grid_cv = GridSearchCV(pipe, parameters_grid)
grid_cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('svm', SVMM())]),
             param_grid={'svm__epochs': [20, 50],
                         'svm__lr': [0.001, 0.01, 0.1]})

In [20]:
grid_cv.best_params_

{'svm__epochs': 20, 'svm__lr': 0.001}

In [21]:
grid_cv.best_score_

0.9577500000000001

In [22]:
joblib.dump(grid_cv.best_estimator_, "svm.plk")

['svm.plk']

In [23]:
y_pred = grid_cv.predict(X_test)
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("RMS: ", mean_squared_error(y_test, y_pred, squared=False))
print("Roc_auc:  ", roc_auc_score(y_test, y_pred))

MSE:  0.04195804195804196
MAE:  0.04195804195804196
RMS:  0.20483662259967567
Roc_auc:   0.9566598688419705


In [24]:
confusion_matrix(y_test, y_pred)

array([[443,  37],
       [  5, 516]])

In [25]:
from sklearn.svm import SVC

pipe1 = Pipeline(steps=[('svm', SVC())])
pipe1.fit(X_train, y_train)

Pipeline(steps=[('svm', SVC())])

In [26]:
y_pred = pipe1.predict(X_test)
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("RMS: ", mean_squared_error(y_test, y_pred, squared=False))
print("Roc_auc:  ", roc_auc_score(y_test, y_pred))

MSE:  0.03196803196803197
MAE:  0.03196803196803197
RMS:  0.17879606250706967
Roc_auc:   0.9682241682661549


In [27]:
confusion_matrix(y_test, y_pred)

array([[467,  13],
       [ 19, 502]])

# KNN

In [28]:
from scipy.stats import mode

class KNN(BaseEstimator, ClassifierMixin):
    def __init__(self, k = 4):
        self.k = k
        
    def fit(self, X, y):
        self.X = X
        self.y = y
        
    def predict(self, x):
        y_pred = []
    
        for nbr in x: 
            distances = []
            for j in range(len(self.X)): 
                distances.append(np.sum(np.absolute(np.array(self.X[j,:]) - nbr))) 
            
            distances = np.array(distances) 
            nearest_x = np.argsort(distances)[:self.k] 
            nearest_y = self.y[nearest_x]

            ans = mode(nearest_y)
            ans = ans.mode[0]
            y_pred.append(ans)

        return y_pred


In [29]:
pipe = Pipeline(steps=[('knn', KNN())])
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'knn', 'knn__k'])

In [30]:
parameters_grid = {
    'knn__k': [2, 5, 7, 10]  
}

grid_cv = GridSearchCV(pipe, parameters_grid)
grid_cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('knn', KNN())]),
             param_grid={'knn__k': [2, 5, 7, 10]})

In [31]:
grid_cv.best_params_

{'knn__k': 7}

In [32]:
grid_cv.best_score_

0.96975

In [33]:
joblib.dump(grid_cv.best_estimator_, "knn.plk")

['knn.plk']

In [34]:
y_pred = grid_cv.predict(X_test)

In [35]:
y_pred = pipe1.predict(X_test)
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("RMS: ", mean_squared_error(y_test, y_pred, squared=False))
print("Roc_auc:  ", roc_auc_score(y_test, y_pred))

MSE:  0.03196803196803197
MAE:  0.03196803196803197
RMS:  0.17879606250706967
Roc_auc:   0.9682241682661549


In [36]:
confusion_matrix(y_test, y_pred)

array([[467,  13],
       [ 19, 502]])

In [37]:
from sklearn.neighbors import KNeighborsClassifier

pipe1 = Pipeline(steps=[('knn_', KNeighborsClassifier(n_neighbors=10))])
pipe1.fit(X_train, y_train)
y_pred = pipe1.predict(X_test)

In [38]:
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("RMS: ", mean_squared_error(y_test, y_pred, squared=False))
print("Roc_auc:  ", roc_auc_score(y_test, y_pred))

MSE:  0.028971028971028972
MAE:  0.028971028971028972
RMS:  0.17020878053446295
Roc_auc:   0.971431142034549


In [39]:
confusion_matrix(y_test, y_pred)

array([[471,   9],
       [ 20, 501]])