In [1]:
import warnings
# Install a blanket "ignore" filter: every Python warning is silenced from here on.
# NOTE(review): this also hides warnings emitted by later cells (e.g. during the
# hyper-parameter search) — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np


In [3]:
from joblib import dump, load

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


In [5]:
# Directory prefixes. Paths are built elsewhere by plain string concatenation
# (`PATH_DATA + 'test.csv'`), so each prefix must keep its trailing '/'.
PATH_DATA = './data/'              # input CSV files
PATH_SUBMISSION = './submissions/' # submission CSVs written by make_submission
PATH_MODEL = './models/'           # serialized models

In [6]:
# Load test.csv; only its PassengerId column is used later, when a submission is written.
test_df = pd.read_csv(PATH_DATA + 'test.csv')

In [7]:
# Load gender_submission.csv; it is passed to make_submission as the frame whose
# Survived / PassengerId columns get filled in.
gender_submission = pd.read_csv(PATH_DATA + 'gender_submission.csv')

In [8]:
def make_submission(y_pred, gender_submission, test_df, name):
    """Write a submission CSV to ``PATH_SUBMISSION + name``.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, stored in the ``Survived`` column.
    gender_submission : pandas.DataFrame
        Template frame that defines the layout of the output file.
        It is NOT modified; a copy is filled in and written.
    test_df : pandas.DataFrame
        Frame whose ``PassengerId`` column is copied into the output.
    name : str
        File name (appended to ``PATH_SUBMISSION``).
    """
    # Work on a copy so repeated calls (one per model) never leak one model's
    # predictions into the caller's template frame.
    submission = gender_submission.copy()
    # Item assignment always targets a column; attribute assignment would only do
    # so when the column already exists.
    submission['Survived'] = y_pred
    submission['PassengerId'] = test_df['PassengerId']
    submission.to_csv(PATH_SUBMISSION + name, index=False)

In [9]:
class Classicator:
    """Workflow helper that wraps an estimator class and a hyper-parameter searcher.

    The constructor splits the training data into train/validation parts. The
    remaining methods are meant to be called in this order (as the notebook does):

    ``set_class_classicator`` -> ``make_simple_classificator`` ->
    ``set_parametr_search`` / ``set_class_searcher`` -> ``searh_best_classificator`` ->
    ``make_best_classificator`` -> ``make_final_classificator`` ->
    ``make_predict_with_final_classificator`` / ``save_final_model``.

    Every ``make_*`` / search method prints a classification report computed on
    the validation split.
    """

    def __init__(self, data_train, data_test,target_name):
        """Store the data and create the train/validation split.

        Parameters
        ----------
        data_train : pandas.DataFrame
            Training data that includes the target column.
        data_test : pandas.DataFrame
            Features to predict on with the final model.
        target_name : str
            Name of the target column in ``data_train``.
        """
        self.X = data_train.drop(target_name,axis=1)
        self.y = data_train[target_name]
        self.X_test = data_test
        # Fixed random_state keeps the split identical between runs.
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(self.X,self.y, random_state=42)

        self.class_classicator = None
        self.simple_classificator = None

    def set_parametr_search(self, parametrs_search):
        """Store the parameter grid handed to the searcher as ``param_grid``."""
        self.parametrs_search = parametrs_search

    def set_class_searcher(self,class_searcher, cv, scoring, n_jobs):
        """Store the searcher class and the ``cv`` / ``scoring`` / ``n_jobs`` it is built with."""
        self.class_searcher = class_searcher
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs

    def set_class_classicator(self, class_classicator):
        """Store the estimator class (not an instance) that will be used for classification."""
        self.class_classicator = class_classicator

    def searh_best_classificator(self):
        """Run the parameter search and remember the best parameters.

        Sets ``self.best_parametrs`` from the searcher's ``best_params_``.
        """
        clf = self.class_searcher(
            self.class_classicator(),
            param_grid=self.parametrs_search,
            cv=self.cv,
            scoring=self.scoring,
            n_jobs=self.n_jobs
        )
        # NOTE(review): the search is fitted on all of self.X, which contains the
        # rows of self.X_valid, so the classification report printed below is
        # computed on data seen during fitting.
        clf.fit(self.X, self.y)

        self.__print_report_searcher(clf)
        self.__print_report_classificator(clf)

        self.best_parametrs = clf.best_params_

    def make_best_classificator(self):
        """Fit the estimator with the best parameters on the train split; store it as ``best_classificator``."""
        model = self.class_classicator(**self.best_parametrs)
        model.fit(self.X_train, self.y_train)
        self.__print_report_classificator(model)
        self.best_classificator = model

    def make_simple_classificator(self):
        """Fit the estimator with default parameters on the train split; store it as ``simple_classificator``."""
        model = self.class_classicator()
        model.fit(self.X_train, self.y_train)
        self.__print_report_classificator(model)
        self.simple_classificator = model

    def make_final_classificator(self):
        """Fit the estimator with the best parameters on ALL training data; store it as ``final_classificator``."""
        model = self.class_classicator(**self.best_parametrs)
        # NOTE(review): fitted on self.X, which contains self.X_valid, so the
        # report printed below is computed on rows the model was trained on.
        model.fit(self.X, self.y)
        self.__print_report_classificator(model)
        self.final_classificator = model

    def make_predict_with_final_classificator(self):
        """Return the final model's predictions for ``self.X_test``."""
        y_pred = self.final_classificator.predict(self.X_test)
        return y_pred

    def save_final_model(self, path_save, name):
        """Serialize the final model to ``path_save`` / ``name``.

        Parameters
        ----------
        path_save : str
            Target directory. Previously this argument was ignored in favour of
            the global ``PATH_MODEL``; it is now honoured.
        name : str
            File name of the serialized model.
        """
        import os  # local import: keeps this block self-contained

        dump(self.final_classificator, os.path.join(path_save, name))

    def __print_report_searcher(self,model):
        """Print the searcher's best estimator and best parameters."""
        print('Best estimator')
        print('')
        print(model.best_estimator_)
        print('')
        print('Best parametrs')
        print('')
        print(model.best_params_)
        print('')

    def __print_report_classificator(self, model):
        """Print the estimator class name and a classification report on the validation split."""
        y_pred = model.predict(self.X_valid)
        # Label the report with the configured estimator class instead of a
        # hard-coded LogisticRegression; fall back to the model's own type when
        # the configured object has no __name__.
        print(getattr(self.class_classicator, '__name__', type(model).__name__))
        print(classification_report(self.y_valid, y_pred))
        

# KNN blanks

In [10]:
# Load the train/test frames for the "KNN blanks" variant of the data.
# NOTE(review): presumably missing values were filled with a KNN-based method upstream — confirm.
train_knn_df = pd.read_csv(PATH_DATA  +'train_knn_liner.csv')
test_knn_df = pd.read_csv(PATH_DATA  +'test_knn_liner.csv')
# Name of the label column that Classicator separates from the features.
target_name = 'Survived'

In [11]:
# Build the workflow helper; its constructor also performs the train/validation split.
classificator_knn_blanks = Classicator(train_knn_df, test_knn_df, target_name)

## Simple classifier

In [12]:
# Baseline: LogisticRegression with default parameters, fitted on the train split
# and reported on the validation split.
classificator_knn_blanks.set_class_classicator(LogisticRegression)
classificator_knn_blanks.make_simple_classificator()

LogisticRegression
              precision    recall  f1-score   support

           0       0.85      0.86      0.86       134
           1       0.78      0.78      0.78        89

    accuracy                           0.83       223
   macro avg       0.82      0.82      0.82       223
weighted avg       0.82      0.83      0.82       223



## Search for the best classifier

In [13]:
parameters = {
    # NOTE(review): the search uses the estimator's default solver (the printed
    # best estimator shows solver='lbfgs'), which per the scikit-learn docs does
    # not support 'l1' — those candidates presumably fail to fit. Confirm, then
    # either drop 'l1' or also search over a compatible solver.
    'penalty':['l1', 'l2', 'none'],
    # C is the inverse regularization strength and must be strictly positive, so
    # drop the leading 0.0 produced by np.arange(0, 2, 0.05). Slicing (rather than
    # changing the start) keeps every remaining grid value bit-for-bit unchanged.
    'C':np.arange(0,2,0.05)[1:],
}
# Keyword arguments for Classicator.set_class_searcher.
parametrs_class_searher = {
    'class_searcher': GridSearchCV,
    'cv':5,
    'scoring':'precision',
    'n_jobs': 5
}
classificator_knn_blanks.set_parametr_search(parameters)
classificator_knn_blanks.set_class_searcher(**parametrs_class_searher)

In [14]:
# Run the grid search; prints the best estimator/parameters and a validation report,
# and stores the best parameters on the helper.
classificator_knn_blanks.searh_best_classificator()

Best estimator

LogisticRegression(C=1.1500000000000001, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Best parametrs

{'C': 1.1500000000000001, 'penalty': 'l2'}

LogisticRegression
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       134
           1       0.83      0.81      0.82        89

    accuracy                           0.86       223
   macro avg       0.85      0.85      0.85       223
weighted avg       0.86      0.86      0.86       223



## Best classifier

In [15]:
# Refit with the best parameters on the train split only and report on the validation split.
classificator_knn_blanks.make_best_classificator()

LogisticRegression
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       134
           1       0.78      0.78      0.78        89

    accuracy                           0.82       223
   macro avg       0.81      0.81      0.81       223
weighted avg       0.82      0.82      0.82       223



## Final classifier

In [16]:
# Fit the final model with the best parameters on all training data.
# The printed report is on validation rows that are part of that training data.
classificator_knn_blanks.make_final_classificator()

LogisticRegression
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       134
           1       0.83      0.81      0.82        89

    accuracy                           0.86       223
   macro avg       0.85      0.85      0.85       223
weighted avg       0.86      0.86      0.86       223



## Submission

In [17]:
# Predict on test_knn_df with the final model and write the submission CSV
# into PATH_SUBMISSION.
y_pred = classificator_knn_blanks.make_predict_with_final_classificator()
make_submission(y_pred, gender_submission, test_df, 'submission_log_loss_knn.csv')

## Save model

In [18]:
# Serialize the final model with joblib's dump under the given file name.
classificator_knn_blanks.save_final_model(PATH_MODEL, 'log_loss_knn.joblib')

# IMP Blanks

In [19]:
# Load the train/test frames for the "IMP blanks" variant of the data.
# NOTE(review): presumably missing values were filled with a different imputation method upstream — confirm.
train_imp_df = pd.read_csv(PATH_DATA  +'train_imp_liner.csv')
test_imp_df = pd.read_csv(PATH_DATA  +'test_imp_liner.csv')
# Same label column as in the KNN section (re-assigned here with the same value).
target_name = 'Survived'

In [20]:
# Build a second, independent workflow helper for the IMP data; the constructor
# performs the train/validation split.
classificator_imp_blanks = Classicator(train_imp_df, test_imp_df, target_name)

## Simple classifier

In [21]:
# Baseline: LogisticRegression with default parameters, fitted on the train split
# and reported on the validation split.
classificator_imp_blanks.set_class_classicator(LogisticRegression)
classificator_imp_blanks.make_simple_classificator()

LogisticRegression
              precision    recall  f1-score   support

           0       0.86      0.84      0.85       134
           1       0.77      0.79      0.78        89

    accuracy                           0.82       223
   macro avg       0.81      0.81      0.81       223
weighted avg       0.82      0.82      0.82       223



## Search for the best classifier

In [22]:
parameters = {
    # NOTE(review): the search uses the estimator's default solver (the printed
    # best estimator shows solver='lbfgs'), which per the scikit-learn docs does
    # not support 'l1' — those candidates presumably fail to fit. Confirm, then
    # either drop 'l1' or also search over a compatible solver.
    'penalty':['l1', 'l2', 'none'],
    # C is the inverse regularization strength and must be strictly positive, so
    # drop the leading 0.0 produced by np.arange(0, 2, 0.05). Slicing (rather than
    # changing the start) keeps every remaining grid value bit-for-bit unchanged.
    'C':np.arange(0,2,0.05)[1:],
}
# Keyword arguments for Classicator.set_class_searcher.
parametrs_class_searher = {
    'class_searcher': GridSearchCV,
    'cv':5,
    'scoring':'precision',
    'n_jobs': 5
}
classificator_imp_blanks.set_parametr_search(parameters)
classificator_imp_blanks.set_class_searcher(**parametrs_class_searher)

In [23]:
# Run the grid search; prints the best estimator/parameters and a validation report,
# and stores the best parameters on the helper.
classificator_imp_blanks.searh_best_classificator()

Best estimator

LogisticRegression(C=1.1500000000000001, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Best parametrs

{'C': 1.1500000000000001, 'penalty': 'l2'}

LogisticRegression
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       134
           1       0.83      0.81      0.82        89

    accuracy                           0.86       223
   macro avg       0.85      0.85      0.85       223
weighted avg       0.86      0.86      0.86       223



## Best classifier

In [None]:
# Refit with the best parameters on the train split only and report on the validation split.
# NOTE(review): this cell shows `In [None]` — no execution count or output was saved for it.
classificator_imp_blanks.make_best_classificator()

## Final classifier

In [24]:
# Fit the final model with the best parameters on all training data.
# The printed report is on validation rows that are part of that training data.
classificator_imp_blanks.make_final_classificator()

LogisticRegression
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       134
           1       0.83      0.81      0.82        89

    accuracy                           0.86       223
   macro avg       0.85      0.85      0.85       223
weighted avg       0.86      0.86      0.86       223



## Submission

In [None]:
# Predict on test_imp_df with the final model and write the submission CSV
# into PATH_SUBMISSION.
# NOTE(review): this cell shows `In [None]` — no execution count was saved for it.
y_pred = classificator_imp_blanks.make_predict_with_final_classificator()
make_submission(y_pred, gender_submission, test_df, 'submission_log_loss_imp.csv')

## Save model

In [None]:
# Serialize the final model with joblib's dump under the given file name.
# NOTE(review): this cell shows `In [None]` — no execution count was saved for it.
classificator_imp_blanks.save_final_model(PATH_MODEL, 'log_loss_imp.joblib')