In [None]:
import ast
from itertools import chain
from typing import Iterable

from lightgbm import LGBMClassifier
import neptune
import pandas as pd
from pyspark import SparkContext
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType

In [None]:
# Initialise the Neptune project used by this notebook.
NEPTUNE_PROJECT = 'iliaavilov/Zindi-insurance'
neptune.init(NEPTUNE_PROJECT)

In [None]:
# Local Spark session for this notebook; getOrCreate() hands back an existing
# session if one is already running in this kernel.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.cores", 16)
    .appName('zindi-insurance')
    .getOrCreate()
)

In [None]:
# Display the address of the Spark web UI for the session created above.
spark.sparkContext.uiWebUrl

In [None]:
# Seed forwarded to the model through the 'random_state' entry of the parameter dict.
random_state = 555

# Загрузка данных

In [None]:
# Read the prepared train/test tables; the first row of each CSV holds the column names.
# NOTE(review): no schema is supplied and schema inference is not requested, so Spark
# is expected to load every column as a string -- confirm the numeric features are
# cast before they reach the model.
csv_reader = spark.read.option('header', True)
train = csv_reader.csv('train_prepared.csv')
test = csv_reader.csv('test_prepared.csv')

In [None]:
def encoding(train, test, column):
    """Replace `column` in both frames with its StringIndexer label index.

    A single indexer is fitted on the values of `column` from train and test
    together, so the same label receives the same index in both frames.

    Parameters
    ----------
    train, test : pyspark.sql.DataFrame
        Frames that both contain `column`.
    column : str
        Name of the column to encode.

    Returns
    -------
    tuple
        `(train, test)` where the original `column` has been dropped and the
        indexed column carries its name instead.
    """
    indexed_column = '{}_indexed'.format(column)

    # Fit on the one relevant column only. DataFrame.union pairs columns by
    # position, so unioning the full frames silently mixes data (or fails)
    # whenever train and test differ in column order or column count.
    observed_labels = train.select(column).union(test.select(column))
    indexer = StringIndexer(inputCol=column, outputCol=indexed_column).fit(observed_labels)

    def _replace_with_index(frame):
        # Add the indexed column, remove the raw one, then reuse the raw name.
        return (indexer.transform(frame)
                .drop(column)
                .withColumnRenamed(indexed_column, column))

    return _replace_with_index(train), _replace_with_index(test)

# Информация о продуктах, которые уже есть у пользователя

In [None]:
# Select the (ID, product) pairs the test users already own and give them label 1.
already_owned = test.filter(test['presence'] == '1')
real_1 = already_owned.select('ID', 'variable').withColumn('Label', lit(1))

In [None]:
# Keep only the test rows whose product presence is not known yet ...
test = test.filter(test['presence'] == '0')
# ... and remember their (ID, product) pairs for building the submission keys.
ID_X_var = test.select('ID', 'variable')

In [None]:
# Replace the 'variable' column by its indexed form in both frames, using one
# indexer fitted on train and test together (see `encoding`).
train, test = encoding(train, test, 'variable')

# Разбиваем на X и y

In [None]:
# Target as a one-column DataFrame: train['presence'] is only a Column expression
# and cannot be collected with toPandas() later on.
y_train = train.select('presence')

# Feature frames without the target and the row identifier. X_train is the name the
# parameter and prediction cells rely on; `train` itself is left untouched so this
# cell can be re-run. Dropping by a name list also avoids a loop variable called
# `col`, which would shadow pyspark.sql.functions.col from the star import.
non_feature_columns = ['presence', 'ID']
X_train = train.drop(*non_feature_columns)
test = test.drop(*non_feature_columns)

# Делаем предсказания

In [None]:
def predicting(X_train, y_train, X_test, model, ID_X_var, real_1):
    """Fit `model`, score `X_test` and write the submission file.

    Parameters
    ----------
    X_train, y_train :
        Training features and target, passed unchanged to `model.fit`.
    X_test :
        Features to score; its rows must be in the same order as `ID_X_var`,
        because probabilities and keys are paired by position.
    model :
        Estimator exposing `fit` and `predict_proba`.
    ID_X_var : pandas.DataFrame
        Columns 'ID' and 'variable' for every row of `X_test`.
    real_1 : pandas.DataFrame
        Columns 'ID', 'variable' and 'Label' for pairs whose label is already
        known; they are appended after the predicted rows.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID X PCODE' and 'Label' with a fresh 0..n-1 index. The same
        table is written to 'submission.csv' without the index.
    """
    def _keys(frame):
        # Cast to str so numeric IDs / product codes concatenate too, and
        # return a plain array so no index alignment can reorder the rows.
        return (frame['ID'].astype(str) + ' X ' + frame['variable'].astype(str)).to_numpy()

    model.fit(X_train, y_train)
    # Second column of predict_proba: probability of the second class.
    probas = model.predict_proba(X_test)[:, 1]

    predicted = pd.DataFrame({'ID X PCODE': _keys(ID_X_var), 'Label': probas})
    known = pd.DataFrame({'ID X PCODE': _keys(real_1), 'Label': real_1['Label'].to_numpy()})

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x;
    # ignore_index renumbers the rows, as reset_index(drop=True) did before.
    submission = pd.concat([predicted, known], ignore_index=True)
    submission.to_csv('submission.csv', index=False)

    return submission

In [None]:
# Columns LightGBM should treat as categorical; they are handed over as
# positions within X_train.
categorical_columns = [
    'sex', 'marital_status', 'branch_code', 'occupation_code',
    'occupation_category_code', 'variable', 'P5DA',
    'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9',
    'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW',
    'GHYX', 'ECY3',
]

# list(...).index works both when X_train.columns is a plain list (Spark
# DataFrame) and when it is a pandas Index; the Index-only get_loc raises
# AttributeError on a Spark DataFrame.
feature_names = list(X_train.columns)

base_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'n_jobs': -1,
    'n_estimators': 1500,
    'random_state': random_state,
    'categorical_feature': [feature_names.index(cat_col) for cat_col in categorical_columns],
}

# The 'best_parameters' property of Neptune experiment ZIN-912 is expected to
# hold a dict literal; literal_eval parses it without executing arbitrary code.
best_params = ast.literal_eval(
    neptune.project.get_experiments('ZIN-912')[0].get_properties()['best_parameters']
)
# On key collisions the tuned values override the base ones.
parameters = {**base_params, **best_params}

In [None]:
# Collect the Spark frames to pandas, then fit, score and write the submission.
train_features = X_train.toPandas()
train_target = y_train.toPandas()
test_features = test.toPandas()
classifier = LGBMClassifier(**parameters)
submission_keys = ID_X_var.toPandas()
known_positives = real_1.toPandas()

predicting(train_features, train_target, test_features,
           classifier, submission_keys, known_positives)

In [None]:
# Read back the file written by `predicting` as a quick sanity check. The call that
# used to live here passed Spark DataFrames straight into the pandas/LightGBM helper,
# which cannot work, and would only have repeated the model fit of the previous cell.
pd.read_csv('submission.csv').head()