In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, train_test_split, RandomizedSearchCV
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer

 

def evaluate_model(X, y, model):
#     cv = KFold(n_splits=4, random_state=42)
#     scorer = make_scorer(f1_score, average='weighted')
#     scores = cross_val_score(model, X, y, scoring=scorer, cv=cv, n_jobs=-1)
#     return scores

    features_train, features_test, target_train, target_test = train_test_split(
        X, y, random_state=42, test_size=0.25)
    
    model.fit(features_train, target_train)
    predicted = model.predict(features_test)

    return f1_score(predicted, target_test, average='weighted')
 
def get_models():
    models, names, grids = list(), list(), list()
    
    #     n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)]
    max_features = ['auto', 'sqrt']
    max_depth = list(range(1, 16, 4))
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]

    grid_rfc = {
#         'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap,
    }
    
    models.append(RandomForestClassifier(n_estimators=50, random_state=42))
    names.append('RF')
    grids.append(grid_rfc)
    

    grid_linearDiscriminant = {
        'solver': ['svd', 'lsqr', 'eigen'],
        'shrinkage': np.arange(0, 1, 0.01)
    }
    
    grids.append(grid_linearDiscriminant)
    models.append(LinearDiscriminantAnalysis())
    names.append('LDA')
    
    
    grid_logistic = {
        'penalty' : ['l1', 'l2'],
        'C' : [0.1, 1, 10],
        'class_weight' : ['balanced', None],
    }
    
    grids.append(grid_logistic)
    models.append(LogisticRegression(random_state=42))
    names.append('LR')
    
    
    depth_range = range(1, 10)
    leaf_range = range(1,15)

    grid_dtc = {
        'max_depth': depth_range,
        'min_samples_leaf': leaf_range
    }
    
    grids.append(grid_dtc)
    models.append(DecisionTreeClassifier(min_samples_split=10, random_state=42))
    names.append('DT')
    
    return models, names, grids

def model_grid_search(X, y, model, grid_params):
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, random_state=42, test_size=0.25)
        
    scorer = make_scorer(f1_score, average='weighted')
    grid_search = RandomizedSearchCV(estimator=model, param_distributions=grid_params, cv=3, scoring=scorer, error_score=0)
    grid_result = grid_search.fit(features_train, target_train)
    best_estimator = grid_result.best_estimator_
    
    predicted = best_estimator.predict(features_test)
    
    score = f1_score(predicted, target_test, average='weighted')

    return [score, best_estimator, predicted]
 


In [2]:
data = pd.read_csv('new_data.csv')
data = data.set_index('hash_inn')
data = data[:40000]

In [3]:
train_data = data[data['okved2'] != -1]
test_data = data[data['okved2'] == -1]

test_data = test_data.drop(['okved2'], axis=1)

features = train_data.reset_index().drop(['okved2', 'hash_inn'], axis=1)
target = train_data['okved2']

In [7]:
features

Unnamed: 0,"('count', 0)","('count', 1)","('count', 2)","('count', 3)","('count', 4)","('count', 5)","('count', 6)","('count', 7)","('count', 8)","('count', 9)",...,"('week', 81).1","('week', 82).1","('week', 83).1","('week', 84).1","('week', 85).1","('week', 86).1","('week', 87).1","('week', 88).1","('week', 89).1","('week', 90).1"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,6.107103,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.391959,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26870,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,2.378141,0.0,0.0,0.0,0.0,0.0
26871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
26872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
26873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0


In [8]:
features_train, features_test, target_train, target_test = train_test_split(
    features, target, random_state=42, test_size=0.25)

### BASE LINE

In [9]:
f1_score([12 for _ in range(len(target_test))], target_test, average='weighted')

0.4414289028067734

In [10]:
model = LogisticRegression(random_state=42)
model.fit(features_train, target_train)
predicted = model.predict(features_test)

f1_score(predicted, target_test, average='weighted')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.4225461327750963

In [11]:
pd.Series(predicted).value_counts()

12    6152
34     284
14      89
52      27
4       23
20      16
60      16
39      13
57       9
24       6
55       6
26       6
59       5
40       5
67       5
8        5
11       4
6        4
47       4
7        4
22       3
68       3
61       3
78       3
21       3
75       3
56       2
65       2
41       2
3        2
27       2
76       1
9        1
29       1
53       1
10       1
62       1
31       1
0        1
dtype: int64

### Tuning

In [12]:
models, names, grids = get_models()

In [13]:
results = dict()

In [14]:
for i in range(len(models)):
    if names[i] not in results:
        scores = evaluate_model(features, target, models[i])
        results[names[i]] = scores
    else:
        scores = results[names[i]]
    print('>%s %.3f' % (names[i], np.mean(scores)))


>RF 0.290
>LDA 0.373


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


>LR 0.423
>DT 0.188


In [15]:
results

{'RF': 0.28973419496341385,
 'LDA': 0.3732831793174768,
 'LR': 0.4225461327750963,
 'DT': 0.18805789043948729}

In [16]:
results_grid = {}

In [17]:
for i in range(4):
    model, grid, name = models[i], grids[i], names[i]

    if name not in results_grid:
        scores, best_estimator, predicted = model_grid_search(features, target, model, grid)
        results_grid[name] = [scores, best_estimator, predicted]
    else:
        scores, best_estimator, predicted = results_grid[name]

    print('>%s %.3f' % (name, np.mean(scores)))

>RF 0.437


Traceback (most recent call last):
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/discriminant_analysis.py", line 463, in fit
    raise NotImplementedError('shrinkage not supported')
NotImplementedError: shrinkage not supported

Traceback (most recent call last):
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/discriminant_analysis.py", line 463, in fit
    raise NotImplementedError('shrinkage not supported')
NotImplementedError: shrinkage not supported

Traceback (most recent call last):
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_scor

Traceback (most recent call last):
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/discriminant_analysis.py", line 463, in fit
    raise NotImplementedError('shrinkage not supported')
NotImplementedError: shrinkage not supported

Traceback (most recent call last):
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/discriminant_analysis.py", line 463, in fit
    raise NotImplementedError('shrinkage not supported')
NotImplementedError: shrinkage not supported

Traceback (most recent call last):
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_scor

>LDA 0.418


Traceback (most recent call last):
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/magleb/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 

>LR 0.422
>DT 0.413


In [18]:
results_grid

{'RF': [0.4365369835642605,
  RandomForestClassifier(bootstrap=False, min_samples_leaf=2, n_estimators=50,
                         random_state=42),
  array([14, 12, 12, ..., 12, 12, 12])],
 'LDA': [0.41757785073492354,
  LinearDiscriminantAnalysis(shrinkage=0.81, solver='lsqr'),
  array([12, 12, 12, ..., 12, 12, 12])],
 'LR': [0.42173671396232815,
  LogisticRegression(C=0.1, random_state=42),
  array([12, 12, 12, ..., 12, 12, 12])],
 'DT': [0.41342095927566097,
  DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42),
  array([14, 12, 12, ..., 52, 12, 12])]}

In [28]:
model = RandomForestClassifier(max_features='sqrt', min_samples_leaf=15,
                         min_samples_split=4, n_estimators=200, random_state=42)
model.fit(features_train, target_train)
predicted = model.predict(features_test)
f1_score(predicted, target_test, average='weighted')

0.44384112549375554

### Prediction

In [3]:
data = pd.read_csv('new_data.csv')
data = data.set_index('hash_inn')

train_data = data[data['okved2'] != -1]
test_data = data[data['okved2'] == -1]

test_data = test_data.drop(['okved2'], axis=1)

features = train_data.reset_index().drop(['okved2', 'hash_inn'], axis=1)
target = train_data['okved2']

In [4]:
features

Unnamed: 0,"('count', 0)","('count', 1)","('count', 2)","('count', 3)","('count', 4)","('count', 5)","('count', 6)","('count', 7)","('count', 8)","('count', 9)",...,"('week', 81).1","('week', 82).1","('week', 83).1","('week', 84).1","('week', 85).1","('week', 86).1","('week', 87).1","('week', 88).1","('week', 89).1","('week', 90).1"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,6.107103,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.391959,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.070286,0.000000,0.0,0.0,0.0,0.0,0.0
161411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
161412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.062019,0.000000,0.0,0.0,0.0,0.0,0.0
161413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,3.976119,0.0,0.0,0.0,0.0,0.0


In [5]:
test_data

Unnamed: 0_level_0,"('count', 0)","('count', 1)","('count', 2)","('count', 3)","('count', 4)","('count', 5)","('count', 6)","('count', 7)","('count', 8)","('count', 9)",...,"('week', 81).1","('week', 82).1","('week', 83).1","('week', 84).1","('week', 85).1","('week', 86).1","('week', 87).1","('week', 88).1","('week', 89).1","('week', 90).1"
hash_inn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.000000,5.772733,0.0,5.322906,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
260487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.195624,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
260500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
260514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


In [6]:
model = RandomForestClassifier(max_features='sqrt', min_samples_leaf=15,
                         min_samples_split=4, n_estimators=70, random_state=42)
model.fit(features, target)
predicted = model.predict(test_data)
predicted

array([12, 12, 12, ..., 12, 12, 12])

In [9]:
test_data.index

Int64Index([     0,      2,      4,      6,      9,     12,     15,     19,
                22,     38,
            ...
            260475, 260476, 260482, 260483, 260484, 260485, 260487, 260500,
            260514, 260515],
           dtype='int64', name='hash_inn', length=78654)

In [11]:
resulted_df = pd.DataFrame(index=test_data.index, data=predicted)
resulted_df = resulted_df.reset_index()
resulted_df.columns = ['hash_inn', 'okved2']
resulted_df

Unnamed: 0,hash_inn,okved2
0,0,12
1,2,12
2,4,12
3,6,12
4,9,12
...,...,...
78649,260485,12
78650,260487,12
78651,260500,12
78652,260514,12


In [15]:
resulted_df.to_csv('MaksimovGleb-SberbankIndustry.csv', index=False)