In [3]:
import pandas as pd
import numpy as np
import json
from shapely.geometry import shape
from sklearn.neighbors import KDTree

df = pd.read_csv('train_dataset_train.csv')
df['x'] = df['.geo'].apply(lambda geo: shape(json.loads(geo)).centroid.x)
df['y'] = df['.geo'].apply(lambda geo: shape(json.loads(geo)).centroid.y)
nd_threshold = 0.29
for c in filter(lambda col: col[:2] == 'nd', df.columns):
    df[f'{c}>{nd_threshold}'] = df[c] > nd_threshold
query = KDTree(df[['x', 'y']]).query(df[['x', 'y']], k=3)
for i in range(2):
    neighbor1h = pd.get_dummies(df['crop'].values[query[1][:, i + 1]], prefix='neighbor')
    for j in range(7):
        df[f'neighbor_{i}_crop_{j}'] = neighbor1h[f'neighbor_{j}'] / query[0][:, i + 1]
for i in range(7):
    df[f'distance_to_crop_{i}'] = KDTree(df.loc[df['crop'] == i, ['x', 'y']]).query(df[['x', 'y']], k=2)[0][:, 1]
    
df.shape

(4830, 167)

In [4]:
X_train = df.drop(columns=['id', '.geo', 'area', 'crop'])
y_train = df['crop']
X_train.shape

(4830, 163)

In [5]:
from sklearn.base import TransformerMixin

class FeatureSelector(TransformerMixin):
    def __init__(self, features):
        self.features = features
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.features]

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

model = StackingClassifier(
    estimators=[
        ('lr', Pipeline([
            ('scaler', StandardScaler()),
            ('model', LogisticRegression(C=0.6, penalty='l1', solver='liblinear', random_state=42))
        ])),
        ('svc', Pipeline([
            ('scaler', StandardScaler()),
            ('model', SVC(C=6, random_state=42))
        ])),
        ('naive_gauss', Pipeline([
            ('scaler', StandardScaler()),
            ('model', GaussianNB(var_smoothing=0.1))
        ])),
        ('naive_multinominal', Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', MultinomialNB(alpha=0.4))
        ])),
        ('naive_bernoulli', Pipeline([
            ('fs', FeatureSelector(list(filter(lambda col: col[:2] == 'nd' or 'neighbor' in col, X_train.columns)))),
            ('scaler', MinMaxScaler()),
            ('model', BernoulliNB(binarize=0.3, alpha=1e-5))
        ])),
        ('random_forest', RandomForestClassifier(n_estimators=300, random_state=42)),
        ('mlp', Pipeline([
            ('scaler', StandardScaler()),
            ('model', MLPClassifier(random_state=42))
        ])),
        ('extra_trees', ExtraTreesClassifier(n_estimators=500, random_state=42)),
        ('catboost', CatBoostClassifier(n_estimators=300, depth=6, random_state=42, verbose=False)),
        ('lightgbm_gbdt', Pipeline([
            ('fs', FeatureSelector(list(filter(lambda col: 'neighbor_1' not in col, X_train.columns)))),
            ('model', LGBMClassifier(random_state=42))
        ])),
        ('lightgbm_dart', LGBMClassifier(boosting_type='dart', n_estimators=300, random_state=42)),
        ('lightgbm_goss', Pipeline([
            ('fs', FeatureSelector(list(filter(lambda col: 'neighbor_1' not in col, X_train.columns)))),
            ('model', LGBMClassifier(boosting_type='goss', num_leaves=10, learning_rate=0.3, random_state=42))
        ])),
        ('knn', Pipeline([
            ('scaler', StandardScaler()),
            ('model', KNeighborsClassifier(n_neighbors=8, weights='distance', p=1))
        ]))
    ],
    final_estimator=CatBoostClassifier(depth=2, random_state=42, verbose=False)
)

model

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score

def cv(model, X, y):
    kf = StratifiedKFold(shuffle=True, random_state=42)
    scores = []
    for train_index, test_index in kf.split(X, y):
        model.fit(X.iloc[train_index], y.iloc[train_index])
        scores.append(recall_score(y.iloc[test_index], model.predict(X.iloc[test_index]), average="macro", zero_division=0))
    return np.mean(scores)

In [14]:
cv(model, X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.9837393822681324

In [15]:
model.fit(X_train, y_train)

In [16]:
df_test = pd.read_csv('test_dataset_test.csv')
df_test['x'] = df_test['.geo'].apply(lambda geo: shape(json.loads(geo)).centroid.x)
df_test['y'] = df_test['.geo'].apply(lambda geo: shape(json.loads(geo)).centroid.y)
nd_threshold = 0.29
for c in filter(lambda col: col[:2] == 'nd', df_test.columns):
    df_test[f'{c}>{nd_threshold}'] = df_test[c] > nd_threshold
query = KDTree(df[['x', 'y']]).query(df_test[['x', 'y']], k=2)
for i in range(2):
    neighbor1h = pd.get_dummies(df['crop'].values[query[1][:, i]], prefix='neighbor')
    for j in range(7):
        df_test[f'neighbor_{i}_crop_{j}'] = neighbor1h[f'neighbor_{j}'] / query[0][:, i]
for i in range(7):
    df_test[f'distance_to_crop_{i}'] = KDTree(df.loc[df['crop'] == i, ['x', 'y']]).query(df_test[['x', 'y']], k=1)[0][:, 0]
    
df_test.shape

(2071, 166)

In [17]:
X_test = df_test.drop(columns=['id', '.geo', 'area'])
X_test.shape

(2071, 163)

In [18]:
prediction = pd.read_csv('sample_solution.csv')
prediction['crop'] = model.predict(X_test)
prediction.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,id,crop
0,611,3
1,6417,2
2,3352,3
3,4224,1
4,3102,6


In [19]:
prediction.to_csv('final_prediction.csv', index=False)