In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy.sparse import load_npz
from sklearn.decomposition import PCA

In [29]:
X_tbl_train = pd.read_csv('train_df.csv').drop('AdoptionSpeed', axis=1)
X_tbl_test = pd.read_csv('test_df.csv')
X_img_train = pd.read_csv('train_img_df2.csv')
X_des_train = pd.DataFrame(load_npz('train_desc_df.npz').toarray())
X_des_test = pd.DataFrame(load_npz('test_desc_df.npz').toarray())
X_meta_train = pd.read_csv('train_meta_df.csv')
X_meta_test = pd.read_csv('test_meta_df.csv')
X_stm_train = pd.read_csv('train_stm_df.csv')
X_stm_test = pd.read_csv('test_stm_df.csv')
y_train = pd.read_csv('train_df.csv')['AdoptionSpeed'].astype('int')

In [3]:
# import pickle

# file_train = open('train_desc_df.pkl', 'rb')
# X_des_train = pickle.load(file_train)
# file_train.close()

# file_test = open('test_desc_df.pkl', 'rb')
# X_des_test = pickle.load(file_test)
# file_test.close()

In [3]:
X_meta_train.text_annot = X_meta_train.text_annot.astype('float32')
X_meta_test.text_annot = X_meta_test.text_annot.astype('float32')

In [26]:
X_tbl_train.shape, X_des_train.shape, X_meta_train.shape, X_stm_train.shape

((14993, 21), (14993, 12991), (14652, 4), (14442, 4))

In [30]:
X_train = pd.concat([X_tbl_train, X_des_train], axis=1)
X_train = pd.merge(X_train, X_meta_train, how='outer', on='PetID')
X_train = pd.merge(X_train, X_img_train, how='left', on='PetID')
X_train = pd.merge(X_train, X_stm_train, how='outer', on='PetID').drop('PetID', axis=1).fillna(0)

In [25]:
X_train.shape

(14993, 13142)

In [38]:
pca = PCA(n_components=50)

In [39]:
X_reduced_train = pca.fit_transform(X_train)
# X_reduced_test = pca.transform(X_test)
pca.explained_variance_ratio_.sum()

0.9509887516520821

### Test model

In [58]:
import time
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [49]:
X_training, X_valid, y_training, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

#### SVC

In [15]:
start = time.process_time()

In [16]:
model_svc = SVC(probability=True)
model_svc.fit(X_training, y_training)
predicts_svc = model_svc.predict(X_valid)
accuracy_score(predicts_svc, y_valid)

0.3841280426808936

In [17]:
cohen_kappa_score(predicts_svc, y_valid, weights='quadratic')

0.29720426470259464

In [18]:
time.process_time() - start

84.90625

#### Logistic Regression

In [19]:
start = time.process_time()

In [20]:
model_lg = LogisticRegression()
model_lg.fit(X_training, y_training)
predicts_lg = model_lg.predict(X_valid)
accuracy_score(predicts_lg, y_valid)

0.3484494831610537

In [21]:
cohen_kappa_score(predicts_lg, y_valid, weights='quadratic')

0.21927543190867183

In [22]:
time.process_time() - start

0.84375

#### Random Forest

In [60]:
start = time.process_time()

In [61]:
model_rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
model_rf.fit(X_training, y_training)
predicts_rf = model_rf.predict(X_valid)
accuracy_score(predicts_rf, y_valid)

0.37912637545848615

In [62]:
cohen_kappa_score(predicts_rf, y_valid, weights='quadratic')

0.2807764420865635

In [63]:
time.process_time() - start

3.78125

#### Light GBM

In [32]:
start = time.process_time()

In [33]:
model_lgb = LGBMClassifier(random_state=0, learning_rate=0.1)
model_lgb.fit(X_training, y_training)
predicts_lgb = model_lgb.predict(X_valid)
accuracy_score(predicts_lgb, y_valid)

0.43014338112704237

In [34]:
cohen_kappa_score(predicts_lgb, y_valid, weights='quadratic')

0.3794500871344002

In [35]:
time.process_time() - start

366.71875

#### Voting

In [8]:
lg = LGBMClassifier(random_state=0, learning_rate=0.1)
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
sv = SVC(probability=True)
voting_clf = VotingClassifier(estimators=[('lg', lg), ('rf', rf), ('sv', sv)], voting='soft')
pipe = make_pipeline(pca, voting_clf)

In [10]:
%%time
score = cross_val_score(pipe, X_train, y_train, cv=3)

Wall time: 3min 24s


In [12]:
score.mean()

0.3819112729475755

In [15]:
pipe.fit(X_training, y_training)

Pipeline(steps=[('pca', PCA(n_components=21)),
                ('votingclassifier',
                 VotingClassifier(estimators=[('lg',
                                               LGBMClassifier(random_state=0)),
                                              ('rf',
                                               RandomForestClassifier(max_depth=10,
                                                                      random_state=1)),
                                              ('sv', SVC(probability=True))],
                                  voting='soft'))])

In [17]:
predicts_vote = pipe.predict(X_valid)
cohen_kappa_score(predicts_vote, y_valid, weights='quadratic')

0.3000845221393976

#### Tensorflow

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf
tf.random.Generator = None  # Patch for a bug
import tensorflow_addons as tfa

 The versions of TensorFlow you are currently using is 2.1.0-rc0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [19]:
y_train_tf = np.eye(5)[y_training]
y_valid_tf = np.eye(5)[y_valid]

In [21]:
model = Sequential([Dense(128, activation='relu', input_shape=[X_train.shape[1]]),
                    Dropout(0.2),
                    Dense(64, activation='relu'),
                    Dropout(0.2),
                    Dense(5, activation='softmax')
])
model.compile(optimizer='adam', loss=tfa.losses.kappa_loss.WeightedKappaLoss(num_classes=5))

In [22]:
history = model.fit(X_training, y_train_tf, batch_size=32, epochs=10, validation_data=[X_valid, y_valid_tf])

Train on 11994 samples, validate on 2999 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
pred = model.predict(X_valid)
y_pred = np.argmax(pred, axis=1)
y_val = np.argmax(y_valid_tf, axis=1)

In [24]:
accuracy_score(y_pred, y_val)

0.37912637545848615

In [25]:
cohen_kappa_score(y_pred, y_val, weights='quadratic')

0.3434092307671015

### Prediction

In [331]:
Pet_id = pd.read_csv('test/test.csv').PetID

In [333]:
prediction = model_lgb.predict(X_test)
submission = pd.DataFrame({'PetID':Pet_id, 'AdoptionSpeed':[int(i) for i in prediction]})
submission

Unnamed: 0,PetID,AdoptionSpeed
0,e2dfc2935,4
1,f153b465f,3
2,3c90f3f54,1
3,e02abc8a3,4
4,09f0df7d1,4
...,...,...
3967,ae57f8d52,4
3968,83432904d,4
3969,399013029,4
3970,fd80b8c80,4


In [None]:
#submission.to_csv('submission.csv', index=False)