In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

Load data

In [54]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

Set aside target values for the training

In [24]:
train_label = train['target'].values
train = train.drop(['target'], axis=1)

Combine data

In [25]:
train_test = pd.concat([train,test]).reset_index(drop=True)  #Because pd creates dupplicate indexes we add this argument

Create a list of columns that we want to rescale using the standard scaler from sklearn

In [27]:
cols = [c for c in train.columns if c not in ['id', 'target', 'wheezy-copper-turtle-magic']]

In [28]:
# Use standard scaler to scale the data appropriately so the model can learn the patterns of each feature correctly.
# source: https://www.kaggle.com/ilu000/instagrat-lgbm-baseline
scaler = StandardScaler()

In [29]:
train_test[cols] = scaler.fit_transform(train_test[cols])

Clean this weird data set id column => wheezy-copper-turtle-magic

In [31]:
train_test['wheezy-copper-turtle-magic'] = train_test['wheezy-copper-turtle-magic'].astype('category')

In [32]:
featues_to_use = [c for c in train.columns if c not in ['id', 'target']]

In [46]:
train = train[featues_to_use]
test = test[featues_to_use]

First classifier - logistic regression

In [64]:
classifier = LogisticRegression(C=1, solver='sag')

In [47]:
cv_score = np.mean(cross_val_score(classifier, train, train_label, cv=3, scoring='roc_auc'))

In [48]:
cv_score

0.5295279287846639

In [50]:
# # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS I
# # Apparently you must train a specific model for each wheezy.

In [55]:
train_1 = train[train['wheezy-copper-turtle-magic']==1]

In [62]:
train_1 = train_1.drop(['wheezy-copper-turtle-magic'], axis=1)

In [57]:
train_1_label = train_1['target'].values
train_1 = train_1.drop(['target'], axis=1)
train_1 = train_1.drop(['id'], axis=1)

In [65]:
cv_score_1 = np.mean(cross_val_score(classifier, train_1, train_1_label, cv=3))



Unnamed: 0,muggy-smalt-axolotl-pembus,dorky-peach-sheepdog-ordinal,slimy-seashell-cassowary-goose,snazzy-harlequin-chicken-distraction,frumpy-smalt-mau-ordinal,stealthy-beige-pinscher-golden,chummy-cream-tarantula-entropy,hazy-emerald-cuttlefish-unsorted,nerdy-indigo-wolfhound-sorted,leaky-amaranth-lizard-sorted,...,goopy-lavender-wolverine-fimbus,wheezy-myrtle-mandrill-entropy,wiggy-lilac-lemming-sorted,gloppy-cerise-snail-contributor,woozy-silver-havanese-gaussian,jumpy-thistle-discus-sorted,muggy-turquoise-donkey-important,blurry-buff-hyena-entropy,bluesy-chocolate-kudu-fepid,gamy-white-monster-expert
550,-0.652685,-0.320448,-0.990136,0.123433,0.872434,-0.976576,0.986943,-0.654385,0.731564,0.285962,...,0.123676,-0.751551,2.214634,0.036100,0.559592,-0.196727,-0.045499,-0.875327,0.949601,0.532729
793,1.100477,0.146391,0.548201,-0.743063,-0.776455,0.784679,2.884915,4.547408,-1.348984,-0.528185,...,-1.573899,0.127049,-0.591817,-0.694301,-0.356082,-0.689719,-0.164489,0.595378,-0.020808,0.527682
1218,1.603370,1.622184,-1.537069,-0.232444,-0.123593,1.186707,-0.748651,4.826237,-2.024633,-1.603305,...,-0.091512,1.463451,0.308423,-1.266921,-1.025875,-0.894283,-1.959455,-0.355648,0.903022,1.032162
1276,-2.773063,-1.316827,1.019951,-0.155688,0.262493,0.118541,-0.536831,1.759811,1.169555,-1.531238,...,0.835824,-0.013601,0.455910,0.700794,-1.073916,-0.359614,0.141195,1.320309,-0.146764,-0.568746
1300,-2.013425,-0.805431,-0.294763,1.629098,-0.013835,0.075623,0.304643,0.027752,-0.959656,-0.989764,...,-1.006383,0.279145,0.675924,-0.492643,0.476203,-0.996044,1.399797,-0.157206,0.015506,1.122774
3558,-1.419609,0.517560,-0.529379,1.229631,0.205109,0.885108,0.050205,-3.710216,-1.104604,2.153404,...,-1.882369,0.210470,-1.397314,-0.410898,-2.080672,-2.453882,-0.126984,-0.255028,0.816611,-0.835924
3932,0.564709,-0.209083,0.064238,0.823393,1.325331,1.035421,0.017893,-2.862382,1.625803,1.285989,...,-0.190572,0.688652,-0.903793,0.632685,-0.362252,0.720990,-0.802499,0.560101,-0.816889,-0.145564
4430,-0.538677,1.178709,0.447925,0.099682,0.843425,0.979992,-1.121682,-0.910071,0.115298,0.273583,...,0.959329,0.421084,-1.529724,-0.424046,-1.591713,0.367157,0.692647,-0.331310,-1.629584,3.169674
4584,-0.038064,-1.064747,0.668884,0.921696,1.438688,0.035187,-0.060923,4.580616,-0.806661,-1.579913,...,-1.240047,-1.741452,-0.245398,1.492197,0.523525,-0.026419,-1.074696,-0.179314,-0.005342,1.471605
5248,-0.317600,0.553256,0.832745,-0.550301,1.356070,0.112474,-0.022811,0.783529,0.917175,-0.017520,...,-1.211252,-1.973426,0.396940,-0.947292,1.750175,1.030208,-1.077848,-0.645586,-0.054977,-0.157199


In [124]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=7).fit(train_1)
new_train_1 = pd.DataFrame(sel.transform(train_1))

In [134]:
new_train_1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,-0.654385,-1.32854,-3.298451,5.713286,-0.249997,3.48358,1.607351,4.008529,-5.064177,5.043226,...,-2.189854,0.814339,-2.446194,-4.585459,-8.365499,-1.06559,-2.837308,0.779924,-6.48672,-4.332438
1,4.547408,-7.797243,-4.016878,6.334253,5.651362,5.507575,1.592396,-3.457994,-4.89657,-0.922218,...,3.983167,1.956918,-9.584882,0.162324,-4.772068,7.322311,5.18551,-1.907479,1.264691,1.995123
2,4.826237,2.809694,0.856758,-8.284055,1.456224,0.224991,3.020042,3.577543,-1.736894,-6.713843,...,1.450084,-3.372936,-6.217618,-1.78851,-1.994585,0.998401,-1.813241,3.069175,5.627985,-5.40732
3,1.759811,-8.578211,-1.006939,7.28555,1.170394,3.077543,2.543627,-6.278081,6.347096,-3.20129,...,-0.150275,2.250235,-2.815026,-2.626817,-2.600445,0.89032,-2.067623,0.6217,-2.506708,-1.599139
4,0.027752,4.356375,-1.539064,3.553738,-0.229643,-2.997083,-5.081439,-2.933508,-1.912333,2.264037,...,-1.429352,-2.468143,-0.77077,-2.2452,-3.786541,-2.687195,6.058062,5.916246,-0.162224,-3.848169


In [131]:
from sklearn.decomposition import PCA
new_train_2 = pd.DataFrame(PCA(n_components=40, random_state=4).fit_transform(train_1))

In [133]:
new_train_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-1.356456,-4.264247,-0.508695,0.456618,-6.271778,-6.436266,10.096511,10.148206,-0.015727,-3.350215,...,-2.825606,-1.118587,0.022536,-3.310815,-0.745249,2.036294,-3.249551,1.369388,-0.479212,1.395594
1,1.939933,-4.946836,3.841274,-2.496783,-6.11058,-8.508848,8.846071,-4.379495,8.512449,-4.391139,...,-0.807503,2.598583,-1.352242,2.342954,-1.20063,-0.881041,-0.091498,-1.76996,0.705192,0.030894
2,2.894082,-7.753619,4.535039,0.706786,2.829056,2.236868,2.003342,-2.704269,-1.222243,1.796265,...,-0.336264,-0.882555,1.774876,0.854167,-0.533558,1.951921,-1.459788,1.240819,-1.236243,2.601314
3,1.180745,1.651726,3.118797,-4.214485,0.44668,-0.138143,4.065859,-3.183748,8.771928,-6.986183,...,0.7152,-0.2246,-0.134639,3.322224,0.682937,-0.583827,-0.021389,1.818399,0.23236,-2.031258
4,-4.226382,7.42301,1.054076,1.613187,-0.078899,1.837412,6.418423,-1.273857,5.772864,6.35063,...,1.656027,0.232804,1.55241,0.792169,-0.031092,0.763476,1.817096,0.461203,-3.127337,1.769343


In [136]:
new_train_3 = pd.concat([new_train_1,new_train_2],axis=1).reset_index(drop=True)

In [137]:
new_train_3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-0.654385,-1.328540,-3.298451,5.713286,-0.249997,3.483580,1.607351,4.008529,-5.064177,5.043226,...,-2.825606,-1.118587,0.022536,-3.310815,-0.745249,2.036294,-3.249551,1.369388,-0.479212,1.395594
1,4.547408,-7.797243,-4.016878,6.334253,5.651362,5.507575,1.592396,-3.457994,-4.896570,-0.922218,...,-0.807503,2.598583,-1.352242,2.342954,-1.200630,-0.881041,-0.091498,-1.769960,0.705192,0.030894
2,4.826237,2.809694,0.856758,-8.284055,1.456224,0.224991,3.020042,3.577543,-1.736894,-6.713843,...,-0.336264,-0.882555,1.774876,0.854167,-0.533558,1.951921,-1.459788,1.240819,-1.236243,2.601314
3,1.759811,-8.578211,-1.006939,7.285550,1.170394,3.077543,2.543627,-6.278081,6.347096,-3.201290,...,0.715200,-0.224600,-0.134639,3.322224,0.682937,-0.583827,-0.021389,1.818399,0.232360,-2.031258
4,0.027752,4.356375,-1.539064,3.553738,-0.229643,-2.997083,-5.081439,-2.933508,-1.912333,2.264037,...,1.656027,0.232804,1.552410,0.792169,-0.031092,0.763476,1.817096,0.461203,-3.127337,1.769343
5,-3.710216,5.553852,1.256200,-3.706618,0.790527,-4.849041,-5.849777,1.463909,7.327714,4.742353,...,-4.395186,0.047306,2.450123,4.156287,-2.052297,-0.387037,3.618395,-1.349262,0.595469,-1.706812
6,-2.862382,6.624007,-2.709429,-4.019369,0.026997,5.416191,1.523158,-0.697108,-1.154344,-1.622215,...,0.069691,3.717026,-0.963300,1.011568,2.640306,0.781386,-0.536367,-1.280603,0.165923,-0.885150
7,-0.910071,-0.093873,0.419784,-0.517814,5.988956,-0.522689,6.519285,-0.916194,-7.212979,-3.713302,...,-0.226142,-0.063119,0.014078,-3.233092,0.303956,0.556552,0.957916,1.984120,-0.577600,2.685434
8,4.580616,-3.388254,-0.030388,3.180992,3.435138,-3.937973,0.265639,3.127890,-1.901080,-1.516726,...,-1.157768,2.612081,1.247394,1.372093,-0.844260,-0.024201,-0.316926,-2.392030,-0.790910,-0.045368
9,0.783529,-5.000124,-4.044488,3.502862,5.525613,1.724646,-0.213933,-0.542864,5.443666,-3.085819,...,2.345778,-2.085672,3.711725,-0.187763,-1.706790,0.855540,1.601415,1.172455,1.055022,-1.019902


In [140]:
from sklearn.svm import SVC

In [142]:
scm_model2 = SVC(probability=True,kernel='poly',degree=4,gamma='auto')
svm_score2 = np.mean(cross_val_score(svm_model, new_train_3, train_1_label, cv=3))
svm_score2

0.8764705882352941

In [138]:
svm_model = svm.SVC(gamma='scale', decision_function_shape='ovo',random_state=17)
svm_score = np.mean(cross_val_score(svm_model, new_train_3, train_1_label, cv=3))

In [139]:
svm_score

0.8764705882352941

In [73]:
mlp_model = MLPClassifier()
mlp_score = np.mean(cross_val_score(mlp_model, train_1, train_1_label, cv=3))

In [74]:
mlp_score

0.7568627450980392