<a href="https://colab.research.google.com/github/lblum95/AML/blob/master/task1-Jannik.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AML first project

## Data ingestion

### Connect to drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import all data libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold

### Import data

In [3]:
x_train = pd.read_csv("data/X_train.csv",index_col=0,header = 0)
y_train = pd.read_csv("data/y_train.csv",index_col=0,header = 0)
x_test = pd.read_csv("data/X_test.csv",index_col=0,header = 0)

### Data cleaning


Drop constant columns

In [4]:
x_train = x_train.loc[:,x_train.apply(pd.Series.nunique) != 1]
x_test = x_test.loc[:,x_test.apply(pd.Series.nunique) != 1]
print(x_train.shape)

(1212, 828)




Fill mean in NaNs

In [5]:
imp = IterativeImputer(n_nearest_features=10)
x_train = imp.fit_transform(x_train, y_train)
x_test = imp.transform(x_test)
print(x_train.shape)

(1212, 828)


Outlier detection

In [6]:
iso =IsolationForest(contamination=0.04).fit(x_train,y_train)
clfTrain = iso.predict(x_train)
clfTest = iso.predict(x_test)
print(x_train.shape)

(1212, 828)


Scale data

In [7]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
print(x_train.shape)

(1212, 828)


## Feature selection

### import all libraries

In [8]:
from sklearn.feature_selection import SelectKBest, f_regression


### Use KBest to get features

In [9]:
kbest = SelectKBest(f_regression,k=92).fit(x_train,y_train.values.ravel())
x_train = kbest.transform(x_train)
x_test = kbest.transform(x_test)
print(x_train.shape)

(1212, 92)


## Training model

Import libraries

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from tensorflow import keras
from tensorflow.keras import optimizers, losses, activations, models
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Input, Dropout

### SVR

In [13]:
fold = KFold(n_splits = 5)
for train_index, test_index in fold.split(x_train):
  x_trainCV = x_train[train_index]
  x_testCV = x_train[test_index]
  y_trainCV = y_train.values[train_index]
  y_testCV = y_train.values[test_index]
  regr = SVR(C=46.415,max_iter =100000)
  regr.fit(x_trainCV, y_trainCV.ravel())
  y_pred = regr.predict(x_testCV)
  print(r2_score(y_testCV,y_pred))

0.7141245944961871
0.6167773662486944
0.5643071691000952
0.5267176426853076
0.6426405496184833


### Neural Net

Build model

In [68]:
def get_model(width):
    inp = Input(shape=(width))
    dense_1 = Dense(width, activation=activations.selu,input_dim=width, name="dense_1")(inp)
    dense_1 = Dropout(rate=0.2)(dense_1)
    dense_1 = Dense(width, activation=activations.selu, name="dense_2")(dense_1)
    dense_1 = Dropout(rate=0.2)(dense_1)
    dense_1 = Dense(width, activation=activations.selu, name="dense_3")(dense_1)
    dense_1 = Dropout(rate=0.2)(dense_1)
    dense_1 = Dense(width, activation=activations.selu, name="dense_4")(dense_1)
    dense_1 = Dense(1, name="output")(dense_1)
    model = models.Model(inputs=inp, outputs=dense_1)
    opt = optimizers.Adam(0.01)
    model.compile(optimizer=opt, loss=losses.mean_squared_error, metrics=['mse'])
    model.summary()
    return model
model=get_model(x_train.shape[1])
early = EarlyStopping(monitor="val_mse", patience=10, verbose=1)
redonplat = ReduceLROnPlateau(monitor="val_mse", patience=5, verbose=2)
callbacks_list = [early, redonplat]

Model: "functional_56"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_31 (InputLayer)        [(None, 92)]              0         
_________________________________________________________________
dense_1 (Dense)              (None, 92)                8556      
_________________________________________________________________
dropout_73 (Dropout)         (None, 92)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 92)                8556      
_________________________________________________________________
dropout_74 (Dropout)         (None, 92)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 92)                8556      
_________________________________________________________________
dropout_75 (Dropout)         (None, 92)              

### Fit model
Test overfitted loss

In [None]:
model.fit(np.asarray(x_train), np.asarray(y_train), epochs=1000, verbose=2, callbacks=callbacks_list, validation_split=0.2)
y_pred = model.predict(x_train)
print(r2_score(y_train,y_pred))

## Write to file

In [69]:
y_pred=model.predict(x_test)
df = pd.DataFrame(y_pred)
df.to_csv('keras',header = ['y'], index_label = 'id')