In [162]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import KFold

from sklearn.multiclass import OneVsRestClassifier

from scipy.spatial.distance import cdist

In [76]:
def get_output(path="processed_tags.csv"):
    """Load a tag table from CSV and return it as a plain array.

    Parameters
    ----------
    path : str, optional
        CSV file to read. It must contain a ``Name`` column, which is
        discarded; every other column is kept in file order. Defaults to
        ``"processed_tags.csv"`` so existing zero-argument calls behave
        as before.

    Returns
    -------
    numpy.ndarray
        One row per CSV row, one column per non-``Name`` column.

    Raises
    ------
    KeyError
        If the file has no ``Name`` column (raised by ``DataFrame.drop``).
    """
    tags = pd.read_csv(path)
    # Drop by label and return a new frame instead of mutating in place.
    return np.array(tags.drop(["Name"], axis=1))

In [77]:
def _read_documents(directory, count):
    """Read ``0.txt`` .. ``<count-1>.txt`` from ``directory``, one string per file."""
    import os  # local import: the notebook's import cell does not bring in ``os``

    documents = []
    for i in range(count):
        # ``with`` closes each handle; the files are opened one at a time.
        with open(os.path.join(directory, str(i) + ".txt")) as handle:
            # Collapse all whitespace (including newlines) so every file
            # yields exactly one document and row i always belongs to file i.
            documents.append(' '.join(handle.read().split()))
    return documents


def get_data(train_dir="features_train_prepossesing/", n_train=10000,
             test_dir="features_test_prepossesing/", n_test=2000):
    """Build dense TF-IDF matrices for the train and test text files.

    Parameters
    ----------
    train_dir, test_dir : str, optional
        Directories holding the numbered ``<i>.txt`` files.
    n_train, n_test : int, optional
        Number of files to read from each directory (``0.txt`` upwards).

    Returns
    -------
    X_train : numpy.ndarray
        TF-IDF weights of the training files, one row per file.
    X_test : numpy.ndarray
        TF-IDF weights of the test files, computed with the vectorizer
        fitted on the training files.
    unique_words : list of str
        The vocabulary; ``unique_words[j]`` labels column ``j`` of both
        matrices.
    """
    train_lines = _read_documents(train_dir, n_train)

    # Sort the vocabulary: iterating a bare ``set`` of strings gives an order
    # that can change between interpreter runs, which would silently permute
    # the feature columns.
    unique_words = sorted({word for line in train_lines for word in line.split()})

    count_vect = TfidfVectorizer(vocabulary=unique_words)
    X_train = count_vect.fit_transform(train_lines).toarray()

    test_lines = _read_documents(test_dir, n_test)
    X_test = count_vect.transform(test_lines).toarray()

    return X_train, X_test, unique_words

In [83]:
def _top_k_columns(row, k):
    """Return the column indices of the ``k`` largest values of ``row``, largest first."""
    return row.argsort()[-k:][::-1]


def _weights_for_tags(X, unique_words, tag_position, k):
    """Build one row per sample holding the weights of its top-``k`` words that are known tags."""
    rows = []
    for arr in X:
        # Integer zeros as the filler, so untouched cells serialise as before.
        row = [0] * len(tag_position)
        for i in _top_k_columns(arr, k):
            position = tag_position.get(unique_words[i])
            if position is not None:  # words that are not tags are skipped
                row[position] = arr[i]
        rows.append(row)
    return rows


def make_better_tags(X_train, X_test, unique_words, k):
    """Reduce each sample to the weights of its ``k`` heaviest words and save as CSV.

    The tag set is the union, in first-seen order, of the top-``k`` words of
    every training row. Each train and test row is then written as a vector
    over that tag set: a tag's cell holds the row's weight for that word when
    the word is among the row's own top ``k``, and ``0`` otherwise.

    Note that the top ``k`` are taken by rank only, so a row with fewer than
    ``k`` non-zero weights still contributes zero-weight words to the tag set.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        2-D weight matrices whose column ``j`` corresponds to
        ``unique_words[j]``.
    unique_words : sequence of str
        Word for every column of the matrices.
    k : int
        How many of the heaviest words to keep per row; must be at least 1.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 (``[-0:]`` would select every column).

    Side effects
    ------------
    Writes ``better_train_<k>_tags.csv`` and ``better_test_<k>_tags.csv`` to
    the working directory and prints the row and column counts.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))

    # Ordered tag list plus a word -> column lookup, so membership tests are
    # constant time and column placement does not depend on dict ordering.
    unique_tags = []
    tag_position = {}
    for arr in X_train:
        for i in _top_k_columns(arr, k):
            word = unique_words[i]
            if word not in tag_position:
                tag_position[word] = len(unique_tags)
                unique_tags.append(word)

    print(len(unique_tags))

    train_rows = _weights_for_tags(X_train, unique_words, tag_position, k)
    print(len(train_rows))
    print(len(unique_tags))  # row width; safe even when there are no rows

    pd.DataFrame(train_rows, columns=unique_tags).to_csv("better_train_{}_tags.csv".format(k),
                                                         index=False)

    test_rows = _weights_for_tags(X_test, unique_words, tag_position, k)
    print(len(test_rows))
    print(len(unique_tags))

    print(len(unique_tags))

    pd.DataFrame(test_rows, columns=unique_tags).to_csv("better_test_{}_tags.csv".format(k),
                                                        index=False)

## Test models

In [186]:
from sklearn.ensemble import RandomForestClassifier

def test_rf(X_train, y, folds=5, n_estimators=50):
    """Cross-validate a one-vs-rest random forest and report its mean score.

    Parameters
    ----------
    X_train : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Target matrix, indexed by row in step with ``X_train``.
    folds : int, optional
        Number of consecutive (unshuffled) K-fold splits. Defaults to 5.
    n_estimators : int, optional
        Trees per forest. Defaults to 50.

    Returns
    -------
    float
        Mean of the per-fold ``clf.score`` values. For a multilabel ``y``
        scikit-learn documents this score as subset accuracy (every label of
        a sample must match), so low values are expected.
    """
    splitter = KFold(n_splits=folds)
    accuracy_log = []

    for train_index, test_index in splitter.split(X_train, y):
        print("SPLIT")
        x_train, x_test = X_train[train_index], X_train[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # One forest per target column; a fixed seed keeps folds comparable.
        clf = OneVsRestClassifier(
            RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=0))
        clf.fit(x_train, y_train)

        accuracy_log.append(clf.score(x_test, y_test))
        print('RF accuracy:', accuracy_log[-1])

    mean_accuracy = np.mean(accuracy_log)
    print("Results")
    print("The average accuracy Random Forest is {}".format(mean_accuracy))
    # Returned as well as printed so callers can compare configurations.
    return mean_accuracy

In [181]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

def test_nn(X_train, y, folds=3, epochs=300, batch_size=100):
    """Cross-validate a one-hidden-layer network and report its mean accuracy.

    Parameters
    ----------
    X_train : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        2-D target matrix; its column count sets the output layer width.
    folds : int, optional
        Number of consecutive (unshuffled) K-fold splits. Defaults to 3.
    epochs : int, optional
        Training epochs per fold. Defaults to 300.
    batch_size : int, optional
        Mini-batch size. Defaults to 100.

    Returns
    -------
    float
        Mean over folds of the accuracy metric reported by ``model.evaluate``
        on the held-out part.

    Notes
    -----
    NOTE(review): ``init=`` and ``nb_epoch=`` are the Keras 1 argument names;
    they are kept because the Keras version in use is not visible here.
    NOTE(review): softmax + categorical cross-entropy assumes one class per
    row. If ``y`` is a multi-hot tag matrix, sigmoid + binary cross-entropy
    may be the better fit -- confirm before changing.
    """
    splitter = KFold(n_splits=folds)
    accuracy = []

    for train_index, test_index in splitter.split(X_train, y):
        print("SPLIT")
        x_train, x_test = X_train[train_index], X_train[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # A fresh model per fold so no weights leak between folds.
        model = Sequential()
        model.add(Dense(10, input_dim=x_train.shape[1], init='normal', activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(y_train.shape[1], init='normal', activation='softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        # The held-out fold doubles as validation data for the epoch log.
        model.fit(x_train, y_train,
                  nb_epoch=epochs, batch_size=batch_size,
                  validation_data=(x_test, y_test),
                  verbose=2)

        # evaluate() returns [loss, accuracy]; keep the accuracy.
        accuracy.append(model.evaluate(x_test, y_test)[1])
        print()

    mean_accuracy = np.mean(accuracy)
    print("Results")
    print("The average accuracy Neural Network is {}".format(mean_accuracy))
    # Returned as well as printed so callers can compare configurations.
    return mean_accuracy

In [85]:
# Build the dense TF-IDF matrices for the train and test text files, plus the
# vocabulary list that labels their columns.
X_train, X_test, unique_words = get_data()

In [None]:
# make_better_tags(X_train, X_test, unique_words, 5)

In [None]:
# Load the training target matrix (processed_tags.csv without its Name column).
y = get_output()

In [None]:
# K-fold cross-validation of the one-vs-rest random forest on the training data.
test_rf(X_train, y)

SPLIT


In [182]:
# test_nn(X_train, y)

SPLIT
Train on 6666 samples, validate on 3334 samples
Epoch 1/300
1s - loss: 20.5355 - acc: 0.0456 - val_loss: 20.0311 - val_acc: 0.0264
Epoch 2/300
1s - loss: 19.3469 - acc: 0.0419 - val_loss: 18.6029 - val_acc: 0.0330
Epoch 3/300
1s - loss: 18.3507 - acc: 0.0881 - val_loss: 18.0466 - val_acc: 0.0951
Epoch 4/300
1s - loss: 17.9785 - acc: 0.0954 - val_loss: 17.7424 - val_acc: 0.0951
Epoch 5/300
1s - loss: 17.6747 - acc: 0.0984 - val_loss: 17.4012 - val_acc: 0.0975
Epoch 6/300
1s - loss: 17.3008 - acc: 0.1290 - val_loss: 16.9838 - val_acc: 0.1509
Epoch 7/300
1s - loss: 16.8614 - acc: 0.1697 - val_loss: 16.5185 - val_acc: 0.1869
Epoch 8/300
1s - loss: 16.4170 - acc: 0.1938 - val_loss: 16.0560 - val_acc: 0.2010
Epoch 9/300
1s - loss: 16.0004 - acc: 0.1997 - val_loss: 15.6320 - val_acc: 0.2115
Epoch 10/300
1s - loss: 15.6330 - acc: 0.2147 - val_loss: 15.2548 - val_acc: 0.2232
Epoch 11/300
1s - loss: 15.3027 - acc: 0.2103 - val_loss: 14.9281 - val_acc: 0.2433
Epoch 12/300
1s - loss: 15.0342

KeyboardInterrupt: 

## Predictions

In [117]:
def rf_predict_tags(X_train, X_test, y, n_estimators=50):
    """Fit a one-vs-rest random forest on all training rows and save test predictions.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        Feature matrices with matching columns.
    y : numpy.ndarray
        Training targets, one column per tag, in the column order of
        ``processed_tags.csv`` with ``Name`` removed.
    n_estimators : int, optional
        Trees per forest. Defaults to 50, the size that ``test_rf``
        cross-validates, so the reported score describes the model that is
        actually used. It is also pinned because the library default has
        differed between scikit-learn releases.

    Side effects
    ------------
    Writes the per-tag probabilities for ``X_test`` to
    ``RF_predicted_tags.csv`` (no index column).
    """
    clf = OneVsRestClassifier(
        RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=0))
    clf.fit(X_train, y)

    predicted_tags = clf.predict_proba(X_test)

    # Only the header is needed, so read zero data rows. Remove "Name" by
    # label (as get_output does) rather than assuming it is the first column.
    columns = pd.read_csv("processed_tags.csv", nrows=0).columns.drop("Name")

    pd.DataFrame(predicted_tags, columns=columns).to_csv("RF_predicted_tags.csv", index=False)

In [118]:
def nn_predict_tags(X_train, X_test, y, epochs=30, batch_size=100):
    """Fit the one-hidden-layer network on all training rows and save test predictions.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        Feature matrices with matching columns.
    y : numpy.ndarray
        2-D training targets, one column per tag, in the column order of
        ``processed_tags.csv`` with ``Name`` removed.
    epochs : int, optional
        Training epochs. Defaults to 30.
    batch_size : int, optional
        Mini-batch size. Defaults to 100.

    Side effects
    ------------
    Prints the shape of the prediction matrix and writes it to
    ``NN_predicted_tags.csv`` (no index column).

    Notes
    -----
    NOTE(review): ``init=`` and ``nb_epoch=`` are the Keras 1 argument names;
    they are kept because the Keras version in use is not visible here.
    """
    model = Sequential()
    model.add(Dense(10, input_dim=X_train.shape[1], init='normal', activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], init='normal', activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(X_train, y,
              nb_epoch=epochs, batch_size=batch_size,
              verbose=2)

    predicted_tags = model.predict(X_test)
    print(predicted_tags.shape)

    # Only the header is needed, so read zero data rows. Remove "Name" by
    # label (as get_output does) rather than assuming it is the first column.
    columns = pd.read_csv("processed_tags.csv", nrows=0).columns.drop("Name")

    pd.DataFrame(predicted_tags, columns=columns).to_csv("NN_predicted_tags.csv", index=False)

In [177]:
def _write_submission(predicted_path, submission_path, ground_truth):
    """Rank ground-truth rows by cosine distance to each prediction and save the 20 nearest."""
    predicted = pd.read_csv(predicted_path)
    # Columns are compared by position, so both tables must share tag order.
    distances = cdist(predicted, ground_truth, metric='cosine')

    rows = [
        [str(k) + ".txt",
         " ".join(str(value) + ".jpg" for value in row.argsort()[:20])]
        for k, row in enumerate(distances)
    ]
    # Header spelling is kept verbatim: it is part of the emitted file format.
    pd.DataFrame(rows, columns=['Descritpion_ID', 'Top_20_Image_IDs']).to_csv(submission_path,
                                                                              index=False)


def make_output(rf=False, nn=False):
    """Turn saved tag predictions into ranked submission files.

    For every prediction row ``k`` the rows of ``processed_tags_test.csv``
    (minus its ``Name`` column) are sorted by cosine distance, and the indices
    of the 20 closest are written as ``"<i>.jpg"`` names next to
    ``"<k>.txt"``.

    Parameters
    ----------
    rf : bool, optional
        Read ``RF_predicted_tags.csv`` and write ``RF_submission.csv``.
    nn : bool, optional
        Read ``NN_predicted_tags.csv`` and write ``NN_submission.csv``.

    Both flags are honoured independently, so setting both produces both
    files. With neither set the function returns without touching any file.
    """
    if not (rf or nn):
        return

    ground_truth = pd.read_csv("processed_tags_test.csv").drop(['Name'], axis=1)

    if rf:
        _write_submission("RF_predicted_tags.csv", "RF_submission.csv", ground_truth)
    if nn:
        _write_submission("NN_predicted_tags.csv", "NN_submission.csv", ground_truth)

In [119]:
# Fit the random forest on all training rows and write its test-set tag
# probabilities to RF_predicted_tags.csv.
rf_predict_tags(X_train, X_test, y)

In [116]:
# Fit the neural network on all training rows and write its test-set
# predictions to NN_predicted_tags.csv.
nn_predict_tags(X_train, X_test, y)

Epoch 1/30
1s - loss: 20.2197 - acc: 0.0713
Epoch 2/30
1s - loss: 18.5220 - acc: 0.0815
Epoch 3/30
1s - loss: 17.9111 - acc: 0.0952
Epoch 4/30
2s - loss: 17.3521 - acc: 0.0974
Epoch 5/30
1s - loss: 16.6828 - acc: 0.1422
Epoch 6/30
2s - loss: 16.1139 - acc: 0.2013
Epoch 7/30
1s - loss: 15.6410 - acc: 0.2143
Epoch 8/30
1s - loss: 15.2501 - acc: 0.2270
Epoch 9/30
1s - loss: 14.9456 - acc: 0.2391
Epoch 10/30
1s - loss: 14.6585 - acc: 0.2395
Epoch 11/30
1s - loss: 14.4825 - acc: 0.2439
Epoch 12/30
1s - loss: 14.2730 - acc: 0.2498
Epoch 13/30
1s - loss: 14.1445 - acc: 0.2479
Epoch 14/30
1s - loss: 14.0271 - acc: 0.2550
Epoch 15/30
1s - loss: 13.8697 - acc: 0.2525
Epoch 16/30
1s - loss: 13.7837 - acc: 0.2524
Epoch 17/30
1s - loss: 13.6638 - acc: 0.2585
Epoch 18/30
1s - loss: 13.6445 - acc: 0.2600
Epoch 19/30
1s - loss: 13.5294 - acc: 0.2600
Epoch 20/30
1s - loss: 13.4714 - acc: 0.2566
Epoch 21/30
1s - loss: 13.4250 - acc: 0.2613
Epoch 22/30
1s - loss: 13.3674 - acc: 0.2639
Epoch 23/30
1s - lo

In [178]:
# Rank the rows of processed_tags_test.csv by cosine distance to each NN
# prediction and write the result to NN_submission.csv.
make_output(nn=True)

  dm /= _row_norms(XB)
