In [28]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import KFold

from sklearn.multiclass import OneVsRestClassifier

from scipy.spatial.distance import cdist

from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def get_output():
    """Load the training target matrix from ``processed_tags.csv``.

    The ``Name`` column is discarded; every other column is kept in file
    order.

    Returns:
        numpy.ndarray: one row per CSV row, one column per remaining column.
    """
    tags_frame = pd.read_csv("processed_tags.csv")
    tags_only = tags_frame.drop(columns=["Name"])
    return np.array(tags_only)

In [3]:
def _read_documents(directory, count):
    """Return the whitespace-normalised lines of ``<directory>/<i>.txt`` for i in range(count)."""
    documents = []
    for i in range(count):
        # ``with`` releases each handle immediately instead of leaving
        # thousands of files open until garbage collection.
        with open("{}/{}.txt".format(directory, i)) as handle:
            # Every line of the file becomes its own document.
            documents.extend(' '.join(line.split()) for line in handle)
    return documents


def get_data(n_train=10000, n_test=2000,
             train_dir="features_train_prepossesing_noun",
             test_dir="features_test_prepossesing_noun"):
    """Build dense TF-IDF matrices for the train and test text files.

    Files are read as ``<dir>/0.txt`` ... ``<dir>/<n-1>.txt``. The vocabulary
    is every whitespace-separated token seen in the training documents; the
    vectorizer is fitted on the training documents only and then applied to
    the test documents.

    Args:
        n_train: number of training files to read.
        n_test: number of test files to read.
        train_dir: directory holding the training files.
        test_dir: directory holding the test files.

    Returns:
        tuple: ``(X_train, X_test, unique_words)`` where the first two are
        dense arrays with one column per entry of ``unique_words``.
    """
    train_lines = _read_documents(train_dir, n_train)

    # Sorted so the column order is the same on every run; iterating a set of
    # strings directly gives an order that can change between interpreter runs.
    unique_words = sorted({word for line in train_lines for word in line.split()})

    # NOTE(review): the vocabulary comes from a plain whitespace split, while
    # the vectorizer applies its own analyzer; tokens that analyzer never emits
    # would stay as all-zero columns -- confirm this is acceptable.
    count_vect = TfidfVectorizer(vocabulary=unique_words)
    X_train = count_vect.fit_transform(train_lines).toarray()

    test_lines = _read_documents(test_dir, n_test)
    X_test = count_vect.transform(test_lines).toarray()

    return X_train, X_test, unique_words

In [26]:
def make_better_tags(X_train, X_test, unique_words, k):
    """Export the columns of the k highest-weighted words per training row.

    For every row of ``X_train`` the ``k`` largest entries are located and
    the matching words are collected, in first-seen order, as the tag set.
    Both matrices are then reduced to those columns and written to
    ``better_train_<k>_tags_nouns.csv`` and ``better_test_<k>_tags_nouns.csv``.
    The number of tags and the width of each exported table are printed.

    Args:
        X_train: 2-D array, one column per entry of ``unique_words``.
        X_test: 2-D array with the same column layout as ``X_train``.
        unique_words: word for each column; entries are assumed to be unique.
        k: how many top-weighted words to take from each training row (>= 1).

    Raises:
        ValueError: if ``k`` is smaller than 1 (``[-0:]`` would otherwise
            select every word instead of none).
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)

    # word -> column index, in first-seen order. A dict gives O(1) membership
    # checks where the list-based version rescanned the whole tag list.
    # NOTE(review): rows with fewer than k non-zero weights still contribute
    # zero-weight words here -- confirm whether that is intended.
    tag_columns = {}
    for row in X_train:
        for i in row.argsort()[-k:][::-1]:
            tag_columns.setdefault(unique_words[i], i)

    unique_tags = list(tag_columns)
    column_idx = np.array(list(tag_columns.values()), dtype=np.intp)

    print(len(unique_tags))

    for split_name, features in (("train", X_train), ("test", X_test)):
        # One fancy-indexing step replaces the per-row, per-word Python loop.
        selected = features[:, column_idx]
        print(selected.shape[1])
        pd.DataFrame(selected, columns=unique_tags).to_csv(
            "better_{}_{}_tags_nouns.csv".format(split_name, k),
            index=False)

## Test models

In [6]:
from sklearn.ensemble import RandomForestClassifier

def test_rf(X_train, y):
    """Cross-validate a one-vs-rest random forest and print the scores.

    The data is split into 5 consecutive folds. For each fold a fresh
    classifier is fitted on the remaining folds and its ``score`` on the
    held-out fold is printed; the mean over all folds is printed at the end.

    Args:
        X_train: feature matrix, indexable by integer index arrays.
        y: target matrix with one row per row of ``X_train``.
    """
    folds = 5
    splitter = KFold(n_splits=folds)
    fold_scores = []

    for fit_idx, eval_idx in splitter.split(X_train, y):
        print("SPLIT")

        # A new model per fold so nothing learned carries over between folds.
        forest = RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=0)
        model = OneVsRestClassifier(forest)
        model.fit(X_train[fit_idx], y[fit_idx])

        fold_score = model.score(X_train[eval_idx], y[eval_idx])
        fold_scores.append(fold_score)
        print('RF accuracy:', fold_score)

    print("Results")
    print("The average accuracy Random Forest is {}".format(np.mean(fold_scores)))

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

def test_nn(X_train, y):
    """Cross-validate a small dense network and print the mean accuracy.

    The data is split into 3 consecutive folds. For each fold a fresh
    network (10 ReLU units, dropout 0.2, softmax output with one unit per
    column of ``y``) is trained for 300 epochs with the held-out fold as
    validation data, and its accuracy on that fold is recorded.

    Args:
        X_train: feature matrix, indexable by integer index arrays.
        y: target matrix with one row per row of ``X_train``.
    """
    folds = 3
    splitter = KFold(n_splits=folds)
    fold_accuracy = []

    for fit_idx, eval_idx in splitter.split(X_train, y):
        print("SPLIT")
        x_fit, y_fit = X_train[fit_idx], y[fit_idx]
        x_eval, y_eval = X_train[eval_idx], y[eval_idx]

        # NOTE(review): ``init`` and ``nb_epoch`` are Keras 1 argument names;
        # confirm the installed Keras version still accepts them.
        net = Sequential()
        net.add(Dense(10, input_dim=x_fit.shape[1], init='normal', activation='relu'))
        net.add(Dropout(0.2))
        net.add(Dense(y_fit.shape[1], init='normal', activation='softmax'))

        net.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])

        net.fit(x_fit, y_fit,
                nb_epoch=300, batch_size=100,
                validation_data=(x_eval, y_eval),
                verbose=2)

        # ``evaluate`` returns the loss first; index 1 is the accuracy metric.
        evaluation = net.evaluate(x_eval, y_eval)
        fold_accuracy.append(evaluation[1])
        print()

    print("Results")
    print("The average accuracy Neural Network is {}".format(np.mean(fold_accuracy)))

Using TensorFlow backend.


In [8]:
# Build the dense TF-IDF matrices and the vocabulary once; later cells reuse them.
X_train, X_test, unique_words = get_data()

In [24]:
# Export the columns of the top-3 words per training row for both splits to CSV.
make_better_tags(X_train, X_test, unique_words, 3)

3323
10000
3323
2000
3323
3323


In [10]:
# Load the tag matrix that the models below are trained against.
y = get_output()

In [None]:
# 5-fold cross-validation of the one-vs-rest random forest on the training data.
test_rf(X_train, y)

In [None]:
# test_nn(X_train, y)

## Predictions

In [19]:
def rf_predict_tags(X_train, X_test, y):
    """Fit a one-vs-rest random forest and save test-set tag probabilities.

    The classifier is trained on all of ``X_train``/``y``; its
    ``predict_proba`` output for ``X_test`` is written to
    ``RF_predicted_tags.csv``. Column names are taken from
    ``processed_tags.csv``, skipping its first column.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix with the same columns as ``X_train``.
        y: training target matrix, one column per tag.
    """
    forest = RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=0)
    model = OneVsRestClassifier(forest)
    model.fit(X_train, y)

    tag_probabilities = model.predict_proba(X_test)

    # Only the header is needed here; the first column is skipped by position.
    tag_names = pd.read_csv("processed_tags.csv").columns[1:]

    result = pd.DataFrame(tag_probabilities, columns=tag_names)
    result.to_csv("RF_predicted_tags.csv", index=False)

In [12]:
def nn_predict_tags(X_train, X_test, y):
    """Train the dense network on all training data and save test predictions.

    The network (10 ReLU units, dropout 0.2, softmax output with one unit per
    column of ``y``) is trained for 30 epochs. Its predictions for ``X_test``
    are written to ``NN_predicted_tags.csv`` using the column names of
    ``processed_tags.csv`` minus the first column. The shape of the
    prediction array is printed.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix with the same columns as ``X_train``.
        y: training target matrix, one column per tag.
    """
    # NOTE(review): ``init`` and ``nb_epoch`` are Keras 1 argument names;
    # confirm the installed Keras version still accepts them.
    net = Sequential()
    net.add(Dense(10, input_dim=X_train.shape[1], init='normal', activation='relu'))
    net.add(Dropout(0.2))
    net.add(Dense(y.shape[1], init='normal', activation='softmax'))

    net.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

    net.fit(X_train, y,
            nb_epoch=30, batch_size=100,
            verbose=2)

    tag_scores = net.predict(X_test)
    print(tag_scores.shape)

    # Only the header is needed here; the first column is skipped by position.
    tag_names = pd.read_csv("processed_tags.csv").columns[1:]

    result = pd.DataFrame(tag_scores, columns=tag_names)
    result.to_csv("NN_predicted_tags.csv", index=False)

In [21]:
def _write_submission(predicted_path, submission_path, ground_truth):
    """Rank ground-truth rows for each predicted row and save the 20 closest as a CSV."""
    predicted = pd.read_csv(predicted_path)

    # distances[i, j]: cosine distance between predicted row i and ground-truth row j.
    distances = cdist(predicted, ground_truth, metric='cosine')

    rows = []
    for k, row in enumerate(distances):
        # Smallest distance first. The row position in the ground-truth file
        # is used directly as the image id.
        # NOTE(review): this assumes ground-truth row j belongs to "j.jpg" -- confirm.
        top20 = row.argsort()[:20]
        rows.append([str(k) + ".txt",
                     " ".join(str(value) + ".jpg" for value in top20)])

    # Header spelling kept exactly as before; whatever reads the file may depend on it.
    pd.DataFrame(rows, columns=['Descritpion_ID', 'Top_20_Image_IDs']).to_csv(
        submission_path, index=False)


def make_output(rf=False, nn=False):
    """Create submission files from previously saved tag predictions.

    Each predicted row is compared with every row of
    ``processed_tags_test.csv`` (``Name`` column removed) by cosine distance,
    and the 20 nearest ground-truth rows are written per prediction.

    Args:
        rf: if true, read ``RF_predicted_tags.csv`` and write ``RF_submission.csv``.
        nn: if true, read ``NN_predicted_tags.csv`` and write ``NN_submission.csv``.

    Both flags may be set together, in which case both files are written
    (an ``if``/``elif`` chain would skip the NN output whenever ``rf`` is set).
    With neither flag set the function does nothing.
    """
    if not (rf or nn):
        return

    ground_truth = pd.read_csv("processed_tags_test.csv").drop(columns=['Name'])

    if rf:
        _write_submission("RF_predicted_tags.csv", "RF_submission.csv", ground_truth)
    if nn:
        _write_submission("NN_predicted_tags.csv", "NN_submission.csv", ground_truth)

In [20]:
# Fit on the full training set and write test-set tag probabilities to RF_predicted_tags.csv.
rf_predict_tags(X_train, X_test, y)

In [15]:
# nn_predict_tags(X_train, X_test, y)

In [22]:
# Turn RF_predicted_tags.csv into RF_submission.csv (20 nearest images per description).
make_output(rf=True)