https://www.kaggle.com/c/cs5785-fall19-final/

In [30]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import csv

stopWords = stopwords.words('english')
isStopWord = lambda w: w in stopWords or len(w) == 1

In [31]:
dataFolder = "cs5785-fall19-final"

descTrainFolder = dataFolder + "/descriptions_train"
descTestFolder = dataFolder + "/descriptions_test"

featTrainFolder = dataFolder + "/features_train"
featTestFolder = dataFolder + "/features_test"

imagesTrainFolder = dataFolder + "/images_train"
imagesTestFolder = dataFolder + "/images_test"

tagsTrainFolder = dataFolder + "/tags_train"
tagsTestFolder = dataFolder + "/tags_test"

folders = [descTrainFolder,   descTestFolder,   featTrainFolder, featTestFolder, 
           imagesTrainFolder, imagesTestFolder, tagsTrainFolder, tagsTestFolder]

In [32]:
def getFilesFromFolder(folder):
    return listdir(folder)

In [33]:
# function to preprocess data
def preprocessing(data):
    stop_words = set(stopwords.words('english')) # find stop words in English language
    lemmatizer = WordNetLemmatizer() # declare nltk lemmatizer

    # iterate through every sentence and replace it by itself lemmatized, without punctuation and without stop words
    for i in range(len(data)):
        sentence_no_punct = ''
        # remove punctuation
        
        for char in data[i]:
            if char not in string.punctuation:
                sentence_no_punct = sentence_no_punct + char
        data[i] = sentence_no_punct

        word_tokens = word_tokenize(data[i])
    
        # remove stop words and lemmatize
        word_tokens = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words and len(word) > 1]
        word_tokens = [lemmatizer.lemmatize(word, 'v') for word in word_tokens]
        word_tokens = [lemmatizer.lemmatize(word, 'a') for word in word_tokens]
        
        # remove conjunction words
        word_tokens = [word for word in word_tokens if word[-2:] != 'nt']
        (data[i]) = ' '.join(word_tokens)
        
    return data

In [34]:
def get_flat_descriptions_from_folder(folder):
    descriptions = []
    for i in range(10000):
        with open(('cs5785-fall19-final/descriptions_train/{}.txt').format(i), newline='') as f:
            desc1 = []
            reader = csv.reader(f)
            for row in reader:
                desc1.append(row)
            descriptions.append(desc1)
    
    # FIRST FLATTENING
    descriptions2 = []
    for description in descriptions:
        descriptions2.append([desc for sublist in description for desc in sublist])
        
    # MADE THE SENTENCES ALL ONE FOR EACH DESCRIPTION FILE
    flat_descriptions = []
    for description in descriptions2:
        desc1 = []
        for sentence in description:
            desc1 += sentence.split(' ')
        flat_descriptions.append(desc1)
    
    for i in range(len(flat_descriptions)):
        flat_descriptions[i] = (' ').join(flat_descriptions[i]).lower()
    
    return flat_descriptions

In [35]:
flat_descs_train = get_flat_descriptions_from_folder(descTrainFolder)
flat_descs_test = get_flat_descriptions_from_folder(descTestFolder)

In [36]:
#train_descs = [' '.join(set(preprocessing(desc).split())) for desc in flat_descs_train]
#test_descs = [' '.join(set(preprocessing(desc).split())) for desc in flat_descs_test]

# preprocess
train_descs = preprocessing(flat_descs_train)
test_descs = preprocessing(flat_descs_train)

In [None]:
# training data
resTrainFile = featTrainFolder + "/features_resnet1000_train.csv"
train_feat = pd.read_csv(resTrainFile, header = None, index_col = None)

# testing data
resTrainFile = featTestFolder + "/features_resnet1000_test.csv"
test_feat = pd.read_csv(resTrainFile, header = None, index_col = None)

In [None]:
# function to train logistic regression model, predict with it, and calculate the accuracy and confusion matrix
def Logistic_Regression(x_train, y_train, x_test, y_test):
    
    # initialize and fit logistic regression model with training data
    lr = LogisticRegression(solver = 'lbfgs', max_iter = 10000)
    lr.fit(x_train, y_train)
    
    # predict the result for the testing data
    lr_pred = lr.predict(x_test) 
    
    # calculate accuracy
    lr_acc = accuracy_score(lr_pred, y_test)
    
    # confusion matrix
    cfn_matrix_lr = confusion_matrix(y_test, lr_pred)
   
    # return accuracy and confusion matrix
    return lr_acc,cfn_matrix_lr

In [None]:
# function to train gaussian naive bayes model, predict with it, and calculate the accuracy and confusion matrix
def Gaussian_NB(x_train, y_train, x_test, y_test):
    
    # initialize and fit naive bayes model gaussian prior with training data
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)

    # predict the results for the test set
    gnb_pred = gnb.predict(x_test) 
    
    # calculate accuracy
    gnb_acc = accuracy_score(gnb_pred, y_test)
    
    # confusion matrix
    cfn_matrix_gnb = confusion_matrix(y_test, gnb_pred)
    
    # return accuracy and confusion matrix
    return gnb_acc, cfn_matrix_gnb

In [None]:
# function to train random forest regressor model, predict with it, and calculate the accuracy and confusion matrix
def Random_Forest(x_train, y_train, x_test, y_test):
    # set up regressor  
    rf_regressor = RandomForestRegressor(max_depth=20) 
    
    # fit regressor 
    rf_regressor.fit(x_train, y_train)
    
    # predict
    rf_pred = rf_regressor.predict(x_test)
    
    # calculate accuracy
    rf_acc = accuracy_score(rf_pred, y_test)
    
    # confusion matrix
    cfn_matrix_rf = confusion_matrix(y_test, rf_pred)
    
    # return accuracy and confusion matrix
    return rf_acc, cfn_matrix_rf

In [None]:
# function to train KNN regressor model, predict with it, and calculate the accuracy and confusion matrix
def KNN_Regressor(x_train, y_train, x_test, y_test):
    # set up regressor  
    knn_regressor = KNeighborsRegressor(max_depth=20) 
    
    # fit regressor 
    knn_regressor.fit(x_train, y_train)
    
    # predict
    knn_pred = knn_regressor.predict(x_test)
    
    # calculate accuracy
    knn_acc = accuracy_score(knn_pred, y_test)
    
    # confusion matrix
    cfn_matrix_knn = confusion_matrix(y_test, knn_pred)
    
    # return accuracy and confusion matrix
    return knn_acc, cfn_matrix_knn

In [None]:
# function to return the number of nearest neighbors specified
def knn_function(data, neighbors_number):
    nbrs = NearestNeighbors(n_neighbors=neighbors_number, algorithm='ball_tree').fit(data)
    return nbrs

In [None]:
# CREATE THE BAG OF WORDS DICTIONARY
BOW = {}
for description in train_descs:
    sentence_lst = description.split(' ')
    for word in sentence_lst:
        BOW[word] = 0
BOW['null'] = 0

In [None]:
# CREATE FEATURE VECTORS for TRAIN
feature_vectors = []
for description in train_descs:
    feat_vec = BOW.copy()
    sentence_lst = description.split(' ')
    for word in sentence_lst:
        if word in feat_vec:
            feat_vec[word] += 1
        else:
            feat_vec['null'] += 1       
    feature_vectors.append(feat_vec)
    
# TURN DICTIONARIES INTO A MATRIX with each row as one description
feature_vector_matrix = []
for feature_vec in feature_vectors:
    feature_vector_matrix.append(list(feature_vec.values()))
    
# NORMALIZE THE FEATURES
feature_vector_matrix = sklearn.preprocessing.normalize(feature_vector_matrix) # default is L2 norm

In [22]:
# CREATE FEATURE VECTORS for TEST
feature_vectors = []
for description in test_descs:
    feat_vec = BOW.copy()
    sentence_lst = description.split(' ')
    for word in sentence_lst:
        if word in feat_vec:
            feat_vec[word] += 1
        else:
            feat_vec['null'] += 1       
    feature_vectors.append(feat_vec)
    
# TURN DICTIONARIES INTO A MATRIX with each row as one description
feature_vector_matrix_test = []
for feature_vec in feature_vectors:
    feature_vector_matrix_test.append(list(feature_vec.values()))
    
# NORMALIZE THE FEATURES
feature_vector_matrix_test = sklearn.preprocessing.normalize(feature_vector_matrix_test) # default is L2 norm

NameError: name 'sklearn' is not defined

In [28]:
# method to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels.
#le = preprocessing.LabelEncoder()
#le.fit(train_descs)
#encoding = le.transform(train_descs)
#print(le.inverse_transform(encoding))
#print(encoding)

['skateboard use man person picnic top skate table crowd pull watch stage show rid boarder put trick skateboarder'
 'enjoy next someone soup noodle bowl ready serve healthy food asian eat sit chopstick tasty carrot ramen shrimp'
 'man intersection cross drive bus ice icecream concession near truck cream street walk across behind busy'
 ...
 'lamp background bedroom cover small nightstand behind room stand next table sit bunch night wall quilt type paper bed'
 'track station drive metal next wall rail train past silver passenger hour travel sit daytime across subway'
 'man board wave surf huge top big surfboard hit shirt rid white trick wipe fall surfer water']
[7770 1631 4256 ... 3195 9115 4176]


In [31]:
#lr_acc, cfn_matrix_lr = Logistic_Regression(train_descs, train_feat, test_descs, test_feat)
#gnb_acc, cfn_matrix_gnb = Gaussian_NB(train_descs, train_feat, test_descs, test_feat)
rf_acc, cfn_matrix_rf = Random_Forest(train_descs, train_feat, test_descs, test_feat)



ValueError: could not convert string to float: 'skateboard use man person picnic top skate table crowd pull watch stage show rid boarder put trick skateboarder'

In [None]:
#train_feat.drop(train_feat.columns[0], axis=1)
#test = ((((train_feat.loc[:,0]).split('/'))[1]).split('.'))[0]
#test = train_feat.loc[:,0]
#test_2 = (((test.split('/'))[1]).split('.'))[0]
#train_feat[0]
