In [125]:
# This function should open a data file in tsv or csv, and transform it into a usable format

# Import necessary library -- pandas to read file
import pandas as pd

#Read data from tsv file
def load_tsvdata(tsvFileName):
    """Read a tab-separated file into a pandas DataFrame.

    Parameters
    ----------
    tsvFileName : str or path-like
        Location of the TSV file to read.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` called with ``sep='\\t'``.
    """
    # Open the file in a context manager so the handle is closed as soon as
    # pandas has finished parsing; a bare open() here left it dangling.
    with open(tsvFileName) as tsv_handle:
        return pd.read_csv(tsv_handle, sep = '\t')

#Read data from csv file
def load_csvdata(csvFileName):
    """Read a comma-separated file into a pandas DataFrame.

    Parameters
    ----------
    csvFileName : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` with its default separator.
    """
    # Open the file in a context manager so the handle is closed as soon as
    # pandas has finished parsing; a bare open() here left it dangling.
    with open(csvFileName) as csv_handle:
        return pd.read_csv(csv_handle)

#Drop useless columns in FEATURES
def preprocess_features(featureStream):
    """Remove the columns that are not used as features, in place.

    'YTId', 'year' and 'movieId' are dropped from ``featureStream`` one after
    another. The frame itself is modified and nothing is returned.
    """
    for unused_column in ('YTId', 'year', 'movieId'):
        featureStream.drop(columns = unused_column, inplace = True)

#Drop useless columns in LABELS
def preprocess_labels(labelStream):
    """Strip the 'movieId' column from a label table, modifying it in place.

    Nothing is returned; ``labelStream`` itself loses the column.
    """
    labelStream.drop(columns = ['movieId'], inplace = True)

In [126]:
# Load the five data files: train/validation features and labels (TSV)
# plus the test features (CSV).
train_F, train_L = load_tsvdata("train_features.tsv"), load_tsvdata("train_labels.tsv")
valid_F, valid_L = load_tsvdata("valid_features.tsv"), load_tsvdata("valid_labels.tsv")
test_F = load_csvdata("test_features.csv")

# Drop the unused columns from every table, in place.
for _feature_table in (train_F, valid_F, test_F):
    preprocess_features(_feature_table)
for _label_table in (train_L, valid_L):
    preprocess_labels(_label_table)

In [127]:
# Import necessary library -- TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

""" 
function: compute TF-IDF features for one text column
input: column: name of the column, e.g. 'title', 'tag'
       dfmin: int or float. Ignore terms that have a document frequency strictly lower than the given threshold.
                            If float in range of [0.0, 1.0], the parameter represents a proportion of documents.
       dfmax: int or float. Like dfmin, but an upper bound: ignore terms whose document frequency is strictly higher than the threshold.
return: TFIDF tables for train, valid and test. All three splits must go through the same operation.

"""
def TFIDF(column, dfmin = 1, dfmax = 0.5):
    """Build TF-IDF feature tables for one text column of the three splits.

    The vectorizer is fitted on the global ``train_F`` only and then applied
    unchanged to the globals ``valid_F`` and ``test_F``, so the three returned
    frames share the same vocabulary and column order.

    Parameters
    ----------
    column : str
        Name of the text column, e.g. 'title' or 'tag'.
    dfmin : int or float, default 1
        Forwarded to ``TfidfVectorizer`` as ``min_df``.
    dfmax : int or float, default 0.5
        Forwarded to ``TfidfVectorizer`` as ``max_df``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_TFIDF, valid_TFIDF, test_TFIDF)``, one column per vocabulary
        term.
    """
    vectorizer = TfidfVectorizer(stop_words='english', min_df = dfmin, max_df = dfmax)
    # astype('U') casts every cell to a unicode string before tokenizing.
    train = vectorizer.fit_transform(train_F[column].values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back to the old name on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = list(vectorizer.get_feature_names_out())
    else:
        words = list(vectorizer.get_feature_names())

    def _as_frame(matrix):
        # Densify the matrix and label its columns with the vocabulary.
        return pd.DataFrame(matrix.toarray(), columns = words)

    train_TFIDF = _as_frame(train)
    valid_TFIDF = _as_frame(vectorizer.transform(valid_F[column].values.astype('U')))
    test_TFIDF = _as_frame(vectorizer.transform(test_F[column].values.astype('U')))
    return train_TFIDF, valid_TFIDF, test_TFIDF

#merge the original data, TFIDF for title and TFIDF for tag to train.
def final_input(data, title_TFIDF, tag_TFIDF):
    """Combine the raw feature table with the title and tag TF-IDF tables.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table that still contains the raw 'title' and 'tag' columns.
    title_TFIDF, tag_TFIDF : pandas.DataFrame
        TF-IDF tables to append column-wise (aligned on the index).

    Returns
    -------
    pandas.DataFrame
        ``data`` without its raw 'title'/'tag' columns, followed by the
        columns of ``title_TFIDF`` and then ``tag_TFIDF``. ``data`` itself is
        not modified.
    """
    # The raw text is redundant once its key words are encoded as TF-IDF.
    # Drop it BEFORE concatenating: dropping by label afterwards would also
    # remove any vocabulary column that happens to be named 'title' or 'tag'.
    remaining_features = data.drop(columns = ['title', 'tag'])
    return pd.concat([remaining_features, title_TFIDF, tag_TFIDF], axis=1)

#normalize each column.
def normalization():
    """Min-max scale the global train/valid/test feature tables in place.

    Works on the module-level frames ``train_final``, ``valid_final`` and
    ``test_final``, which are addressed by column position and therefore
    expected to share the same column layout. For every column the minimum
    and maximum are taken from ``train_final`` only and the same
    ``(x - min) / (max - min)`` transform is applied to all three frames.

    Columns whose training values are all identical carry no information
    (and would divide by zero), so they are removed from all three frames.

    Returns
    -------
    None
        The three global frames are modified in place.
    """
    frames = (train_final, valid_final, test_final)
    # Names of the constant columns to remove once scaling is finished.
    droplist = []
    for i in range(len(train_final.columns)):
        train_column = train_final.iloc[:, i]
        col_max = train_column.max()
        col_min = train_column.min()

        if col_max == col_min:
            # Look the name up on train_final itself; the previous code read
            # it from an undefined variable and raised NameError here.
            droplist.append(train_final.columns[i])
            continue

        # Every split must go through exactly the same transform.
        span = col_max - col_min
        for frame in frames:
            frame.iloc[:, i] = (frame.iloc[:, i] - col_min) / span

    if droplist:
        for frame in frames:
            frame.drop(columns = droplist, inplace = True)

In [276]:
# Build the TF-IDF tables for the "title" and "tag" text columns.
# Other (dfmin, dfmax) pairs noted for title: (3,0.3) (5,0.3) (7,0.3), (20,0.3)
train_title_TFIDF, valid_title_TFIDF, test_title_TFIDF = TFIDF('title', dfmin = 9, dfmax = 0.3)
# Other (dfmin, dfmax) pairs noted for tag: (10,0.5) (15,0.5) (20,0.5), (25,0.5); the defaults are used here.
train_tag_TFIDF, valid_tag_TFIDF, test_tag_TFIDF = TFIDF(column = 'tag')
train_title_TFIDF

Unnamed: 0,adventures,america,american,angel,baby,bad,batman,battle,beach,beast,...,wild,wind,wolf,woman,women,world,year,york,young,zombies
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [277]:
# Assemble the model input for each split, then min-max scale all three in place.
train_final, valid_final, test_final = (
    final_input(raw_features, title_part, tag_part)
    for raw_features, title_part, tag_part in (
        (train_F, train_title_TFIDF, train_tag_TFIDF),
        (valid_F, valid_title_TFIDF, valid_tag_TFIDF),
        (test_F, test_title_TFIDF, test_tag_TFIDF),
    )
)
normalization()

In [292]:
# This part defines 7 models. I tried all of them, but only Naive Bayes and SVM (SVC) are described in detail in the report.

def DeTree(train_X, train_Y):
    """Fit a decision tree classifier (Gini criterion) and return it.

    ``train_Y`` is flattened with ``.values.ravel()`` before fitting.
    """
    from sklearn import tree
    Dtree = tree.DecisionTreeClassifier(criterion='gini') #entropy
    # Fit on the labels that were passed in. This used to read the global
    # train_L, silently ignoring the train_Y argument.
    Dtree.fit(train_X, train_Y.values.ravel())
    return Dtree

def RandomF(train_X, train_Y):
    """Fit a random forest classifier (entropy criterion) and return it.

    ``train_Y`` is flattened with ``.values.ravel()`` before fitting.
    """
    from sklearn.ensemble import RandomForestClassifier
    Rfc = RandomForestClassifier(criterion='entropy')
    Rfc.fit(train_X, train_Y.values.ravel())
    return Rfc

def KNN(train_X, train_Y, num_neighbors):
    """Fit a k-nearest-neighbours classifier with ``num_neighbors`` neighbours.

    ``train_Y`` is flattened with ``.values.ravel()`` before fitting.
    """
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors = num_neighbors)# 71, 61, 51, 81, 91
    knn.fit(train_X, train_Y.values.ravel())
    return knn

def MNB(train_X, train_Y):
    """Train a multinomial Naive Bayes classifier and return it.

    ``train_Y`` is flattened with ``.values.ravel()`` before being handed to
    ``fit`` together with ``train_X``.
    """
    from sklearn.naive_bayes import MultinomialNB
    classifier = MultinomialNB()
    flat_labels = train_Y.values.ravel()
    classifier.fit(train_X, flat_labels)
    return classifier

def BNB(train_X, train_Y):
    """Train a Bernoulli Naive Bayes classifier and return it.

    ``train_Y`` is flattened with ``.values.ravel()`` before being handed to
    ``fit`` together with ``train_X``.
    """
    from sklearn.naive_bayes import BernoulliNB
    classifier = BernoulliNB()
    flat_labels = train_Y.values.ravel()
    classifier.fit(train_X, flat_labels)
    return classifier

def GNB(train_X, train_Y):
    """Train a Gaussian Naive Bayes classifier and return it.

    ``train_Y`` is flattened with ``.values.ravel()`` before being handed to
    ``fit`` together with ``train_X``.
    """
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    flat_labels = train_Y.values.ravel()
    classifier.fit(train_X, flat_labels)
    return classifier

def SVM(train_X, train_Y):
    """Train a support vector classifier (RBF kernel, random_state=0) and return it.

    ``train_Y`` is flattened with ``.values.ravel()`` before being handed to
    ``fit`` together with ``train_X``.
    """
    from sklearn import svm
    # Author's note: a linear kernel, svm.SVC(kernel='linear'), is OK for this
    # problem; svm.NuSVC(gamma='auto') was tried as well.
    classifier = svm.SVC(kernel = 'rbf', random_state = 0)
    flat_labels = train_Y.values.ravel()
    classifier.fit(train_X, flat_labels)
    return classifier


In [293]:
# Train the classifier to evaluate. Only the SVM line is active; the other
# candidate models are left commented out.
#knn = KNN(train_final, train_L, 71)
#mnb = MNB(train_final, train_L)
#bnb = BNB(train_final, train_L)
#gnb = GNB(train_final, train_L)
svm = SVM(train_final, train_L)
#detree = DeTree(train_final, train_L)
#randomf = RandomF(train_final, train_L)

In [294]:
# This function should predict the class for an instance or a set of instances, based on a trained model
def predict(model, predict_X):
    """Return ``model.predict(predict_X)`` for one instance or a set of instances.

    ``model`` is any trained object exposing a ``predict`` method.
    """
    predicted_labels = model.predict(predict_X)
    return predicted_labels

In [295]:
# This function should evaluate a set of predictions in terms of accuracy
def evaluate(predict_real, predict_Y):
    """Score a set of predictions by accuracy.

    ``predict_real`` (the reference labels) and ``predict_Y`` (the predicted
    labels) are passed, in that order, to ``sklearn.metrics.accuracy_score``
    and its result is returned.
    """
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(predict_real, predict_Y)
    return accuracy

In [296]:
# Predict the validation set with the trained model. Only the SVM line is
# active; the lines for the other models are left commented out.
#predict_Y_knn = predict(knn, valid_final)
#predict_Y_mnb = predict(mnb, valid_final)
#predict_Y_bnb = predict(bnb, valid_final)
#predict_Y_gnb = predict(gnb, valid_final)
predict_Y_svm = predict(svm, valid_final)
#predict_Y_detree = predict(detree, valid_final)
#predict_Y_randomf = predict(randomf, valid_final)

In [297]:
# Print the validation accuracy of the active model (SVM); the lines for the
# other models are left commented out.
#print(evaluate(valid_L ,predict_Y_knn))
#print(evaluate(valid_L ,predict_Y_mnb))
#print(evaluate(valid_L ,predict_Y_bnb))
#print(evaluate(valid_L ,predict_Y_gnb))
print(evaluate(valid_L ,predict_Y_svm))
#print(evaluate(valid_L ,predict_Y_detree))
#print(evaluate(valid_L ,predict_Y_randomf))

0.38127090301003347


In [136]:
# Predict the test set and write the predictions to a csv file.
def test_submit_files(model):
    predict_test = predict(model, test_final)
    movieID = load_csvdata("test_features.csv")['movieId']
    submit = pd.concat([pd.DataFrame(movieID), pd.DataFrame(predict_test)], axis=1)
    submit.columns = ['movieId', 'genres']
    submit.to_csv(r'submit.csv', index = False, header=True)

In [137]:
test_submit_files(svm) # write the submission csv file using the svm model's test predictions.