In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training data; the first CSV column is used as the row index.
training_csv = '~/Downloads/COMP309A3-Data/allGenres.csv'
all_genres_data = pd.read_csv(training_csv, index_col=0)

In [3]:
# Read the testing instances; the first CSV column is used as the row index.
testing_csv = '~/Downloads/COMP309A3-Data/testing-instances.csv'
unlabelled_data = pd.read_csv(testing_csv, index_col=0)

In [4]:
# Normalise the placeholder values that stand for "missing" to NaN in both
# datasets:
#   * the string '?' anywhere in the frame
#   * -1 in the duration_ms column
# The column replacement is assigned back rather than calling
# frame["duration_ms"].replace(..., inplace=True): an in-place call on a
# selected column may act on a temporary object (and always does under pandas
# Copy-on-Write), which would silently leave the -1 values in the frame.
for dataset in (all_genres_data, unlabelled_data):
    dataset.replace('?', np.nan, inplace=True)
    dataset["duration_ms"] = dataset["duration_ms"].replace(-1, np.nan)

In [5]:
# Add the 'Number in track_name' column to both datasets, initialised to the
# empty string; the actual values are filled in by a later cell.
for dataset in (all_genres_data, unlabelled_data):
    dataset['Number in track_name'] = ''

In [6]:
#function to find if a string contains digits
def has_digits(str):
    """Return True if ``str`` contains at least one digit character.

    Parameters
    ----------
    str : object
        The value to inspect. The parameter name shadows the builtin ``str``
        and is kept only so existing callers keep working.

    Returns
    -------
    bool
        True when any character satisfies ``isdigit()``; False for strings
        without digits, for the empty string, and for any non-string value
        (e.g. NaN or None standing in for a missing name).
    """
    # The builtin `str` is shadowed by the parameter, so compare against
    # type("") instead. Non-strings would otherwise raise TypeError when
    # iterated (NaN is a float).
    if not isinstance(str, type("")):
        return False
    return any(char.isdigit() for char in str)

In [7]:
# Flag, for every row of both datasets, whether the track name contains a digit.
# Series.map applies has_digits element-wise and keeps the row index, so no
# explicit loop is needed (Series.iteritems() was removed in pandas 2.0).
all_genres_data['Number in track_name'] = all_genres_data['track_name'].map(has_digits)
unlabelled_data['Number in track_name'] = unlabelled_data['track_name'].map(has_digits)

In [8]:
# Add the 'Japanese caharacter in artist_name' column to both datasets,
# initialised to the empty string; the actual values are filled in by a later cell.
for dataset in (all_genres_data, unlabelled_data):
    dataset['Japanese caharacter in artist_name'] = ''

In [9]:
#function to find if a string contains Japanese characters
import re
# Character class covering hiragana and katakana (U+3040-U+30FF), CJK ideograph
# ranges (U+3400-U+4DBF, U+4E00-U+9FFF, U+F900-U+FAFF) and half-width katakana
# (U+FF66-U+FF9F). The ideograph ranges are shared with Chinese, so a name
# written only in Chinese characters also matches.
pattern = "[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]"
def has_jap_char(str):
    """Return True if ``str`` contains a character matched by ``pattern``.

    Parameters
    ----------
    str : object
        The value to inspect. The parameter name shadows the builtin ``str``
        and is kept only so existing callers keep working.

    Returns
    -------
    bool
        True when at least one character falls in the ranges of ``pattern``;
        False otherwise, including for any non-string value (e.g. NaN or None
        standing in for a missing name).
    """
    # The builtin `str` is shadowed by the parameter, so compare against
    # type(""). Without this guard re.search raises TypeError on NaN / None.
    if not isinstance(str, type("")):
        return False
    return re.search(pattern, str) is not None

In [10]:
# Flag, for every row of both datasets, whether the artist name contains a
# character matched by has_jap_char.
# Series.map applies the function element-wise and keeps the row index, so no
# explicit loop is needed (Series.iteritems() was removed in pandas 2.0).
all_genres_data['Japanese caharacter in artist_name'] = all_genres_data['artist_name'].map(has_jap_char)
unlabelled_data['Japanese caharacter in artist_name'] = unlabelled_data['artist_name'].map(has_jap_char)

In [11]:
# Remove the columns that are not used as model inputs. Both datasets lose the
# same four columns; the training data additionally loses 'Genre'.
# Dropping is done in place, so re-running this cell raises KeyError.
unused_columns = ['track_hash', 'obtained_date', 'track_name', 'artist_name']
all_genres_data.drop(columns=unused_columns + ['Genre'], inplace=True)
unlabelled_data.drop(columns=unused_columns, inplace=True)

In [12]:
# Build the feature matrix X and the target y from the labelled data, then
# split them into training and testing parts.
# Layout assumed here: the target is the third column from the end, and the
# two engineered flag columns are added to the features by name.
engineered_columns = ['Number in track_name', 'Japanese caharacter in artist_name']
feature_columns = list(all_genres_data.columns[:-3]) + engineered_columns
# Select every feature column in a single step. Assigning extra columns into a
# frame that was itself sliced out of all_genres_data is the pattern pandas
# flags with SettingWithCopyWarning.
X = all_genres_data[feature_columns]
y = all_genres_data[all_genres_data.columns[-3]]

from sklearn.model_selection import train_test_split
# 70 % train / 30 % test, stratified on y; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [13]:
# Encode the categorical attributes of the training split as integer codes.
# One LabelEncoder per column is fitted here and kept under its own name
# (encoder1..encoder4) so that other cells can apply the same mapping.
# NOTE(review): scikit-learn documents LabelEncoder for target labels;
# OrdinalEncoder is the feature-oriented equivalent — consider switching.
from sklearn.preprocessing import LabelEncoder

preprocessed_X_train = X_train.copy()

encoder1 = LabelEncoder()
encoder2 = LabelEncoder()
encoder3 = LabelEncoder()
encoder4 = LabelEncoder()

column_encoders = (
    ('key', encoder1),
    ('mode', encoder2),
    ('Number in track_name', encoder3),
    ('Japanese caharacter in artist_name', encoder4),
)
for column, encoder in column_encoders:
    # LabelEncoder expects 1-D input: pass the column itself rather than an
    # (n, 1) array, which only works through a DataConversionWarning.
    preprocessed_X_train[column] = encoder.fit_transform(preprocessed_X_train[column])

In [14]:
# Fill the missing values of the training split with a KNN imputer (default
# settings). The imputer is fitted here and stays available as `imputer`.
from sklearn.impute import KNNImputer

imputer = KNNImputer()
# The imputed values are wrapped back into a DataFrame that carries the
# feature names of X_train.
preprocessed_X_train = pd.DataFrame(
    imputer.fit_transform(preprocessed_X_train),
    columns=X_train.columns,
)

In [15]:
# Standardise the training split with a StandardScaler. The scaler is fitted
# here and stays available as `scaler`.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# The scaled values are wrapped back into a DataFrame that carries the
# feature names of X_train.
preprocessed_X_train = pd.DataFrame(
    scaler.fit_transform(preprocessed_X_train),
    columns=X_train.columns,
)

In [16]:
# Apply the three already-fitted preprocessing steps (label encoding, KNN
# imputation, standard scaling) to the held-out test split. Only transform()
# is called, so nothing is re-fitted on the test data.
preprocessed_X_test = X_test.copy()
fitted_encoders = {
    'key': encoder1,
    'mode': encoder2,
    'Number in track_name': encoder3,
    'Japanese caharacter in artist_name': encoder4,
}
for column, encoder in fitted_encoders.items():
    column_values = preprocessed_X_test[column].values.reshape(-1,1)
    preprocessed_X_test[column] = encoder.transform(column_values)
# After each step the result is wrapped back into a DataFrame carrying the
# feature names of X_test.
preprocessed_X_test = pd.DataFrame(imputer.transform(preprocessed_X_test), columns=X_test.columns)
preprocessed_X_test = pd.DataFrame(scaler.transform(preprocessed_X_test), columns=X_test.columns)

In [17]:
# Apply the three already-fitted preprocessing steps (label encoding, KNN
# imputation, standard scaling) to the unlabelled data. Only transform() is
# called, so nothing is re-fitted here.
prediction_X = unlabelled_data.copy()
fitted_encoders = {
    'key': encoder1,
    'mode': encoder2,
    'Number in track_name': encoder3,
    'Japanese caharacter in artist_name': encoder4,
}
for column, encoder in fitted_encoders.items():
    column_values = prediction_X[column].values.reshape(-1,1)
    prediction_X[column] = encoder.transform(column_values)
# After each step the result is wrapped back into a DataFrame carrying the
# column names of unlabelled_data. No index is passed, so prediction_X ends up
# with a default integer index rather than the index of unlabelled_data.
prediction_X = pd.DataFrame(imputer.transform(prediction_X), columns=unlabelled_data.columns)
prediction_X = pd.DataFrame(scaler.transform(prediction_X), columns=unlabelled_data.columns)

In [18]:
#train classifiers on the training split and print their accuracy on the labelled test split
from sklearn.metrics import accuracy_score

In [19]:
# MLP classifier: fit on the preprocessed training split and report accuracy
# on the preprocessed held-out test split.
from sklearn.neural_network import MLPClassifier
# alpha is the L2 regularisation strength (1e-10 effectively disables it).
# random_state pins the random weight initialisation and batch shuffling so
# the reported accuracy is the same on every run.
clf = MLPClassifier(alpha=1e-10, random_state=1)
clf.fit(preprocessed_X_train, y_train)
# accuracy_score takes (y_true, y_pred); the value is the same either way for
# accuracy, but the documented order is used here.
print("Accuracy = ", accuracy_score(y_test, clf.predict(preprocessed_X_test)))

Accuracy =  0.5850666666666666


In [20]:
# Predict a genre for every unlabelled instance, show the result and save it
# to a CSV file for submission.
prediction_y = clf.predict(prediction_X)
# The predicted genres become the row index; the single data column (named 0)
# holds prediction_X's index shifted by one.
# NOTE(review): genre-as-index / id-as-data looks like swapped arguments —
# confirm against the required submission format before changing it.
prediction = pd.DataFrame(data=prediction_X.index + 1, index=prediction_y)
display(prediction)
prediction.to_csv('answer.csv')

Unnamed: 0,0
Anime,1
Electronic,2
Classical,3
Alternative,4
Jazz,5
...,...
Rap,19996
Electronic,19997
Alternative,19998
Anime,19999
