1. Get train data and test data
2. Split train data (train, eval) right away to simulate train vs test data
3. Handle missing values (NaN entries and '*insert*' placeholder strings)
4. Handle outliers

In [302]:
import numpy as np
import pandas as pd
import copy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import os
from scipy import stats
from sklearn.preprocessing import OneHotEncoder

In [303]:
# Load the labelled training set and the test set from the working directory,
# then print the (rows, columns) shape of each.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
for loaded in (df_train, df_test):
    print(loaded.shape)

(170000, 19)
(30000, 18)


In [304]:

def convertToNum(a):
    """Map a 'Concert Enjoyment' label to an ordinal code.

    'Worst Concert Ever' -> 0, 'Did Not Enjoy' -> 1, 'Enjoyed' -> 2.
    Every other value (including 'Best Concert Ever') -> 3.
    """
    ordered_labels = ('Worst Concert Ever', 'Did Not Enjoy', 'Enjoyed')
    for code, label in enumerate(ordered_labels):
        if a == label:
            return code
    # Anything that is not one of the three labels above gets the top code.
    return 3
    

#df_train['Num Concert Enjoyment'] = df_train['Concert Enjoyment'].replace(
#    ['Worst Concert Ever', 'Did Not Enjoy', 'Enjoyed', 'Best Concert Ever'], 
#    [0,1,2,3]
#)

In [305]:
# Hold out the last 5% of the training rows as an evaluation set that stands in
# for unseen test data. The split is positional (no shuffling): the evaluation
# rows are simply the tail of the training frame.
#
# Plain .iloc slicing replaces np.split(df_train, ...): np.split on a DataFrame
# goes through DataFrame.swapaxes, which recent pandas releases deprecate.
# .copy() makes both halves independent frames, so the column assignments done
# in later cells do not operate on a slice of the original frame.
split_at = int(0.95 * len(df_train))
df_train, df_eval = df_train.iloc[:split_at].copy(), df_train.iloc[split_at:].copy()
print(df_train.shape)
print(df_eval.shape)

(161500, 19)
(8500, 19)


# To Lower and To String

In [306]:
def toLower(a):
    """Return `a` lower-cased, passing missing values through untouched.

    `None` and NaN are returned as-is; any other value must provide a
    `.lower()` method (i.e. be a string).
    """
    # `is None` is an identity test and cannot be fooled by a custom __eq__;
    # `a != a` is true only for NaN, the one value that is not equal to itself.
    if a is None or a != a:
        return a
    return a.lower()

def toString(a):
    """Return the integer part of `a` as a string, passing missing values through.

    `None` and NaN are returned unchanged. Any other value is converted with
    `int()` first, so 12.0 becomes '12' rather than '12.0'.
    """
    # `is None` is an identity test; `a != a` is true only for NaN.
    if a is None or a != a:
        return a
    return str(int(a))

# Normalise the text columns to lower case and store the concert identifier as
# a string, in each of the three data sets.
for frame in (df_train, df_eval, df_test):
    for c in ["Band Country of Origin", "Band Name", "Band Genre", "Concert Goer Country of Origin"]:
        frame[c] = frame[c].map(toLower)

    for c in ['Concert ID']:
        frame[c] = frame[c].map(toString)

# By Decile

In [307]:
def groupByDecile(a):
    """Bucket a number into its decade and return the bucket as a string.

    * `None` and NaN are returned unchanged.
    * Any value below 20 is mapped to '16', so that the youngest age bucket
      starts at 16 instead of 10.
    * Every other value is rounded down to a multiple of 10,
      e.g. 1976 - 1976 % 10 = 1970 -> '1970'.
    """
    # `is None` is an identity test; `a != a` is true only for NaN.
    if a is None or a != a:
        return a
    if a < 20:
        return str(16)
    return str(int(a - a % 10))

# Replace the band debut year and the concert goer age by their decade bucket
# in each of the three data sets.
for frame in (df_train, df_eval, df_test):
    for c in ["Band Debut", "Concert Goer Age"]:
        frame[c] = frame[c].map(groupByDecile)

# Handle Missing values

In [308]:
# Missing entries show up either as NaN or as an '*insert*' placeholder string.
# This cell fills the NaN entries with the most frequent value of their column;
# the placeholder strings are handled in a later cell.
from sklearn.impute import SimpleImputer

# The target stays out of the imputer: only the feature columns are imputed.
y_train = df_train['Concert Enjoyment']
y_eval = df_eval['Concert Enjoyment']
X_train = df_train.drop(columns=['Concert Enjoyment'])
X_eval = df_eval.drop(columns=['Concert Enjoyment'])

# Remember the labels needed to rebuild the DataFrames after transformation.
cols = X_train.columns.values
idx_train = X_train.index
idx_eval = X_eval.index

# The imputer is fitted on the training split only and then applied to all
# three data sets.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp.fit(X_train)
X_train, X_eval, X_test = [imp.transform(part) for part in (X_train, X_eval, df_test)]

# Author's note: for the submission, the model is trained on ALL of train.
df_train = pd.concat(
    [pd.DataFrame(X_train, index=idx_train, columns=cols), y_train],
    axis=1,
)
df_eval = pd.concat(
    [pd.DataFrame(X_eval, index=idx_eval, columns=cols), y_eval],
    axis=1,
)
df_test = pd.DataFrame(X_test, columns=cols)


# To Int

In [309]:
# Cast the venue/rain/seating columns to integers in every data set; df_test is
# included because it is needed for the submission.
cols_to_int = ['Inside Venue', 'Rain', 'Seated']
for frame in (df_train, df_eval, df_test):
    for c in cols_to_int:
        frame[c] = frame[c].astype(int)

In [310]:
# Some categorical entries contain an 'insert' placeholder instead of a real
# value. Replace them with the most frequent genuine value of the column.
#
# The replacement value is learned once, from the training split only, and then
# reused for the evaluation and test sets. Previously the test set was filled
# with its own mode, which contradicted the note saying everything should use
# the train mode and treated test differently from eval.
cols_with_insert = ['Band Name', 'Band Genre', 'Band Country of Origin','Concert Goer Country of Origin']
for c in cols_with_insert:
    # Placeholder rows are excluded so a placeholder can never be picked as the
    # "most frequent" value.
    train_mode = df_train.loc[~df_train[c].str.contains('insert'), c].mode()[0]
    for frame in (df_train, df_eval, df_test):
        frame.loc[frame[c].str.contains('insert'), c] = train_mode

# Handle Outliers

In [311]:
def convertConcertAttendance(a):
    """Scale down attendance figures that are above the outlier threshold.

    When the integer part of `a` is greater than 162754 the value is divided
    by 10000; otherwise `a` is returned unchanged.
    """
    is_outlier = int(a) > 162754
    return (a / 10000) if is_outlier else a

# Apply the attendance correction to every data set; df_test is included
# because it is needed for the submission.
for frame in (df_train, df_eval, df_test):
    frame['Concert Attendance'] = frame['Concert Attendance'].map(convertConcertAttendance)

In [312]:
# Values of 'Personnality Trait 2' outside [-5, 5] are treated as outliers and
# replaced by the mean of the training column.
#
# The mean is computed once, before any replacement. Previously it was
# re-evaluated after df_train had already been modified, so the evaluation and
# test sets received a slightly different fill value than the training set.
trait_col = 'Personnality Trait 2'
trait_mean = df_train[trait_col].mean()

# df_test gets the same treatment because it is needed for the submission.
for frame in (df_train, df_eval, df_test):
    out_of_range = (frame[trait_col] < -5) | (frame[trait_col] > 5)
    frame.loc[out_of_range, trait_col] = trait_mean

# Group Data

In [313]:
def groupCountry(a):
    """Keep the three listed countries and collapse every other value to 'other'."""
    kept_countries = ['canada', 'united states of america (usa)', 'united kingdom (uk)']
    return a if a in kept_countries else 'other'

# Group the concert goer's country in every data set; df_test is included
# because it is needed for the submission.
for frame in (df_train, df_eval, df_test):
    frame['Concert Goer Country of Origin'] = frame['Concert Goer Country of Origin'].map(groupCountry)

# Keep Important Concert IDs

In [314]:
def convertConcertID(a):
    """Keep a short list of concert IDs and collapse every other ID to 'other'.

    `a` is compared after conversion with `int()`; a kept ID is returned as
    `str(a)`.
    """
    # Author's note: the 5 best and the 5 worst average ratings among the
    # indoor concerts.
    notable_ids = {879, 539, 456, 961, 592, 978, 441, 193, 24, 164}
    if int(a) not in notable_ids:
        return 'other'
    return str(a)
    
# Reduce the concert identifier to the retained IDs plus 'other' in every data set.
for frame in (df_train, df_test, df_eval):
    frame['Concert ID'] = frame['Concert ID'].map(convertConcertID)

# Keep Important Concert Goer IDs

In [315]:
def convertConcertGoerID(a):
    """Keep a short list of concert goer IDs and collapse every other ID to 'other'.

    The comparison and the returned value both use `str(a)`.
    """
    notable_goers = {
        'concert_goer_695',
        'concert_goer_413',
        'concert_goer_1392',
        'concert_goer_398',
        'concert_goer_1770',
        'concert_goer_1011',
        'concert_goer_1414',
        'concert_goer_1419',
        'concert_goer_1506',
        'concert_goer_390',
    }
    goer_id = str(a)
    if goer_id not in notable_goers:
        return 'other'
    return goer_id
    
# Reduce the concert goer identifier to the retained IDs plus 'other' in every data set.
for frame in (df_train, df_test, df_eval):
    frame['Concert Goer ID'] = frame['Concert Goer ID'].map(convertConcertGoerID)

# Create New Feature

In [316]:
# New indicator feature: 1 when the concert was not inside a venue and it was
# raining, 0 otherwise.
for frame in (df_train, df_test, df_eval):
    not_inside = frame['Inside Venue'] == 0
    raining = frame['Rain'] == 1
    frame['Out n Rain'] = (not_inside & raining).astype(int)

# Drop Cols

In [317]:
# The row identifier is not a feature. Keep the test identifiers aside (they
# are needed to build the submission file) and drop the column everywhere.
cols_to_drop = ['Id']
ids_test = df_test['Id']
df_train, df_eval, df_test = [
    frame.drop(columns=cols_to_drop)
    for frame in (df_train, df_eval, df_test)
]

# Balance Dataset

from sklearn.metrics import DistanceMetric
from imblearn.over_sampling import SMOTENC

# Class distribution before rebalancing (only displayed when this is the last
# expression of a notebook cell).
df_train['Concert Enjoyment'].value_counts()

# Under-sample the two majority classes by dropping a quarter of their rows.
# random_state makes the dropped rows reproducible from run to run, in line
# with the seeded SMOTENC below; the original sampling was unseeded.
for majority_label in ('Enjoyed', 'Did Not Enjoy'):
    majority_rows = df_train[df_train['Concert Enjoyment'] == majority_label]
    df_train = df_train.drop(majority_rows.sample(frac=0.25, random_state=0).index)

df_train['Concert Enjoyment'].value_counts()

# Requested number of rows per class after resampling (SMOTENC sampling_strategy).
nb_class_dict = {
    'Did Not Enjoy':40783,
    'Enjoyed':40769,
    'Worst Concert Ever':26000,
    'Best Concert Ever':26000
}
X = df_train.drop(columns=['Concert Enjoyment'])
y = df_train['Concert Enjoyment']
cols = X.columns
# Public select_dtypes replaces the private DataFrame._get_numeric_data().
num_cols = X.select_dtypes(include=['number', 'bool']).columns
# Every non-numeric column is categorical; keeping the frame's column order
# makes the index list deterministic (a set difference has arbitrary order).
numeric_names = set(num_cols)
cat_cols = [c for c in cols if c not in numeric_names]
cat_cols_idx = [X.columns.get_loc(c) for c in cat_cols]

sm = SMOTENC(random_state=0, sampling_strategy=nb_class_dict, categorical_features=cat_cols_idx)

X_bal, y_bal = sm.fit_resample(X, y)

df_train = pd.concat([X_bal, y_bal], axis=1)

# Class distribution after rebalancing.
df_train['Concert Enjoyment'].value_counts()

# One Hot

In [318]:
# Categorical columns that are expanded into indicator columns further down.
one_hot_cols = [
    'Band Name',
    'Concert Goer ID',
    'Concert ID',
    'Band Genre',
    'Band Country of Origin',
    'Concert Goer Country of Origin',
]

In [319]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [320]:
# Separate the target column from the features for the training and
# evaluation sets.
y_train = df_train['Concert Enjoyment']
y_eval = df_eval['Concert Enjoyment']

X_train = df_train.drop('Concert Enjoyment', axis=1)
X_eval = df_eval.drop('Concert Enjoyment', axis=1)

In [321]:
# Display the column names of the prepared test set.
df_test.columns.values

array(['Band Name', 'Band Genre', 'Band Country of Origin', 'Band Debut',
       'Concert ID', 'Concert Attendance', 'Inside Venue', 'Rain',
       'Seated', 'Personnality Trait 1', 'Personnality Trait 2',
       'Personnality Trait 3', 'Personnality Trait 4', 'Concert Goer Age',
       'Concert Goer ID', 'Height (cm)', 'Concert Goer Country of Origin',
       'Out n Rain'], dtype=object)

In [322]:
# Expand the categorical columns into indicator columns. Each data set is
# encoded on its own, so a category missing from one of them produces no
# column there.
X_train, X_eval, X_test = [
    pd.get_dummies(frame, columns=one_hot_cols)
    for frame in (X_train, X_eval, df_test)
]

In [323]:
# Display the columns of the test set after one-hot encoding.
X_test.columns

Index(['Band Debut', 'Concert Attendance', 'Inside Venue', 'Rain', 'Seated',
       'Personnality Trait 1', 'Personnality Trait 2', 'Personnality Trait 3',
       'Personnality Trait 4', 'Concert Goer Age',
       ...
       'Band Genre_pop music', 'Band Genre_rnb', 'Band Genre_rock n roll',
       'Band Country of Origin_canada',
       'Band Country of Origin_united kingdom (uk)',
       'Band Country of Origin_united states of america (usa)',
       'Concert Goer Country of Origin_canada',
       'Concert Goer Country of Origin_other',
       'Concert Goer Country of Origin_united kingdom (uk)',
       'Concert Goer Country of Origin_united states of america (usa)'],
      dtype='object', length=101)

In [324]:
# Give the three matrices exactly the same columns in the same (sorted) order.
# The training columns are the reference, because the scaler and the model are
# fitted on them.
#
# X_test used to be reindexed on its OWN sorted columns, so it was never
# aligned with the training matrix. Evaluation columns absent from its own
# encoding were also created as NaN. Both sets are now reindexed on the
# training columns, and an indicator column that is missing (category never
# seen in that set) is filled with 0. Columns that exist only in eval/test are
# dropped.
train_columns = sorted(X_train.columns)
X_train = X_train.reindex(train_columns, axis=1)
X_eval = X_eval.reindex(train_columns, axis=1, fill_value=0)
X_test = X_test.reindex(train_columns, axis=1, fill_value=0)

In [325]:
# Check that the evaluation and test matrices have the same columns, in the
# same order, as the training matrix. Nothing is printed when everything lines up.
# The original check only looked at X_eval and raised IndexError when X_eval
# had fewer columns than X_train.
for frame_name, frame in (('X_eval', X_eval), ('X_test', X_test)):
    if len(frame.columns) != len(X_train.columns):
        print(frame_name, 'has', len(frame.columns), 'columns, expected', len(X_train.columns))
    for expected, actual in zip(X_train.columns.values, frame.columns.values):
        if expected != actual:
            print(frame_name, 'expected', expected, 'found', actual)

In [326]:
# Display the encoded training matrix.
X_train

Unnamed: 0,Band Country of Origin_canada,Band Country of Origin_united kingdom (uk),Band Country of Origin_united states of america (usa),Band Debut,Band Genre_country,Band Genre_heavy metal,Band Genre_hip hop/rap,Band Genre_indie/alt rock,Band Genre_pop music,Band Genre_rnb,...,Concert ID_other,Height (cm),Inside Venue,Out n Rain,Personnality Trait 1,Personnality Trait 2,Personnality Trait 3,Personnality Trait 4,Rain,Seated
0,0,0,1,1970,0,0,0,1,0,0,...,1,140.0,0,0,0.330843,-0.958408,-0.943548,-1.636806,0,0
1,0,0,1,1960,0,0,0,0,1,0,...,1,158.0,1,0,-2.069449,0.017777,-1.910675,0.610265,0,1
2,1,0,0,1950,0,0,0,0,0,0,...,1,159.0,0,0,-0.484268,1.968772,-0.064167,-1.260871,0,1
3,0,0,1,1990,0,0,0,0,0,1,...,1,150.0,0,1,-0.858054,1.022827,-0.348389,-1.147251,1,0
4,0,0,1,1960,0,0,0,0,1,0,...,1,166.0,0,0,-0.793029,-1.166528,-0.043766,0.969661,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161495,0,0,1,1960,0,0,0,0,1,0,...,1,182.0,0,0,0.586789,-0.288211,-0.746254,-1.19351,0,0
161496,0,0,1,1970,0,0,0,0,0,0,...,1,154.0,1,0,0.659164,-1.074516,0.855201,2.426957,0,0
161497,0,1,0,1970,0,0,0,0,1,0,...,1,167.0,1,0,-0.208026,-1.716854,1.460769,0.166714,0,1
161498,0,0,1,1950,0,0,1,0,0,0,...,1,167.0,1,0,-0.270337,-3.120069,0.552467,0.000186,1,0


# Scale

In [327]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [328]:
# Min-max scale the features. The scaler is fitted on the training matrix only
# and the same fitted scaler is then applied to all three matrices.
sc = MinMaxScaler().fit(X_train)
X_train, X_eval, X_test = [sc.transform(matrix) for matrix in (X_train, X_eval, X_test)]

# Predictions

In [329]:
from sklearn.neighbors import KNeighborsClassifier

In [330]:
# k-nearest-neighbours classifier: 27 neighbours with equal vote weights,
# Minkowski distance with p=1, searched with a k-d tree.
knn = KNeighborsClassifier(
    algorithm='kd_tree',
    n_neighbors=27,
    p=1,
    weights='uniform',
)

In [331]:
from sklearn.preprocessing import LabelEncoder

In [332]:
# Encode the enjoyment labels as integers. The encoder is fitted on the
# training labels and reused for the evaluation labels.
le = LabelEncoder()
le.fit(y_train)
y_train_encoded, y_eval_encoded = le.transform(y_train), le.transform(y_eval)

In [333]:
# Fit the classifier on the scaled training matrix and the encoded labels.
knn.fit(X_train, y_train_encoded)

In [334]:
# Predict the encoded label of every evaluation row.
y_eval_pred = knn.predict(X_eval)

In [335]:
# Display the predicted (encoded) evaluation labels.
y_eval_pred

array([1, 1, 3, ..., 2, 1, 2])

In [336]:
from sklearn.metrics import accuracy_score, f1_score

# F1 score of each class on the evaluation set (average=None returns one score
# per class). NOTE: the variable is named `acc` but holds F1 scores.
acc = f1_score(y_true=y_eval_encoded, y_pred=y_eval_pred, average=None)
print(acc)

[0.33869602 0.67425083 0.67361778 0.45688351]


In [337]:
# Micro-averaged F1 score on the evaluation set, as a single number.
acc = f1_score(y_true=y_eval_encoded, y_pred=y_eval_pred, average='micro')
print(acc)

0.6337647058823529


In [338]:
from sklearn.metrics import classification_report

# Print the label -> code mapping first, so the integer classes in the report
# below can be read back as enjoyment labels.
print({label: code for label, code in zip(le.classes_, le.transform(le.classes_))})
print(classification_report(y_true=y_eval_encoded, y_pred=y_eval_pred))

{'Best Concert Ever': 0, 'Did Not Enjoy': 1, 'Enjoyed': 2, 'Worst Concert Ever': 3}
              precision    recall  f1-score   support

           0       0.56      0.24      0.34       827
           1       0.64      0.72      0.67      3387
           2       0.63      0.72      0.67      3409
           3       0.68      0.34      0.46       877

    accuracy                           0.63      8500
   macro avg       0.63      0.51      0.54      8500
weighted avg       0.63      0.63      0.62      8500



# Test Predictions

In [339]:
# Predict the encoded label of every test row.
y_pred_test = knn.predict(X_test)

In [340]:
# Turn the integer predictions back into the original enjoyment labels.
y_pred_test = le.inverse_transform(y_pred_test)

In [341]:
# Display the decoded test predictions.
y_pred_test

array(['Best Concert Ever', 'Enjoyed', 'Enjoyed', ..., 'Enjoyed',
       'Did Not Enjoy', 'Worst Concert Ever'], dtype=object)

In [342]:
# Display the test identifiers that were set aside before dropping 'Id'.
ids_test.values

array(['ConcertExperience_70055', 'ConcertExperience_34799',
       'ConcertExperience_100410', ..., 'ConcertExperience_197434',
       'ConcertExperience_166029', 'ConcertExperience_24025'],
      dtype=object)

In [343]:
# Submission table: one row per test identifier with its predicted label.
df_soumission = pd.DataFrame(
    {
        'Id': ids_test.values,
        'Predicted': y_pred_test,
    }
)

In [346]:
# Write the submission file, without the DataFrame index column.
df_soumission.to_csv('Soumission3.csv', index=False)