1. Get train data and test data
2. Split train data (train, eval) right away to simulate train vs test data
3. Handle missing values (NaN and the '*insert*' placeholder)
4. Handle outliers

In [1]:
import numpy as np
import pandas as pd
import copy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import os
from scipy import stats
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the labelled training set and the unlabelled test set, then show their shapes.
df_train, df_test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
for frame in (df_train, df_test):
    print(frame.shape)

(170000, 19)
(30000, 18)


In [3]:
# Hold out the last 20% of the labelled rows as an evaluation set (no shuffling,
# same cut point as before).
# Positional slicing replaces np.split(DataFrame), which recent pandas releases
# flag as deprecated; .copy() gives two independent frames so the column
# assignments done in later cells do not act on slices of the original frame.
split_at = int(0.80 * len(df_train))
df_train, df_eval = df_train.iloc[:split_at].copy(), df_train.iloc[split_at:].copy()
print(df_train.shape)
print(df_eval.shape)

(136000, 19)
(34000, 19)


# To Lower and To String

In [4]:
def toLower(a):
    """Return ``a`` lower-cased, leaving missing values untouched.

    ``None`` and NaN are returned as-is; any other value must provide a
    ``lower()`` method (i.e. be a string).
    """
    # NaN is the only value that is not equal to itself.
    if a is None or a != a:
        return a
    return a.lower()

def toString(a):
    """Return ``a`` as the string of its integer part, leaving missing values untouched.

    ``None`` and NaN are returned as-is. Other values are truncated with
    ``int()`` before being converted, so ``12.0`` becomes ``"12"``.
    """
    # NaN is the only value that is not equal to itself.
    if a is None or a != a:
        return a
    return str(int(a))

# Lower-case the free-text columns on the train, eval and test frames alike.
for c in ["Band Country of Origin","Band Name","Band Genre","Concert Goer Country of Origin"]:
    for frame in (df_train, df_eval, df_test):
        frame[c] = frame[c].map(toLower)

# Turn the concert identifier into a string on the three frames.
for c in ['Concert ID']:
    for frame in (df_train, df_eval, df_test):
        frame[c] = frame[c].map(toString)

# By Decile

In [5]:
def groupByDecile(a):
    """Bucket a numeric value by decade and return the bucket as a string.

    ``None`` and NaN are returned unchanged. Values below 20 all map to
    ``"16"`` so that the lowest age bucket starts at 16 instead of 10.
    Any other value is rounded down to a multiple of ten, e.g.
    ``1976 -> "1970"``.
    """
    # NaN is the only value that is not equal to itself.
    if a is None or a != a:
        return a
    if a < 20:
        return str(16)
    # Example: 1976 - 1976 % 10 = 1970
    return str(int(a - a % 10))

# Replace the raw year / age values by their decade bucket on every frame.
for c in ["Band Debut","Concert Goer Age"]:
    for frame in (df_train, df_eval, df_test):
        frame[c] = frame[c].map(groupByDecile)

# Handle Missing values

In [6]:
# Missing values are nan or '*insert*'
# Want to replace by most popular
from sklearn.impute import SimpleImputer
# Only NaN entries are targeted here; each is replaced by the most frequent
# value of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Separate features and target so that the target column is never imputed.
X_train = df_train.drop(columns=['Concert Enjoyment'])
y_train = df_train['Concert Enjoyment']

X_eval = df_eval.drop(columns=['Concert Enjoyment'])
y_eval = df_eval['Concert Enjoyment']

# Keep the row labels: they are re-attached when the frames are rebuilt below.
idx_train = X_train.index
idx_eval = X_eval.index

# The imputer is fitted on the training features only.
imp.fit(X_train)

# Keep the column names: they are re-attached when the frames are rebuilt below.
cols = X_train.columns.values

X_train = imp.transform(X_train)
X_eval = imp.transform(X_eval)
X_test = imp.transform(df_test)

# For a submission, train on the WHOLE training set.
# Rebuild the frames with their original labels and put the target back.
# NOTE(review): the rebuilt columns presumably all come back as object dtype
# because the imputer output mixes strings and numbers - confirm.
df_train = pd.concat([pd.DataFrame(X_train, columns=cols, index=idx_train), y_train], axis=1)
df_eval = pd.concat([pd.DataFrame(X_eval, columns=cols, index=idx_eval), y_eval], axis=1)
df_test = pd.DataFrame(X_test, columns=cols)


# To Int

In [7]:
# Cast the listed indicator columns to integers on every frame
# (the test frame needs the same cast when producing a submission).
cols_to_int = ['Inside Venue','Rain','Seated']
for c in cols_to_int:
    for frame in (df_train, df_eval, df_test):
        frame[c] = frame[c].astype(int)

In [8]:
# Replace the 'insert' placeholder by the most frequent genuine value of the
# column. The replacement value is computed once, on the training frame only,
# and reused for eval and test (for a submission everything relies on the
# train mode), so the three frames are treated consistently.
cols_with_insert = ['Band Name', 'Band Genre', 'Band Country of Origin','Concert Goer Country of Origin']
for c in cols_with_insert:
    train_mask = df_train[c].str.contains('insert')
    # Exclude the placeholder rows so the placeholder can never be chosen as
    # its own replacement.
    fill_value = df_train.loc[~train_mask, c].mode()[0]
    df_train.loc[train_mask, c] = fill_value
    df_eval.loc[df_eval[c].str.contains('insert'), c] = fill_value
    df_test.loc[df_test[c].str.contains('insert'), c] = fill_value

# Handle Outliers

In [9]:
def convertConcertAttendance(a):
    """Scale down attendance values whose integer part exceeds 162754.

    Such values are divided by 10000; every other value is returned unchanged.
    """
    return a / 10000 if int(a) > 162754 else a

# Apply the attendance correction to train, eval and test
# (the test frame is needed for a submission).
for frame in (df_train, df_eval, df_test):
    frame['Concert Attendance'] = frame['Concert Attendance'].map(convertConcertAttendance)

In [10]:
# Replace out-of-range values of 'Personnality Trait 2' (below -5 or above 5)
# by the training mean. The mean is computed once, from the in-range training
# rows only, so that the outliers do not distort it and train, eval and test
# all receive exactly the same replacement value.
trait = 'Personnality Trait 2'
train_outliers = (df_train[trait] < -5) | (df_train[trait] > 5)
trait_mean = df_train.loc[~train_outliers, trait].mean()

# The test frame is included because a submission needs the same treatment.
for frame in (df_train, df_eval, df_test):
    outliers = (frame[trait] < -5) | (frame[trait] > 5)
    frame.loc[outliers, trait] = trait_mean

# Group Data

In [11]:
def groupCountry(a):
    """Keep the three listed countries as-is and map any other value to 'other'."""
    kept = ['canada', 'united states of america (usa)', 'united kingdom (uk)']
    return a if a in kept else 'other'

# Collapse the concert-goer country on train, eval and test
# (the test frame is needed for a submission).
for frame in (df_train, df_eval, df_test):
    frame['Concert Goer Country of Origin'] = frame['Concert Goer Country of Origin'].map(groupCountry)

# Drop Cols

In [12]:
cols_to_drop = ['Id', 'Concert ID', 'Concert Goer ID']
# Keep the test ids before they are dropped; the submission file needs them.
ids_test = df_test['Id']
# Remove the identifier columns from the three frames.
df_train, df_eval, df_test = (
    frame.drop(columns=cols_to_drop) for frame in (df_train, df_eval, df_test)
)

# Balance Dataset

from sklearn.metrics import DistanceMetric
from imblearn.over_sampling import SMOTENC

df_train['Concert Enjoyment'].value_counts()

# Under-sample the two listed classes by dropping a random quarter of each.
# A fixed random_state makes the dropped rows - and therefore everything
# computed afterwards - reproducible from one run to the next.
for label in ('Enjoyed', 'Did Not Enjoy'):
    df_train = df_train.drop(
        df_train[
            df_train['Concert Enjoyment'] == label
        ].sample(frac=0.25, random_state=0).index)

df_train['Concert Enjoyment'].value_counts()

# Target number of rows per class handed to SMOTENC.
# NOTE(review): these counts are hard-coded - confirm they still match the
# class sizes left after the under-sampling above.
nb_class_dict = {
    'Did Not Enjoy':40783,
    'Enjoyed':40769,
    'Worst Concert Ever':26000,
    'Best Concert Ever':26000
}
X = df_train.drop(columns=['Concert Enjoyment'])
y = df_train['Concert Enjoyment']
cols = X.columns
# Public select_dtypes() instead of the private _get_numeric_data().
num_cols = X.select_dtypes(include=['number', 'bool']).columns
# Every non-numeric column is declared categorical, in column order so the
# resulting index list is deterministic.
# NOTE(review): numeric features that are still stored with object dtype at
# this point would be treated as categorical too - verify X.dtypes.
cat_cols = [c for c in cols if c not in num_cols]
cat_cols_idx = [X.columns.get_loc(c) for c in cat_cols]

sm = SMOTENC(random_state=0, sampling_strategy=nb_class_dict, categorical_features=cat_cols_idx)

X_bal, y_bal = sm.fit_resample(X, y)

# Put features and target back into a single training frame.
df_train = pd.concat([X_bal, y_bal], axis=1)

df_train['Concert Enjoyment'].value_counts()

# One Hot

In [13]:
# Columns that are expanded into dummy (one-hot) columns further down.
one_hot_cols = ['Band Name', 'Band Genre', 'Band Country of Origin', 'Concert Goer Country of Origin']

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [15]:
# Separate the feature matrix from the 'Concert Enjoyment' target for train and eval.
target = 'Concert Enjoyment'
X_train, y_train = df_train.drop(columns=[target]), df_train[target]
X_eval, y_eval = df_eval.drop(columns=[target]), df_eval[target]

In [16]:
# Display the columns left in the test frame.
df_test.columns.values

array(['Band Name', 'Band Genre', 'Band Country of Origin', 'Band Debut',
       'Concert Attendance', 'Inside Venue', 'Rain', 'Seated',
       'Personnality Trait 1', 'Personnality Trait 2',
       'Personnality Trait 3', 'Personnality Trait 4', 'Concert Goer Age',
       'Height (cm)', 'Concert Goer Country of Origin'], dtype=object)

In [17]:
# One-hot encode the categorical columns of each feature frame independently.
X_train, X_eval, X_test = (
    pd.get_dummies(frame, columns=one_hot_cols)
    for frame in (X_train, X_eval, df_test)
)

In [18]:
# Display the test columns after one-hot encoding.
X_test.columns

Index(['Band Debut', 'Concert Attendance', 'Inside Venue', 'Rain', 'Seated',
       'Personnality Trait 1', 'Personnality Trait 2', 'Personnality Trait 3',
       'Personnality Trait 4', 'Concert Goer Age', 'Height (cm)',
       'Band Name_a dozen running iced frogs', 'Band Name_allo knickers',
       'Band Name_beyond devon', 'Band Name_big autumn',
       'Band Name_big division', 'Band Name_big frogs dream',
       'Band Name_big joystick', 'Band Name_crazy crazy',
       'Band Name_crazy dream and a pinch of joystick',
       'Band Name_crazy frogs of devon', 'Band Name_crazy joystick cult',
       'Band Name_crazy tooth', 'Band Name_crazyplay',
       'Band Name_das joystick', 'Band Name_devon revival',
       'Band Name_devon thunder thighs', 'Band Name_disciples of devon',
       'Band Name_flight of the knickers', 'Band Name_frogsica',
       'Band Name_it's my joystick', 'Band Name_joystick a dozen',
       'Band Name_joystick army', 'Band Name_joystick attack',
       'Band N

In [19]:
# Give the three matrices exactly the same columns in the same (sorted) order.
# The training columns are the reference for eval AND test: a dummy column
# absent from a frame is added filled with 0 (category not present), and a
# column unknown to the training set is dropped, so the scaler and the model
# always see the layout they were fitted on.
reference_cols = sorted(X_train.columns)
X_train = X_train.reindex(reference_cols, axis=1)
X_eval = X_eval.reindex(reference_cols, axis=1, fill_value=0)
X_test = X_test.reindex(reference_cols, axis=1, fill_value=0)

In [20]:
# Check that train and test have the same columns in the same order.
# Prints every training column that is missing from, or differently placed
# in, the test matrix, then every surplus test column. Nothing is printed
# when both layouts match. Lengths are compared explicitly so that a shorter
# test matrix is reported instead of raising IndexError.
expected_cols = list(X_train.columns.values)
actual_cols = list(X_test.columns.values)
for i, c in enumerate(expected_cols):
    if i >= len(actual_cols) or actual_cols[i] != c:
        print(c)
for c in actual_cols[len(expected_cols):]:
    print(c)

In [21]:
# Display the encoded training matrix.
X_train

Unnamed: 0,Band Country of Origin_canada,Band Country of Origin_united kingdom (uk),Band Country of Origin_united states of america (usa),Band Debut,Band Genre_country,Band Genre_heavy metal,Band Genre_hip hop/rap,Band Genre_indie/alt rock,Band Genre_pop music,Band Genre_rnb,...,Concert Goer Country of Origin_united kingdom (uk),Concert Goer Country of Origin_united states of america (usa),Height (cm),Inside Venue,Personnality Trait 1,Personnality Trait 2,Personnality Trait 3,Personnality Trait 4,Rain,Seated
0,0,0,1,1970,0,0,0,1,0,0,...,0,0,140.0,0,0.330843,-0.958408,-0.943548,-1.636806,0,0
1,0,0,1,1960,0,0,0,0,1,0,...,1,0,158.0,1,-2.069449,0.017777,-1.910675,0.610265,0,1
2,1,0,0,1950,0,0,0,0,0,0,...,0,1,159.0,0,-0.484268,1.968772,-0.064167,-1.260871,0,1
3,0,0,1,1990,0,0,0,0,0,1,...,0,0,150.0,0,-0.858054,1.022827,-0.348389,-1.147251,1,0
4,0,0,1,1960,0,0,0,0,1,0,...,1,0,166.0,0,-0.793029,-1.166528,-0.043766,0.969661,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135995,0,0,1,1960,0,0,0,0,0,1,...,0,1,157.0,1,-0.156783,-1.517616,-1.116404,1.890379,0,0
135996,0,0,1,1990,0,1,0,0,0,0,...,0,0,140.0,0,-0.726809,1.155656,-0.756992,1.371059,1,0
135997,0,0,1,1990,0,1,0,0,0,0,...,0,1,170.0,0,1.172671,-0.298887,0.994301,0.839205,1,1
135998,0,0,1,1980,0,1,0,0,0,0,...,0,1,180.0,0,-0.780775,0.097951,0.295827,-0.381833,1,1


# Scale

In [22]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [23]:
# Fit the min-max scaler on the training matrix only, then apply it to all three.
sc = MinMaxScaler().fit(X_train)
X_train, X_eval, X_test = (sc.transform(part) for part in (X_train, X_eval, X_test))

# Predictions

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
# k-NN classifier: 27 neighbours, uniform votes, Minkowski distance with p=1, kd-tree index.
knn = KNeighborsClassifier(
    n_neighbors=27,
    weights='uniform',
    algorithm='kd_tree',
    p=1,
)

In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
# Learn the label -> integer mapping on the training target, then encode both targets.
le = LabelEncoder()
le.fit(y_train)
y_train_encoded, y_eval_encoded = le.transform(y_train), le.transform(y_eval)

In [28]:
# Fit the classifier on the scaled training matrix and the encoded labels.
knn.fit(X_train, y_train_encoded)

In [29]:
# Predict the (encoded) labels of the evaluation set.
y_eval_pred = knn.predict(X_eval)

In [30]:
# Display the evaluation predictions.
y_eval_pred

array([2, 2, 2, ..., 2, 2, 2])

In [31]:
from sklearn.metrics import accuracy_score, f1_score

# Despite its name, `acc` holds F1 scores here: average=None yields one score per class.
acc = f1_score(y_true=y_eval_encoded, y_pred=y_eval_pred, average=None)
print(acc)

[0.33185373 0.67814937 0.67505643 0.44934261]


In [32]:
# Micro-averaged F1 over all evaluation samples.
acc = f1_score(y_true=y_eval_encoded, y_pred=y_eval_pred, average='micro')
print(acc)

0.6353235294117647


In [None]:
from sklearn.model_selection import RandomizedSearchCV, ShuffleSplit

# Tune the k-NN hyper-parameters on train + eval put together.
X = np.concatenate([X_train, X_eval])
# Use the integer-encoded labels, like the first model, so that predictions of
# the tuned model can be decoded with `le.inverse_transform` afterwards.
y = np.concatenate([y_train_encoded, y_eval_encoded])

param_grid = {
    'n_neighbors' : [21, 23, 25, 27],
    'weights' : ['uniform', 'distance'],
    'algorithm' : ['ball_tree', 'kd_tree'],
    'p' : [1,2]
}

# 10 random parameter combinations, each scored with micro-F1 on a single
# shuffled hold-out split.
rand = RandomizedSearchCV(KNeighborsClassifier(), param_grid, cv=ShuffleSplit(n_splits=1), scoring='f1_micro', n_iter=10, random_state=1, verbose=10)

rand.fit(X, y)

# The search fits clones of the estimator it is given, so that estimator
# itself is never fitted. Bind `knn` to the best model, which the search has
# refitted on all of X (refit=True is the default), so that the prediction
# cells below use a fitted classifier.
knn = rand.best_estimator_

# Test Predictions

In [None]:
# Predict the (encoded) labels of the test set.
# NOTE(review): `knn` must be a fitted estimator at this point - confirm which
# cell assigned it last before running this.
y_pred_test = knn.predict(X_test)

In [None]:
# Map the integer predictions back to the original label strings.
y_pred_test = le.inverse_transform(y_pred_test)

In [None]:
# Display the decoded test predictions.
y_pred_test

In [None]:
# Display the test ids that were saved before the id columns were dropped.
ids_test.values

In [None]:
# Build the submission table: one row per test id with its predicted label.
df_soumission = pd.DataFrame({'Id' : ids_test.values, 'Predicted' : y_pred_test})

In [None]:
# Write the submission to disk without the DataFrame index column.
df_soumission.to_csv('Soumission2.csv', index=False)