1. Get train data and test data
2. Split train data (train, eval) right away to simulate train vs test data
3. Handle missing values (na and 'insert')
4. Handle outliers


In [None]:
%pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import copy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import os
from scipy import stats
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the training and test CSV files from the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
# Print (rows, columns) of each frame as a quick sanity check.
for loaded in (df_train, df_test):
    print(loaded.shape)

(170000, 19)
(30000, 18)


In [None]:

def convertToNum(a):
    """Map a 'Concert Enjoyment' label to an ordinal code.

    Returns 0 for 'Worst Concert Ever', 1 for 'Did Not Enjoy', 2 for 'Enjoyed'
    and 3 for every other value.
    """
    ordered_labels = ('Worst Concert Ever', 'Did Not Enjoy', 'Enjoyed')
    for code, label in enumerate(ordered_labels):
        if a == label:
            return code
    # Anything not matched above gets the highest code.
    return 3
    

#df_train['Num Concert Enjoyment'] = df_train['Concert Enjoyment'].replace(
#    ['Worst Concert Ever', 'Did Not Enjoy', 'Enjoyed', 'Best Concert Ever'], 
#    [0,1,2,3]
#)

In [None]:
# Hold out the last 15% of the training rows as an evaluation set (no shuffling,
# same cut point as before). Positional slicing replaces np.split, which on a
# DataFrame goes through the deprecated DataFrame.swapaxes.
split_at = int(0.85*len(df_train))
df_train, df_eval = df_train.iloc[:split_at], df_train.iloc[split_at:]
print(df_train.shape)
print(df_eval.shape)

(144500, 19)
(25500, 19)


# To Lower and To String

In [None]:
def toLower(a):
    """Return `a` lower-cased; None and NaN are returned unchanged.

    Any other value must provide a `.lower()` method (e.g. a str).
    """
    # `a != a` is only true for NaN, so this guard covers both kinds of missing value.
    if a is None or a != a:
        return a
    return a.lower()

def toString(a):
    """Return `a` as the string of its integer value; None and NaN pass through."""
    # `a != a` is only true for NaN.
    if a is None or a != a:
        return a
    # Going through int drops any fractional part, e.g. 626.0 -> '626'.
    return str(int(a))

# Lower-case the text columns in every split.
text_cols = ["Band Country of Origin","Band Name","Band Genre","Concert Goer Country of Origin"]
for frame in (df_train, df_eval, df_test):
    for c in text_cols:
        frame[c] = frame[c].map(toLower)

# Store the concert identifier as a string in every split.
for frame in (df_train, df_eval, df_test):
    for c in ['Concert ID']:
        frame[c] = frame[c].map(toString)

# By Decade

In [None]:
def groupByDecile(a):
    """Bucket a numeric value into its decade and return the bucket as a string.

    None and NaN pass through unchanged. Values below 20 are all labelled '16';
    every other value is rounded down to a multiple of 10 (1976 -> '1970').
    """
    # `a != a` is only true for NaN.
    if a is None or a != a:
        return a
    if a < 20:
        # Makes the youngest age bracket start at 16 instead of 10.
        return str(16)
    # Example: 1976 - 1976 % 10 = 1970
    return str(int(a - a % 10))

# Bucket the band debut year and the concert goer's age in every split.
decade_cols = ["Band Debut","Concert Goer Age"]
for frame in (df_train, df_eval, df_test):
    for c in decade_cols:
        frame[c] = frame[c].map(groupByDecile)

# Handle Missing values

In [None]:
# Missing values are nan or '*insert*'.
# This cell fills the NaN ones with the most frequent value of each column,
# learned from the training split only.
from sklearn.impute import SimpleImputer

# Separate the target from the features of both labelled splits.
X_train = df_train.drop(columns=['Concert Enjoyment'])
X_eval = df_eval.drop(columns=['Concert Enjoyment'])
y_train = df_train['Concert Enjoyment']
y_eval = df_eval['Concert Enjoyment']

# Remember the row labels and column names: transform() returns bare arrays.
idx_train, idx_eval = X_train.index, X_eval.index
cols = X_train.columns.values

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp.fit(X_train)

X_train, X_eval, X_test = (
    imp.transform(part) for part in (X_train, X_eval, df_test)
)

# For the submission, training is done on the WHOLE training set.
df_train = pd.concat(
    [pd.DataFrame(X_train, columns=cols, index=idx_train), y_train], axis=1
)
df_eval = pd.concat(
    [pd.DataFrame(X_eval, columns=cols, index=idx_eval), y_eval], axis=1
)
df_test = pd.DataFrame(X_test, columns=cols)


# To Int

In [None]:
# Cast these columns to int in every split (the test frame too, for the submission).
cols_to_int = ['Inside Venue','Rain','Seated']
for frame in (df_train, df_eval, df_test):
    for c in cols_to_int:
        frame[c] = frame[c].astype(int)

In [None]:
# Replace the 'insert' placeholder in the text columns with the most frequent
# real value of the TRAINING split. The same value is used for train, eval and
# test, so no statistic is taken from the evaluation or test data.
cols_with_insert = ['Band Name', 'Band Genre', 'Band Country of Origin','Concert Goer Country of Origin']
for c in cols_with_insert:
    # The mode is computed once, on rows that are not placeholders themselves.
    train_is_placeholder = df_train[c].str.contains('insert', regex=False, na=False)
    train_mode = df_train.loc[~train_is_placeholder, c].mode()[0]
    for frame in (df_train, df_eval, df_test):
        is_placeholder = frame[c].str.contains('insert', regex=False, na=False)
        frame.loc[is_placeholder, c] = train_mode

# Handle Outliers

In [None]:
def convertConcertAttendance(a):
    """Scale down attendance values whose integer part exceeds 162754.

    Such values are divided by 10000; every other value is returned unchanged.
    """
    return a / 10000 if int(a) > 162754 else a

# Apply the attendance correction to every split (the test frame too, for the submission).
for frame in (df_train, df_eval, df_test):
    frame['Concert Attendance'] = frame['Concert Attendance'].map(convertConcertAttendance)

In [None]:
# Replace out-of-range values of 'Personnality Trait 2' (below -5 or above 5)
# with one shared fill value: the mean of the in-range TRAINING values.
# Computing it once, without the outliers, gives all three splits the same value
# and keeps the outliers from distorting the replacement.
trait_col = 'Personnality Trait 2'
train_outliers = (df_train[trait_col] < -5) | (df_train[trait_col] > 5)
trait_fill = df_train.loc[~train_outliers, trait_col].mean()

# For the submission the same replacement is applied to df_test.
for frame in (df_train, df_eval, df_test):
    frame_outliers = (frame[trait_col] < -5) | (frame[trait_col] > 5)
    frame.loc[frame_outliers, trait_col] = trait_fill

# Group Data

In [None]:
def groupCountry(a):
    """Keep the three listed countries as-is and collapse every other value to 'other'."""
    kept_countries = ('canada', 'united states of america (usa)', 'united kingdom (uk)')
    return a if a in kept_countries else 'other'

# Group the concert goer's country in every split (the test frame too, for the submission).
for frame in (df_train, df_eval, df_test):
    frame['Concert Goer Country of Origin'] = frame['Concert Goer Country of Origin'].map(groupCountry)

# Keep Important Concert IDs

In [None]:
# Frequency of each concert identifier, printed for train, test and eval in turn.
for frame in (df_train, df_test, df_eval):
    print(frame['Concert ID'].value_counts())

626    895
537    162
717    162
289    162
71     161
      ... 
627    127
450    126
918    126
997    125
287    123
Name: Concert ID, Length: 1000, dtype: int64
626    183
342     46
997     45
615     45
101     45
      ... 
159     18
576     18
773     17
464     17
136     16
Name: Concert ID, Length: 1000, dtype: int64
626    150
117     45
847     41
794     40
627     38
      ... 
738     14
513     14
534     14
289     14
537     13
Name: Concert ID, Length: 1000, dtype: int64


In [None]:
def convertConcertID(a):
    """Keep the ten selected concert IDs (as strings) and map every other ID to 'other'."""
    # The 5 best and the 5 worst average ratings among the indoor concerts.
    kept_ids = (879, 539, 456, 961, 592, 978, 441, 193, 24, 164)
    return str(a) if int(a) in kept_ids else 'other'
    
# Collapse the concert identifiers in every split.
for frame in (df_train, df_test, df_eval):
    frame['Concert ID'] = frame['Concert ID'].map(convertConcertID)

# Keep Important Concert Goer IDs

In [None]:
# Frequency of each concert goer identifier, printed for train, test and eval in turn.
for frame in (df_train, df_test, df_eval):
    print(frame['Concert Goer ID'].value_counts())

concert_goer_1502    812
concert_goer_1501    100
concert_goer_1415     98
concert_goer_255      98
concert_goer_559      97
                    ... 
concert_goer_1737     50
concert_goer_903      49
concert_goer_639      49
concert_goer_1081     47
concert_goer_861      44
Name: Concert Goer ID, Length: 2000, dtype: int64
concert_goer_1502    159
concert_goer_1833     29
concert_goer_1722     27
concert_goer_1193     27
concert_goer_854      27
                    ... 
concert_goer_553       5
concert_goer_1737      5
concert_goer_1643      4
concert_goer_864       4
concert_goer_112       3
Name: Concert Goer ID, Length: 2000, dtype: int64
concert_goer_1502    112
concert_goer_202      25
concert_goer_969      25
concert_goer_1351     25
concert_goer_1931     24
                    ... 
concert_goer_1910      4
concert_goer_1413      4
concert_goer_185       4
concert_goer_1408      4
concert_goer_1467      3
Name: Concert Goer ID, Length: 2000, dtype: int64


In [None]:
def convertConcertGoerID(a):
    """Keep the ten selected concert goer IDs and map every other value to 'other'."""
    kept_goers = (
        'concert_goer_695', 'concert_goer_413', 'concert_goer_1392',
        'concert_goer_398', 'concert_goer_1770', 'concert_goer_1011',
        'concert_goer_1414', 'concert_goer_1419', 'concert_goer_1506',
        'concert_goer_390',
    )
    as_text = str(a)
    return as_text if as_text in kept_goers else 'other'
    
# Collapse the concert goer identifiers in every split.
for frame in (df_train, df_test, df_eval):
    frame['Concert Goer ID'] = frame['Concert Goer ID'].map(convertConcertGoerID)

# Create New Feature

In [None]:
# 'Out n Rain' is 1 when 'Inside Venue' is 0 and 'Rain' is 1, otherwise 0.
for frame in (df_train, df_test, df_eval):
    not_inside = frame['Inside Venue'] == 0
    raining = frame['Rain'] == 1
    frame['Out n Rain'] = (not_inside & raining).astype(int)

# Drop Cols

In [None]:
# List the test columns before any are dropped.
df_test.columns

Index(['Id', 'Band Name', 'Band Genre', 'Band Country of Origin', 'Band Debut',
       'Concert ID', 'Concert Attendance', 'Inside Venue', 'Rain', 'Seated',
       'Personnality Trait 1', 'Personnality Trait 2', 'Personnality Trait 3',
       'Personnality Trait 4', 'Concert Goer Age', 'Concert Goer ID',
       'Height (cm)', 'Concert Goer Country of Origin', 'Out n Rain'],
      dtype='object')

In [None]:
# Keep the test identifiers for the submission file before the column is removed.
ids_test = df_test['Id']

# Remove the row identifier from every split.
cols_to_drop = ['Id']
df_train, df_eval, df_test = (
    frame.drop(columns=cols_to_drop) for frame in (df_train, df_eval, df_test)
)

# Balance Dataset

from sklearn.metrics import DistanceMetric
from imblearn.over_sampling import SMOTENC

df_train['Concert Enjoyment'].value_counts()

# Drop a random 25% of the rows of each of these two classes. A fixed
# random_state makes the dropped rows, and therefore every later result,
# reproducible from one run to the next.
for label_to_thin in ('Enjoyed', 'Did Not Enjoy'):
    dropped_idx = df_train[
        df_train['Concert Enjoyment'] == label_to_thin
    ].sample(frac=0.25, random_state=0).index
    df_train = df_train.drop(dropped_idx)

df_train['Concert Enjoyment'].value_counts()

# Number of rows per class requested from SMOTENC (passed as sampling_strategy).
nb_class_dict = {
    'Did Not Enjoy':40783,
    'Enjoyed':40769,
    'Worst Concert Ever':26000,
    'Best Concert Ever':26000
}
X = df_train.drop(columns=['Concert Enjoyment'])
y = df_train['Concert Enjoyment']
cols = X.columns
# NOTE(review): _get_numeric_data is a private pandas method; kept so the
# numeric/categorical partition stays exactly what it was.
num_cols = X._get_numeric_data().columns
# Walk the columns in frame order so the categorical list is the same on every
# run (a set difference has no defined order).
cat_cols = [col for col in cols if col not in num_cols]
cat_cols_idx = [X.columns.get_loc(col) for col in cat_cols]

sm = SMOTENC(random_state=0, sampling_strategy=nb_class_dict, categorical_features=cat_cols_idx)

X_bal, y_bal = sm.fit_resample(X, y)

df_train = pd.concat([X_bal, y_bal], axis=1)

df_train['Concert Enjoyment'].value_counts()

# One Hot

In [None]:
# Columns passed to pd.get_dummies for one-hot encoding.
one_hot_cols = ['Band Name', 'Concert Goer ID', 'Concert ID', 'Band Genre', 'Band Country of Origin', 'Concert Goer Country of Origin']

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [None]:
# Target vectors of the two labelled splits.
y_train, y_eval = df_train['Concert Enjoyment'], df_eval['Concert Enjoyment']

# Feature frames: everything except the target column.
X_train, X_eval = (
    frame.drop(columns=['Concert Enjoyment']) for frame in (df_train, df_eval)
)

In [None]:
# Column names of the test frame at this point.
df_test.columns.values

array(['Band Name', 'Band Genre', 'Band Country of Origin', 'Band Debut',
       'Concert ID', 'Concert Attendance', 'Inside Venue', 'Rain',
       'Seated', 'Personnality Trait 1', 'Personnality Trait 2',
       'Personnality Trait 3', 'Personnality Trait 4', 'Concert Goer Age',
       'Concert Goer ID', 'Height (cm)', 'Concert Goer Country of Origin',
       'Out n Rain'], dtype=object)

In [None]:
# One-hot encode the categorical columns of each split independently.
X_train, X_eval, X_test = (
    pd.get_dummies(frame, columns = one_hot_cols)
    for frame in (X_train, X_eval, df_test)
)

In [None]:
# Column names of the test frame after one-hot encoding.
X_test.columns

Index(['Band Debut', 'Concert Attendance', 'Inside Venue', 'Rain', 'Seated',
       'Personnality Trait 1', 'Personnality Trait 2', 'Personnality Trait 3',
       'Personnality Trait 4', 'Concert Goer Age',
       ...
       'Band Genre_pop music', 'Band Genre_rnb', 'Band Genre_rock n roll',
       'Band Country of Origin_canada',
       'Band Country of Origin_united kingdom (uk)',
       'Band Country of Origin_united states of america (usa)',
       'Concert Goer Country of Origin_canada',
       'Concert Goer Country of Origin_other',
       'Concert Goer Country of Origin_united kingdom (uk)',
       'Concert Goer Country of Origin_united states of america (usa)'],
      dtype='object', length=101)

In [None]:
# Put the three matrices in the same column layout: the sorted TRAINING columns.
# Each split was one-hot encoded separately, so eval and test are reindexed
# against the training layout (not their own): a dummy column they lack is added
# as all zeros, and one that training never saw is dropped.
feature_order = sorted(X_train.columns)
X_train = X_train.reindex(feature_order, axis=1)
X_eval = X_eval.reindex(feature_order, axis=1, fill_value=0)
X_test = X_test.reindex(feature_order, axis=1, fill_value=0)

In [None]:
# Check that the train and eval columns are in the same order: print every
# training column whose counterpart at the same position in X_eval differs.
for i , c in enumerate(X_train.columns.values):
    if X_eval.columns.values[i] != c:
        print(c)

In [None]:
# Keep the training column names under a separate name.
colonnesXTrain = X_train.columns

# Scale

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Fit the scaler on the training features only, then apply it to every split.
sc = MinMaxScaler()
sc.fit(X_train)

X_train, X_eval, X_test = (
    sc.transform(part) for part in (X_train, X_eval, X_test)
)

In [None]:
# Display the training matrix produced by the scaler.
X_train[:,:]

array([[0.        , 0.        , 1.        , ..., 0.23192147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.54757852, 0.        ,
        1.        ],
       [1.        , 0.        , 0.        , ..., 0.2847309 , 0.        ,
        1.        ],
       ...,
       [0.        , 0.        , 1.        , ..., 0.60023167, 1.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.64166392, 1.        ,
        1.        ],
       [0.        , 0.        , 1.        , ..., 0.43430479, 1.        ,
        0.        ]])

# Predictions

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Earlier configurations, kept with the scores noted next to them:
#cat = CatBoostClassifier(random_state=1,silent=True, l2_leaf_reg= 3, learning_rate= 0.15) train  0.6983114186851211    eval  0.6637254901960784
#cat = CatBoostClassifier(random_state=1,silent=True, l2_leaf_reg= 3, learning_rate= 0.1)    # 0.6625098039215687
# Configuration in use: only the random seed is set.
cat = CatBoostClassifier(random_state=25)  


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Learn the label -> integer mapping from the training target, then encode both targets.
le = LabelEncoder()
le.fit(y_train)
y_train_encoded, y_eval_encoded = (
    le.transform(target) for target in (y_train, y_eval)
)

In [None]:
# Train the classifier on the training matrix and the encoded labels;
# silent=True and plot=True are passed to fit.
%matplotlib inline
cat.fit(X_train, y_train_encoded,silent=True,plot=True)

In [None]:
# Predictions on the evaluation and training matrices, used for the metrics below.
y_eval_pred, y_train_pred = (cat.predict(features) for features in (X_eval, X_train))


In [None]:
from sklearn.metrics import accuracy_score, f1_score,confusion_matrix

# F1 score with average=None, printed for the training and the evaluation split.
for split_tag, truth, predicted in (
    ("train ", y_train_encoded, y_train_pred),
    ("eval ", y_eval_encoded, y_eval_pred),
):
    acc = f1_score(truth, predicted, average=None)
    print(split_tag, acc)

train  [0.44069431 0.72181641 0.71942029 0.54861384]
eval  [0.39754327 0.7011505  0.7030209  0.51091703]


In [None]:
# F1 score with average='micro', printed for the training and the evaluation split.
for split_tag, truth, predicted in (
    ("train ", y_train_encoded, y_train_pred),
    ("eval ", y_eval_encoded, y_eval_pred),
):
    acc = f1_score(truth, predicted, average='micro')
    print(split_tag, acc)

train  0.6868858131487889
eval  0.6652549019607843


In [None]:
from sklearn.metrics import classification_report

# Print which integer code stands for which label, then the evaluation report.
label_codes = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_codes)
eval_report = classification_report(y_eval_encoded, y_eval_pred)
print(eval_report)

{'Best Concert Ever': 0, 'Did Not Enjoy': 1, 'Enjoyed': 2, 'Worst Concert Ever': 3}
              precision    recall  f1-score   support

           0       0.68      0.28      0.40      2528
           1       0.67      0.74      0.70     10149
           2       0.66      0.75      0.70     10230
           3       0.69      0.41      0.51      2593

    accuracy                           0.67     25500
   macro avg       0.67      0.54      0.58     25500
weighted avg       0.67      0.67      0.65     25500



In [None]:
# Confusion matrix of the encoded evaluation labels against the predictions.
confusion_matrix(y_eval_encoded, y_eval_pred)

array([[ 712,   33, 1783,    0],
       [   6, 7496, 2182,  465],
       [ 336, 2180, 7703,   11],
       [   0, 1524,   16, 1053]])

# Test Predictions

In [None]:
# Predict the encoded class of every test row.
y_pred_test = cat.predict(X_test)

In [None]:
# Turn the integer predictions back into their original labels. The array is
# flattened first: the column_or_1d warning in the recorded output shows it
# reached inverse_transform as a 2-D column vector.
y_pred_test = le.inverse_transform(np.ravel(y_pred_test))

  y = column_or_1d(y, warn=True)


In [None]:
# Display the decoded test predictions.
y_pred_test

array(['Best Concert Ever', 'Enjoyed', 'Enjoyed', ..., 'Enjoyed',
       'Worst Concert Ever', 'Worst Concert Ever'], dtype=object)

In [None]:
# Display the test identifiers saved before the 'Id' column was dropped.
ids_test.values

array(['ConcertExperience_70055', 'ConcertExperience_34799',
       'ConcertExperience_100410', ..., 'ConcertExperience_197434',
       'ConcertExperience_166029', 'ConcertExperience_24025'],
      dtype=object)

In [None]:
# Build the submission table: test identifier next to its predicted label.
df_soumission = pd.DataFrame({'Id' : ids_test.values, 'Predicted' : y_pred_test})

In [None]:
# Write the submission to CSV without the index column.
df_soumission.to_csv('Soumission8.csv', index=False)

In [None]:
# Display the submission table.
df_soumission

Unnamed: 0,Id,Predicted
0,ConcertExperience_70055,Best Concert Ever
1,ConcertExperience_34799,Enjoyed
2,ConcertExperience_100410,Enjoyed
3,ConcertExperience_106446,Enjoyed
4,ConcertExperience_127249,Did Not Enjoy
...,...,...
29995,ConcertExperience_82288,Did Not Enjoy
29996,ConcertExperience_27139,Enjoyed
29997,ConcertExperience_197434,Enjoyed
29998,ConcertExperience_166029,Worst Concert Ever
