# **1 . Set up environment**

First, install all the packages needed

In [41]:
!pip install -q kaggle
from google.colab import files
! rm kaggle.json
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

rm: cannot remove 'kaggle.json': No such file or directory


mkdir: cannot create directory ‘/root/.kaggle’: File exists
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [42]:
!kaggle competitions download -c inf8245e-fall-2022
!unzip inf8245e-fall-2022.zip
!ls

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.7/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 166, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.
unzip:  cannot find or open inf8245e-fall-2022.zip, inf8245e-fall-2022.zip.zip or inf8245e-fall-2022.zip.ZIP.
sample_data  test.csv  train.csv


Import all packages necessary

In [43]:
# Global
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string 

from sklearn.metrics import f1_score, recall_score, accuracy_score, classification_report, confusion_matrix

# Preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

import imblearn

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

Load the train data

In [44]:
data_train = pd.read_csv('train.csv', na_values=['Insert Band Name', 'Insert Band Genre', 'Insert Band Country of Origin'])

In [None]:
print(data_train['Band Name'].unique())
# Set "Insert Band Name" to NaN value
print(data_train['Band Genre'].unique())
#Set "Insert Band Genre" to NaN value
print(data_train['Band Country of Origin'].unique())
#Set "Insert Band Country of Origin" to NaN value
print(data_train['Concert Goer ID'].unique())
#Any not filled value
print(data_train['Concert Goer Country of Origin'].unique())
#Any not filled value

In [None]:
data_train

Load the test data

In [47]:
data_test = pd.read_csv('test.csv', na_values=['Insert Band Name', 'Insert Band Genre', 'Insert Band Country of Origin'])
#data_test

In [None]:
print(data_test['Band Name'].unique())
# Set "Insert Band Name" to NaN value
print(data_test['Band Genre'].unique())
#Set "Insert Band Genre" to NaN value
print(data_test['Band Country of Origin'].unique())
#Set "Insert Band Country of Origin" to NaN value
print(data_test['Concert Goer ID'].unique())
#Any not filled value
print(data_test['Concert Goer Country of Origin'].unique())
#Any not filled value

Defining data variables

In [49]:
X_train = data_train.drop(columns=['Concert Enjoyment', 'Id'])
y_train = data_train['Concert Enjoyment']

X_test = data_test.drop(columns=['Id'])

# **2 . Pre-processing phase**

In [50]:
# Maybe we could shuffle the data but not sure it's useful
# We could try to cut the over represented data to have a balanced dataset

# Split dataset

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


#We could try to balance the training set to prevent overfitting problems
# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html

# https://seaborn.pydata.org/generated/seaborn.heatmap.html

Encode categorical columns with label encoder

In [51]:
enc = LabelEncoder()
categ_features = ['Band Name', 'Band Genre', 'Band Country of Origin', 'Concert Goer Country of Origin','Concert Goer ID']
for feature in categ_features:
  enc.fit(X_train[feature])
  le_dict = dict(zip(enc.classes_, enc.transform(enc.classes_)))
  X_train[feature] = X_train[feature].apply(lambda x: le_dict.get(x, '<unknown_value>'))
  X_test[feature] = X_test[feature].apply(lambda x: le_dict.get(x, '<unknown_value>'))


In [None]:
X_train

One hot encoding to erase the artificial distance created by label encoding -> n'améliore pas les résultats 😢

In [53]:
df=pd.get_dummies(X_train[categ_features])
X_train = pd.concat([df.reset_index(),X_train.drop(columns=categ_features).reset_index()], axis=1)

In [54]:
df=pd.get_dummies(X_test[categ_features])
X_test = pd.concat([df.reset_index(),X_test.drop(columns=categ_features).reset_index()], axis=1)

In [55]:
ohe = OneHotEncoder()
#Training dataset
X1 = X_train[categ_features[:4]].to_numpy()
X1 = ohe.fit_transform(X1).toarray()
new_features1 = [pd.DataFrame(X1),X_train.drop(columns=categ_features[:4]).reset_index()]
X_train = pd.concat(new_features1, axis = 1)
#Test dataset
X2 = X_test[categ_features[:4]].to_numpy()
X2 = ohe.fit_transform(X2).toarray()
new_features2 = [pd.DataFrame(X2),X_test.drop(columns=categ_features[:4]).reset_index()]
X_test = pd.concat(new_features2, axis = 1)


Filtering Nan values for boolean features, by default = 0

In [56]:
for name in ['Inside Venue','Rain','Seated']:
  X_train[name].fillna(0, inplace = True)
  X_test[name].fillna(0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Filtering NaN values for numerical features

In [57]:
for name in ['Band Debut',	'Concert ID',	'Concert Attendance', 'Personnality Trait 1',	'Personnality Trait 2',	\
                        'Personnality Trait 3',	'Personnality Trait 4',	'Concert Goer Age',	'Height (cm)']:
  X_train[name].fillna(X_train[name].mean(), inplace = True)
  X_test[name].fillna(X_test[name].mean(), inplace = True)
# clip outliers https://www.tutorialspoint.com/write-a-python-program-to-trim-the-minimum-and-maximum-threshold-value-in-a-dataframe
#sub2['income'].fillna((sub2['income'].mean()), inplace=True)
#instead of doing this, we could use complete case analysis, which means considering only data without missing data

Filtering Nan values for categorical features, take automatically the previous value in the column

In [66]:
for name in ['Band Name', 'Band Genre', 'Band Country of Origin', 'Concert Goer Country of Origin','Concert Goer ID']:
  X_train[name].fillna(method="ffill", inplace = True)
  X_test[name].fillna(method="ffill", inplace = True)

KeyError: ignored

Get rid of outliers -> ne semble pas apporter d'amélioration


In [59]:
for name in ['Concert Attendance', 	'Personnality Trait 2']:
  X_train[name] = np.clip(X_train[name],X_train[name].mean()-3*X_train[name].std() ,X_train[name].mean()+3*X_train[name].std())
  X_test[name] = np.clip(X_test[name],X_test[name].mean()-3*X_test[name].std() ,X_test[name].mean()+3*X_test[name].std())
#doesn't seem to have an impact on the result

Normalize the data, using StandardScaler or RobustScaler(problème les valeurs ne sont pas entre 0 et 1)

In [60]:
num_features = ['Band Debut',	'Concert ID',	'Concert Attendance', 'Personnality Trait 1',	'Personnality Trait 2',	\
                        'Personnality Trait 3',	'Personnality Trait 4',	'Concert Goer Age',	'Height (cm)']
#sc = RobustScaler(quantile_range = (0.25,0.75))
sc = StandardScaler()
X_train[num_features] = sc.fit_transform(X_train[num_features])

Principle Component Analysis

In [62]:
from sklearn.decomposition import PCA

pca = PCA(n_components=12)

X_train = pca.fit_transform(X_train)



ValueError: ignored

Using SMOTE to balance the data

In [63]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE(k_neighbors=5)
X_train, y_train = oversample.fit_resample(X_train, y_train)



ValueError: ignored

# **3 . Model selection**

Hyperparameter tuning

In [64]:
# https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
# https://scikit-learn.org/stable/modules/model_evaluation.html
# https://towardsdatascience.com/fine-tuning-a-classifier-in-scikit-learn-66e048c21e65
"""
estimators = [
    ('rf', RandomForestClassifier()),
    ('dt', DecisionTreeClassifier()),
    ('svc', SVC())
]

clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
"""
clf = RandomForestClassifier(min_samples_leaf=10 ,n_estimators= 60, max_features='log2' , max_depth=100, random_state=0, n_jobs=-1)

#https://xgboost.readthedocs.io/en/latest/get_started.html

#RF = RandomForestClassifier()
#parameters = {'min_samples_leaf':[1,10,20], 'n_estimators':[10,50,100,150], 'max_features':('log2', 'sqrt'), 'max_depth':[100, 'None']}

#clf = GridSearchCV(RF, parameters, n_jobs=-1)

In [None]:
#Alternate model 

from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(256, input_shape=(17,), activation='relu'))
model.add(Dense(256, input_shape=(17,), activation='relu'))
model.add(Dense(256, input_shape=(17,), activation='relu'))
model.add(Dense(256, input_shape=(17,), activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.fit(X_train,y_train)

# **4 . Training**

Train the model

In [65]:
clf.fit(X_train,y_train)



ValueError: ignored

Report important features

In [None]:
col = list(X_train.columns)

In [None]:
from matplotlib import pyplot

importance = clf.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d %s, Score: %.5f' % (i,col[i],v))
 # plot feature importance
pyplot.figure(figsize=[25.0, 5.0])
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()

# **5 . Compute predictions**

Compute predictions

In [None]:
#Be careful, labelencoder can break if there is some categorical data in the test set that was unseen in the train set :https://stackoverflow.com/questions/21057621/sklearn-labelencoder-with-never-seen-before-values
# For the moment it's ok cause test set is a part of our training set so we perform labelencoder before splitting the set, but it could be an issue when working with a different set for the test

X_test[num_features] = sc.transform(X_test[num_features])
#X_test = pca.transform(X_test)
y_pred = clf.predict(X_test)

# **6 . Export results**

In [1]:
submission = pd.DataFrame(data_test['Id'].copy())
submission['Predicted'] = y_pred
print(submission)
from pathlib import Path  
filepath = Path('./out.csv')  
submission.to_csv(filepath, index=False)
files.download(filepath) 

NameError: name 'pd' is not defined

In [None]:
from pathlib import Path  
filepath = Path('./out.csv')  
submission.to_csv(filepath, index=False)
files.download(filepath) 

Determine some metrics to visualize validation performance

In [None]:
f1 = f1_score(y_test, y_pred, average='micro')
print(confusion_matrix(y_test, y_pred))
print(f1)

In [None]:
f1 = f1_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("f1 score is ", f1)
print("recall score is ", recall)
print("accuracy score is ", accuracy)

print(report)