### MODEL SELECTION

In [1]:
# Libraries 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Importing the dataset
df = pd.read_csv("Train.csv")

In [3]:
# Columns
columns = list(df.columns)

bool_columns = columns[10:17]
bool_columns.append("first_trip_tz")

num_features = [ 'total_female', 'total_male', 'night_mainland','night_zanzibar']

cat_features = ['country','age_group','travel_with','purpose','main_activity','info_source','tour_arrangement']

target = 'cost_category'

In [4]:
# Dropping useless columns

df.drop('Tour_ID',axis=1, inplace =True)

In [5]:
# NAN imputer

df['travel_with'].fillna(df['travel_with'].value_counts().index[0],inplace =True)
df['total_female'].fillna(round(df['total_female'].mean()),inplace =True)
df['total_male'].fillna(round(df['total_male'].mean()),inplace =True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18506 entries, 0 to 18505
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   country                18506 non-null  object 
 1   age_group              18506 non-null  object 
 2   travel_with            18506 non-null  object 
 3   total_female           18506 non-null  float64
 4   total_male             18506 non-null  float64
 5   purpose                18506 non-null  object 
 6   main_activity          18506 non-null  object 
 7   info_source            18506 non-null  object 
 8   tour_arrangement       18506 non-null  object 
 9   package_transport_int  18506 non-null  object 
 10  package_accomodation   18506 non-null  object 
 11  package_food           18506 non-null  object 
 12  package_transport_tz   18506 non-null  object 
 13  package_sightseeing    18506 non-null  object 
 14  package_guided_tour    18506 non-null  object 
 15  pa

In [6]:
# Boolean Handling, transforming boolean to integer 

for col in bool_columns:
	df[col] = df[col].apply(lambda x: 1 if x == "Yes" else 0)
    
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18506 entries, 0 to 18505
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   country                18506 non-null  object 
 1   age_group              18506 non-null  object 
 2   travel_with            18506 non-null  object 
 3   total_female           18506 non-null  float64
 4   total_male             18506 non-null  float64
 5   purpose                18506 non-null  object 
 6   main_activity          18506 non-null  object 
 7   info_source            18506 non-null  object 
 8   tour_arrangement       18506 non-null  object 
 9   package_transport_int  18506 non-null  int64  
 10  package_accomodation   18506 non-null  int64  
 11  package_food           18506 non-null  int64  
 12  package_transport_tz   18506 non-null  int64  
 13  package_sightseeing    18506 non-null  int64  
 14  package_guided_tour    18506 non-null  int64  
 15  pa

In [7]:
# Reduction of categorical columns that have some category with too few -HEAVY PREPROCESSING-
for col in df.columns:
    if df[col].dtype == "object":
        counts = df[col].value_counts()
        print(counts)
        value_below = counts[counts<50]
        df[col] = df[col].apply(lambda x: "Other" if x in list(value_below.index) else x)
        print(df[col].value_counts())



UNITED STATES OF AMERICA    2846
UNITED KINGDOM              2120
ITALY                       1625
FRANCE                      1064
GERMANY                      969
                            ... 
ERITREA                        1
NIGER                          1
ESTONIA                        1
CAPE VERDE                     1
JORDAN                         1
Name: country, Length: 131, dtype: int64
UNITED STATES OF AMERICA    2846
UNITED KINGDOM              2120
ITALY                       1625
FRANCE                      1064
Other                        970
GERMANY                      969
SOUTH AFRICA                 811
KENYA                        781
ZAMBIA                       715
AUSTRALIA                    704
CANADA                       571
ZIMBABWE                     570
UGANDA                       527
SPAIN                        470
INDIA                        408
NETHERLANDS                  381
SWIZERLAND                   278
BELGIUM                      244
SW

In [8]:
# Outliers processing. - HEAVY PREPROCESSING -
# All outliers are brought back to the value of 75 quantile + 1.5 * (Interquartile Range).
# That's the standard value over which an element is defined as an outlier.

for col in num_features:
    print(col)
    print(df[col].max())
    q_2 = df[col].quantile(0.25)
    q_3 = df[col].quantile(0.75)
    IQR =  q_3 - q_2
    outlier = q_3+IQR*1.5
    df[col] = df[col].apply(lambda x: outlier if x > outlier else x)
    print(outlier)
    print(df[col].max())

total_female
49.0
2.5
2.5
total_male
58.0
1.0
1.0
night_mainland
365
23.0
23.0
night_zanzibar
240
10.0
10.0


In [9]:
# Categorical features transform to type: category 
df[cat_features] = df[cat_features].astype('category')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18506 entries, 0 to 18505
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   country                18506 non-null  category
 1   age_group              18506 non-null  category
 2   travel_with            18506 non-null  category
 3   total_female           18506 non-null  float64 
 4   total_male             18506 non-null  float64 
 5   purpose                18506 non-null  category
 6   main_activity          18506 non-null  category
 7   info_source            18506 non-null  category
 8   tour_arrangement       18506 non-null  category
 9   package_transport_int  18506 non-null  int64   
 10  package_accomodation   18506 non-null  int64   
 11  package_food           18506 non-null  int64   
 12  package_transport_tz   18506 non-null  int64   
 13  package_sightseeing    18506 non-null  int64   
 14  package_guided_tour    18506 non-null 

In [10]:
# Label Encoding 
# for col in cat_features:
#	df[col]=df[col].cat.codes

# ONE HOT
df =pd.get_dummies(df,columns=cat_features)

# LABEL ENCODING OF THE TARGET FEATURE
df[target] = df[target].astype('category')
dic = dict(enumerate(df['cost_category'].cat.categories))
print(dic)
print(df[target][:5])

df['cost_category'] = df['cost_category'].cat.codes

df.head()

{0: 'High Cost', 1: 'Higher Cost', 2: 'Highest Cost', 3: 'Low Cost', 4: 'Lower Cost', 5: 'Normal Cost'}
0      High Cost
1      High Cost
2    Higher Cost
3     Lower Cost
4    Higher Cost
Name: cost_category, dtype: category
Categories (6, object): ['High Cost', 'Higher Cost', 'Highest Cost', 'Low Cost', 'Lower Cost', 'Normal Cost']


Unnamed: 0,total_female,total_male,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,...,"info_source_Friends, relatives",info_source_Inflight magazines,"info_source_Newspaper, magazines, brochures",info_source_Others,"info_source_Radio, TV, Web",info_source_Tanzania Mission Abroad,info_source_Trade fair,"info_source_Travel agent, tour operator",tour_arrangement_Independent,tour_arrangement_Package Tour
0,0.0,1.0,1,1,1,1,0,0,0,0.0,...,1,0,0,0,0,0,0,0,0,1
1,1.0,1.0,1,1,1,1,0,0,0,0.0,...,0,0,0,0,0,0,0,1,0,1
2,1.0,1.0,1,1,1,1,1,1,0,6.0,...,0,0,0,0,0,0,0,1,0,1
3,2.5,1.0,0,0,0,0,0,0,0,3.0,...,0,0,0,0,1,0,0,0,1,0
4,0.0,1.0,1,1,1,1,0,1,1,7.0,...,0,0,0,0,0,0,0,1,0,1


In [11]:
# One Hot info 
print(list(df.columns))
print("num columns: ", len(df.columns))

['total_female', 'total_male', 'package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz', 'package_sightseeing', 'package_guided_tour', 'package_insurance', 'night_mainland', 'night_zanzibar', 'first_trip_tz', 'cost_category', 'country_AUSTRALIA', 'country_AUSTRIA', 'country_BELGIUM', 'country_BURUNDI', 'country_CANADA', 'country_CHINA', 'country_CONGO', 'country_DENMARK', 'country_DRC', 'country_FINLAND', 'country_FRANCE', 'country_GERMANY', 'country_INDIA', 'country_IRELAND', 'country_ISRAEL', 'country_ITALY', 'country_JAPAN', 'country_KENYA', 'country_KOREA', 'country_MALAYSIA', 'country_NETHERLANDS', 'country_NEW ZEALAND', 'country_NORWAY', 'country_OMAN', 'country_Other', 'country_POLAND', 'country_RUSSIA', 'country_RWANDA', 'country_SOUTH AFRICA', 'country_SPAIN', 'country_SWEDEN', 'country_SWIZERLAND', 'country_UAE', 'country_UGANDA', 'country_UNITED KINGDOM', 'country_UNITED STATES OF AMERICA', 'country_ZAMBIA', 'country_ZIMBABWE', 'age_group_18-24

In [12]:
# Floatization 
df = df.astype('float64')
df[target] = df[target].astype('category')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18506 entries, 0 to 18505
Data columns (total 88 columns):
 #   Column                                       Non-Null Count  Dtype   
---  ------                                       --------------  -----   
 0   total_female                                 18506 non-null  float64 
 1   total_male                                   18506 non-null  float64 
 2   package_transport_int                        18506 non-null  float64 
 3   package_accomodation                         18506 non-null  float64 
 4   package_food                                 18506 non-null  float64 
 5   package_transport_tz                         18506 non-null  float64 
 6   package_sightseeing                          18506 non-null  float64 
 7   package_guided_tour                          18506 non-null  float64 
 8   package_insurance                            18506 non-null  float64 
 9   night_mainland                               18506 non-null  

In [13]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.3/199.3 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>=1.1.0
  Downloading scikit_learn-1.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.2/31.2 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.0.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.17.3
  Downloading numpy-1.23.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m

In [14]:
# X e y
X = df.drop(target,axis=1)
y = df[target]
print("y", len(y))
print("X", X.shape)

y 18506
X (18506, 87)


In [15]:
# Balancing Class -HEAVY PREPROCESSING-
from imblearn.over_sampling import SMOTE

class_dic = {5.0:6000,4.0:6000,3.0:6000,2.0:6000,1.0:6000,0.0:6000,}

smote = SMOTE(sampling_strategy=class_dic, random_state= 42)

print(y.value_counts())

X_rebalanced, y_rebalanced = smote.fit_resample(X,y)

print(y_rebalanced.value_counts())

print("y", len(y_rebalanced))
print("X", X_rebalanced.shape)

5.0    5471
1.0    4865
0.0    3678
4.0    2567
3.0    1566
2.0     359
Name: cost_category, dtype: int64
0.0    6000
1.0    6000
2.0    6000
3.0    6000
4.0    6000
5.0    6000
Name: cost_category, dtype: int64
y 36000
X (36000, 87)


I've decided to use a **SMOTE** over-sampling method insted of a RandomSampling because the RandomSampler duplicates the row of the minority classes, that could lead to overfitting, while **SMOTE**, Synthetic Minority Oversampling Technique, generate samples in the vector space near the values relative to class.

In [15]:
# Libraries 

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [38]:
# Standardization and Normalization 

STDscaler = StandardScaler()
NORMscaler = MinMaxScaler()

X_train, X_test, y_train, y_test = train_test_split(X_rebalanced, y_rebalanced, test_size = 0.2, stratify = y_rebalanced, random_state = 42) 

X_train_std = STDscaler.fit_transform(X_train)
X_test_std = STDscaler.transform(X_test)

X_train_norm = NORMscaler.fit_transform(X_train)
X_test_norm = NORMscaler.transform(X_test)

In [27]:

# Random Forest Classifier


rfr = RandomForestClassifier(random_state = 42)
rfr.fit(X_rebalanced,y_rebalanced)
pred = rfr.predict(X_test_std)

print("Standardized: " , f1_score(y_test,pred,average='weighted'))

Standardized:  0.5176544988417061


In [41]:
import xgboost as xgb
rfr = xgb.XGBClassifier(objective='reg:logistic',random_state = 42)
rfr.fit(X_rebalanced,y_rebalanced)
pred = rfr.predict(X_test_std)

print("Standardized: " , f1_score(y_test,pred,average='weighted'))

Standardized:  0.5425848121836108


In [42]:
rfr = LogisticRegression()
rfr.fit(X_rebalanced,y_rebalanced)
pred = rfr.predict(X_test_std)

print("Standardized: " , f1_score(y_test,pred,average='weighted'))

Standardized:  0.48481090160874707


# MODEL SEARCH
_____
We'll start with some simple classifiers and see who performs best.

In [25]:
# Modeling libraries

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score, KFold 
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, f1_score, precision_score

# Scoring Metric
scoring = 'f1_weighted'

# Data List
data = [[X_train,X_test],[X_train_std,X_test_std],[X_train_norm,X_test_norm]]

### Simple Models
---
Now I'll try some configuration with raw models with defalut parameters

In [45]:
# SVC

predictions = []
for group in data:
    svc = SVC(random_state=42)
    svc.fit(group[0],y_train)
    pred = svc.predict(group[1])
    predictions.append(pred)

print("Not preprocessed: " , f1_score(y_test,predictions[0],average='weighted'))
print("Standardized: " , f1_score(y_test,predictions[1],average='weighted'))
print("Normalized: " , f1_score(y_test,predictions[2],average='weighted'))

Not preprocessed:  0.5563431066238144
Standardized:  0.651558972747645
Normalized:  0.6502778023556652


In [46]:
# Logistic Regression

predictions = []
for group in data:
    lr = LogisticRegression()
    lr.fit(group[0],y_train)
    pred = lr.predict(group[1])
    predictions.append(pred)

print("Not preprocessed: " , f1_score(y_test,predictions[0],average='weighted'))
print("Standardized: " , f1_score(y_test,predictions[1],average='weighted'))
print("Normalized: " , f1_score(y_test,predictions[2],average='weighted'))


Not preprocessed:  0.5307256623307766
Standardized:  0.5411693214024543
Normalized:  0.5398052995307566


In [47]:
# Decision Tree

predictions = []
for group in data:
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(group[0],y_train)
    pred = dt.predict(group[1])
    predictions.append(pred)

print("Not preprocessed: " , f1_score(y_test,predictions[0],average='weighted'))
print("Standardized: " , f1_score(y_test,predictions[1],average='weighted'))
print("Normalized: " , f1_score(y_test,predictions[2],average='weighted'))

Not preprocessed:  0.6516261074167563
Standardized:  0.6513083383912947
Normalized:  0.6514584350046844


In [48]:
# KNN

predictions = []
for group in data:
    knn = KNeighborsClassifier()
    knn.fit(group[0],y_train)
    pred = knn.predict(group[1])
    predictions.append(pred)

print("Not preprocessed: " , f1_score(y_test,predictions[0],average='weighted'))
print("Standardized: " , f1_score(y_test,predictions[1],average='weighted'))
print("Normalized: " , f1_score(y_test,predictions[2],average='weighted'))

Not preprocessed:  0.6746579146682149
Standardized:  0.649685505864911
Normalized:  0.6477530754713094


### Ensamble
---
Now let's search with some ensamble models

In [49]:
# Random Forest Classifier

predictions = []
for group in data:
    rfr = RandomForestClassifier(random_state = 42)
    rfr.fit(group[0],y_train)
    pred = rfr.predict(group[1])
    predictions.append(pred)

print("Not preprocessed: " , f1_score(y_test,predictions[0],average='weighted'))
print("Standardized: " , f1_score(y_test,predictions[1],average='weighted'))
print("Normalized: " , f1_score(y_test,predictions[2],average='weighted'))

Not preprocessed:  0.7501837047750833
Standardized:  0.7504521496653016
Normalized:  0.7501615517087877


In [50]:
#Gradient Boosting Classifier
predictions = []
for group in data:
    gbc = GradientBoostingClassifier(random_state = 42)
    gbc.fit(group[0],y_train)
    pred = gbc.predict(group[1])
    predictions.append(pred)

print("Not preprocessed: " , f1_score(y_test,predictions[0],average='weighted'))
print("Standardized: " , f1_score(y_test,predictions[1],average='weighted'))
print("Normalized: " , f1_score(y_test,predictions[2],average='weighted'))

Not preprocessed:  0.6482954467272265
Standardized:  0.6482954467272265
Normalized:  0.6482954467272265


In [51]:
# Voting Classifier
svc = SVC(random_state=42 )
lr = LogisticRegression()
knn = KNeighborsClassifier()
rfr = RandomForestClassifier(random_state=42 )
predictions = []

for group in data:
    vc = VotingClassifier(estimators = [('svc', svc), ('knn', knn),('rfr', rfr) ,('lr',lr)])
    vc.fit(group[0],y_train)
    pred = vc.predict(group[1])
    predictions.append(pred)

print("Not preprocessed: " , f1_score(y_test,predictions[0],average='weighted'))
print("Standardized: " , f1_score(y_test,predictions[1],average='weighted'))
print("Normalized: " , f1_score(y_test,predictions[2],average='weighted'))

Not preprocessed:  0.6662369207908332
Standardized:  0.6828879751140758
Normalized:  0.6879327230510243


In [52]:
# XGB
import xgboost as xgb

predictions = []

xgb_model = xgb.XGBClassifier(objective='reg:logistic',random_state = 42)
xgb_model.fit(X_train.values,y_train)
pred = xgb_model.predict(X_test.values)
predictions.append(pred)

xgb_model = xgb.XGBClassifier(objective='reg:logistic',random_state = 42)
xgb_model.fit(X_train_std,y_train)
pred = xgb_model.predict(X_test_std)
predictions.append(pred)

xgb_model = xgb.XGBClassifier(objective='reg:logistic',random_state = 42)
xgb_model.fit(X_train_norm,y_train)
pred = xgb_model.predict(X_test_norm)
predictions.append(pred)


print("Not preprocessed: " , f1_score(y_test,predictions[0],average='weighted'))
print("Standardized: " , f1_score(y_test,predictions[1],average='weighted'))
print("Normalized: " , f1_score(y_test,predictions[2],average='weighted'))

Not preprocessed:  0.7031018970546838
Standardized:  0.7030946245657286
Normalized:  0.7031018970546838


## HYPERPARAMETER TUNING
---
The best models were **XGBboost** with an f1 of **0.703**, **Random Forest** with an f1 of **0.7456**. Both of them seam to not be gratly affected by any normalization of the data but I'll use the **Standardized data** since it has performed slightly better. For purely theoretical reason I'll not use the unscaled data.

This part is computationaly intensive

In [19]:
#Libraries
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [20]:
# Random Forest tuning
parameters ={"max_depth": [6, None],
             "max_features": [10,15, 'sqrt'],
             "min_samples_split": [2,3],
            "n_estimators": [20,100,200],
             "ccp_alpha" : [0,0.005]}

rfc = RandomForestClassifier(random_state=42) 

rs_cv = RandomizedSearchCV(rfc,param_distributions=parameters, 
                           n_iter=10, scoring=scoring, cv = 10,random_state=42) 

rs_cv.fit(X_train_norm,y_train)
pred = rs_cv.best_estimator_.predict(X_test_norm)

print("RandomSearch:", rs_cv.best_params_)
print("Score CV:", rs_cv.best_score_, "\n\n")
print("Score test: " , f1_score(y_test,pred,average='weighted'))


RandomSearch: {'n_estimators': 100, 'min_samples_split': 2, 'max_features': 10, 'max_depth': None, 'ccp_alpha': 0}
Score CV: 0.7424377391811986 


Score test:  0.7499841017780947


In [21]:
parameters ={"max_depth": [6, None],
             "max_features": [5, 'sqrt'],
             "min_samples_split": [3,6],
            "n_estimators": [200],
             "ccp_alpha" : [0,0.005]}

gs_cv = GridSearchCV(rfc, param_grid=parameters, scoring=scoring, cv = 10) 
gs_cv.fit(X_train_norm,y_train)
pred = gs_cv.best_estimator_.predict(X_test_norm)

print("GridSearch:", gs_cv.best_params_)
print("Score CV:", gs_cv.best_score_)
print("Score test: " , f1_score(y_test,pred,average='weighted'))

In [26]:
# XGB
import xgboost as xgb

# XGB tuning
# Random Forest tuning
parameters ={"max_depth": [6, None],
             "max_features": [2, 10,15, 'sqrt'],
             "min_samples_split": [1,2,3],
            "n_estimators": [100],
             "ccp_alpha" : [0,0.005]}

xgb_model = xgb.XGBClassifier(objective='reg:logistic',random_state=42) 

rs_cv = RandomizedSearchCV(xgb_model,  param_distributions=parameters,
                           n_iter=10, scoring=scoring, cv = 10,random_state=42) 

rs_cv.fit(X_train_norm,y_train)
pred = rs_cv.best_estimator_.predict(X_test_norm)

print("RandomSearch:", rs_cv.best_params_)
print("Score CV:", rs_cv.best_score_, "\n\n")
print("Score test: " , f1_score(y_test,pred,average='weighted'))

Parameters: { "ccp_alpha", "max_features", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "ccp_alpha", "max_features", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "ccp_alpha", "max_features", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongl

In [27]:
# GridsearchCV

parameters ={"max_depth": [6, None],
             "max_features": [2, 'sqrt'],
             "min_samples_split": [3,6],
            "n_estimators": [100],
             "ccp_alpha" : [0.005,0.01]}

xgb_model = xgb.XGBClassifier(objective='reg:logistic',random_state=42) 

gs_cv = GridSearchCV(xgb_model, param_grid=parameters, scoring=scoring, cv = 10) 
gs_cv.fit(X_train_norm,y_train)
pred = gs_cv.best_estimator_.predict(X_test_norm)

Parameters: { "ccp_alpha", "max_features", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "ccp_alpha", "max_features", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "ccp_alpha", "max_features", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongl

In [28]:
#GridsearchCV
print("GridSearch:", gs_cv.best_params_)
print("Score CV:", gs_cv.best_score_)
print("Score test: " , f1_score(y_test,pred,average='weighted'))

GridSearch: {'ccp_alpha': 0.005, 'max_depth': 6, 'max_features': 2, 'min_samples_split': 3, 'n_estimators': 100}
Score CV: 0.7012269812580703
Score test:  0.7031018970546838


## NEURAL NETWORKS

In [28]:
# Keras Libraries
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout

from tensorflow.keras.callbacks import EarlyStopping
import keras.backend as K
import tensorflow as tf

2022-10-08 14:25:43.642437: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-08 14:25:43.642465: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [29]:
# Definition of the metric funcion f1_weighted

def f1_weighted(true, pred):
    
     predLabels = K.argmax(pred, axis=-1)
     pred = K.one_hot(predLabels, 6) 
    
     ground_positives = K.sum(true, axis=0) + K.epsilon()      
     pred_positives = K.sum(pred, axis=0) + K.epsilon()        
     true_positives = K.sum(true * pred, axis=0) + K.epsilon()  
       
     precision = true_positives / pred_positives 
     recall = true_positives / ground_positives
         
     f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
         
     weighted_f1 = f1 * ground_positives / K.sum(ground_positives) 
     weighted_f1 = K.sum(weighted_f1)
     
     return  weighted_f1 

In [33]:
# Preparing the data for Keras
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)

In [34]:
# Neural Network
model = Sequential() 

model.add(Dense(2280, input_shape=(87,), activation='relu')) 

model.add(Dropout(0.2))

model.add(Dense(2280, activation='relu')) 

model.add(Dropout(0.2))

model.add(Dense(680, activation='relu')) 

model.add(Dropout(0.2))

model.add(Dense(6, activation='softmax')) 

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[f1_weighted]) 

In [35]:
# Train your model for 30 epochs 
model.fit(X_rebalanced,y_rebalanced, validation_split=0.1, batch_size= 300, epochs = 50) 

loss, f1 = model.evaluate(X_test_std, y_test)

print('f1_score:', f1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
f1_score: 0.46648985147476196


It's **f_1 score** it's the best that we had from our model at **0.763**. 
We have decided to use the f1 score because it's a good balance between precision and recall, and since the classes of the target feature were not balanced, using the accuracy as a metric would have not been the best choice.

## TEST ANALYSIS
---
Breif analysis, in order to see if we have new problem to address in the test data

In [92]:
# Importing
TEST = pd.read_csv("Test.csv")

TEST.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6169 entries, 0 to 6168
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Tour_ID                6169 non-null   object 
 1   country                6169 non-null   object 
 2   age_group              6169 non-null   object 
 3   travel_with            5808 non-null   object 
 4   total_female           6167 non-null   float64
 5   total_male             6168 non-null   float64
 6   purpose                6169 non-null   object 
 7   main_activity          6169 non-null   object 
 8   info_source            6169 non-null   object 
 9   tour_arrangement       6169 non-null   object 
 10  package_transport_int  6169 non-null   object 
 11  package_accomodation   6169 non-null   object 
 12  package_food           6169 non-null   object 
 13  package_transport_tz   6169 non-null   object 
 14  package_sightseeing    6169 non-null   object 
 15  pack

In [23]:
TEST.isna().sum()

Tour_ID                    0
country                    0
age_group                  0
travel_with              361
total_female               2
total_male                 1
purpose                    0
main_activity              0
info_source                0
tour_arrangement           0
package_transport_int      0
package_accomodation       0
package_food               0
package_transport_tz       0
package_sightseeing        0
package_guided_tour        0
package_insurance          0
night_mainland             0
night_zanzibar             0
first_trip_tz              0
dtype: int64

In [22]:
for col in TEST.columns:
    print(col)
    print(TEST[col].value_counts(), "\n\n")

Tour_ID
tour_idynufedne    1
tour_id09xaivry    1
tour_idfmcgad1i    1
tour_id1hl9w50y    1
tour_idgdr5olb9    1
                  ..
tour_id6ifmwmz2    1
tour_iddvz0si9i    1
tour_idrc13y6cv    1
tour_idd277qz6z    1
tour_id8fkkwytb    1
Name: Tour_ID, Length: 6169, dtype: int64 


country
UNITED STATES OF AMERICA    919
UNITED KINGDOM              699
ITALY                       546
FRANCE                      346
GERMANY                     342
                           ... 
PERU                          1
BENIN                         1
KAZAKHSTAN                    1
CAMEROON                      1
SENEGAL                       1
Name: country, Length: 118, dtype: int64 


age_group
25-44    3021
45-64    1907
18-24     729
65+       480
<18        32
Name: age_group, dtype: int64 


travel_with
Alone                           2528
With Spouse                     1593
With Other Friends/Relatives    1051
With Spouse and Children         446
With Children                    190
Na