## Model Complete

In [1]:
# Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Load Data
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [3]:
train['country'].value_counts()[-110:-90]

DENMARK           178
NORWAY            151
RWANDA            145
JAPAN             137
AUSTRIA           124
IRELAND           122
OMAN               92
POLAND             84
DRC                82
NEW ZEALAND        82
ISRAEL             75
KOREA              66
RUSSIA             64
FINLAND            62
MALAYSIA           61
CONGO              58
MALAWI             46
PORTUGAL           46
BRAZIL             43
CZECH REPUBLIC     40
Name: country, dtype: int64

In [4]:
# Columns
columns = list(test.columns)

bool_columns = columns[10:17]
bool_columns.append("first_trip_tz")

num_features = [ 'total_female', 'total_male', 'night_mainland','night_zanzibar']

cat_features = ['country','age_group','travel_with','purpose',
              'main_activity','info_source','tour_arrangement']

target = 'cost_category'

scoring= 'f1_weighted'

In [5]:
# FUNCTIONS

def dropId(df):
    df.drop('Tour_ID',axis=1, inplace =True)
    return df

def fillNa(df):
    df['travel_with'].fillna(df['travel_with'].value_counts().index[0],inplace =True)
    df['total_male'].fillna(round(df['total_male'].mean()),inplace =True)
    df['total_female'].fillna(round(df['total_female'].mean()),inplace =True)
    return df
    
def boolConverter(df):
    bool_columns_list = bool_columns
    for col in bool_columns_list:
        df[col] = df[col].apply(lambda x: 1 if x == "Yes" else 0)
    return df

def categoricalFeatureReductor(df):
    for col in df.columns:
        if df[col].dtype == "object":
            counts = df[col].value_counts()
            value_below = counts[counts<50]
            df[col] = df[col].apply(lambda x: "Other" if x in list(value_below.index) else x)
    return df

def outliersReductor(df):
    for col in num_features:
        q_2 = df[col].quantile(0.25)
        q_3 = df[col].quantile(0.75)
        IQR =  q_3 - q_2
        outlier = q_3+IQR*1.5
        df[col] = df[col].apply(lambda x: outlier if x > outlier else x)
    return df

def categoricalConverter(df):
    cat_list = cat_features
    df[cat_list] = df[cat_list].astype('category')
    return df

def oneHot(df):
    cat_list = cat_features
    df_new = pd.get_dummies(df,columns=cat_list)
    return df_new
    
def targetConverter(df):
    if target in df.columns:
        df[target] = df[target].astype('category')
        print(dict(enumerate(df['cost_category'].cat.categories)))
        df['cost_category'] = df['cost_category'].cat.codes
    return df
    
def floatizator(df):
    df = df.astype('float64')
    if target in df.columns:
    	df[target] = df[target].astype('category')
    return df
    
def X_y_generator_and_ovesampling(df):
    X = df.drop(target,axis=1)
    y = df[target]
    class_dic = {5.0:6000,4.0:6000,3.0:6000,2.0:6000,1.0:6000,0.0:6000}
    smote = SMOTE(sampling_strategy=class_dic, random_state= 42)
    X_rebalanced, y_rebalanced = smote.fit_resample(X,y)
    return X_rebalanced,y_rebalanced

In [6]:
# PREPROCESSING
def preprocessing(df):
    p = df.copy()
    if len(p.columns) == len(columns) or len(p.columns) == len(columns)+1 :
        p = dropId(p)
        p = fillNa(p)
        p = boolConverter(p)
        p = categoricalFeatureReductor(p)
        p = outliersReductor(p)
        p = categoricalConverter(p)
        p = oneHot(p)
        p = targetConverter(p)
        p = floatizator(p)
        
    return p

def colChecker(df):
    if len(df.columns) != 87:
        num = 87 - len(df.columns)
        for i in range(num):
            df[f'{i}'] = 0.5
    return df

p = preprocessing(train)
X,y =X_y_generator_and_ovesampling(p)
print(X.shape)
print(X.info())

X.drop(['country_CONGO','country_MALAYSIA','country_FINLAND','country_RUSSIA','country_KOREA','country_ISRAEL',
            'country_DRC','country_NEW ZEALAND','country_POLAND','country_OMAN','country_IRELAND','country_AUSTRIA',
        	'country_JAPAN','country_RWANDA','country_NORWAY'] ,axis=1,inplace= True)

print(X.shape)

# Just in case I need some partition for testing
X_train,X_test, y_train, y_test  = train_test_split(X,y,random_state = 42, test_size= 0.2, stratify=y)

scaler = MinMaxScaler()
X_std = scaler.fit_transform(X)
df_train = pd.DataFrame(X_std)


{0: 'High Cost', 1: 'Higher Cost', 2: 'Highest Cost', 3: 'Low Cost', 4: 'Lower Cost', 5: 'Normal Cost'}
(36000, 87)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36000 entries, 0 to 35999
Data columns (total 87 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   total_female                                 36000 non-null  float64
 1   total_male                                   36000 non-null  float64
 2   package_transport_int                        36000 non-null  float64
 3   package_accomodation                         36000 non-null  float64
 4   package_food                                 36000 non-null  float64
 5   package_transport_tz                         36000 non-null  float64
 6   package_sightseeing                          36000 non-null  float64
 7   package_guided_tour                          36000 non-null  float64
 8   package_insurance             

In [7]:
# MODEL PARAMS
# Random Forest tuning
parameters ={"max_depth": [6, None],
             "max_features": [10, 'sqrt'],
             "min_samples_split": [2,3],
            "n_estimators": [100],
             "ccp_alpha" : [0,0.005]}

rfc = RandomForestClassifier(random_state=42) 

rs_cv = RandomizedSearchCV(rfc,param_distributions=parameters, 
                           n_iter=10, scoring=scoring, cv = 10,random_state=42) 

rs_cv.fit(X_std,y)

print("RandomSearch:", rs_cv.best_params_)
print("Score CV:", rs_cv.best_score_, "\n\n")

RandomSearch: {'n_estimators': 100, 'min_samples_split': 3, 'max_features': 10, 'max_depth': None, 'ccp_alpha': 0}
Score CV: 0.7465263671819578 




In [8]:
# Saving Results
model = rs_cv.best_estimator_
mod_temp = RandomForestClassifier(random_state=42) 
mod_temp_2 = RandomForestClassifier(random_state=10) 

#transformer
transformer = FunctionTransformer(preprocessing)
transformer2 = FunctionTransformer(colChecker)
scaler = MinMaxScaler()

pipe = Pipeline([('transformer', transformer),
                 ("scaler", scaler),
                 ('model', model)
                ])

pipe.fit(df_train, y)

In [11]:
# saving pipe
from joblib import dump


dump(pipe, 'RandomForestModel_Matteo.joblib')

['RandomForestModel_Matteo.joblib']

In [12]:
#Prediction of the test
pred = pipe.predict(test)
print(pred[:200])

[5. 1. 1. 5. 0. 5. 1. 5. 1. 5. 5. 0. 1. 5. 1. 5. 1. 5. 1. 5. 5. 5. 5. 1.
 5. 5. 1. 5. 5. 5. 5. 1. 5. 5. 1. 1. 1. 1. 5. 1. 5. 1. 1. 1. 1. 1. 1. 0.
 0. 5. 1. 1. 1. 5. 0. 1. 1. 5. 1. 1. 1. 0. 1. 5. 1. 1. 0. 1. 1. 1. 5. 1.
 5. 0. 5. 5. 1. 5. 5. 5. 1. 1. 5. 0. 1. 1. 5. 1. 5. 1. 5. 5. 5. 1. 1. 1.
 1. 5. 1. 1. 1. 5. 5. 5. 1. 1. 5. 4. 5. 1. 5. 5. 5. 1. 1. 1. 1. 5. 1. 5.
 1. 5. 1. 5. 1. 1. 5. 1. 1. 1. 1. 5. 1. 5. 1. 1. 1. 5. 1. 5. 1. 1. 1. 1.
 5. 1. 1. 2. 5. 5. 1. 5. 1. 5. 1. 1. 5. 1. 1. 1. 5. 1. 1. 5. 5. 1. 5. 1.
 5. 1. 1. 5. 1. 1. 1. 1. 5. 1. 1. 1. 5. 1. 1. 5. 1. 1. 5. 1. 5. 1. 1. 4.
 1. 5. 5. 5. 1. 1. 5. 5.]


In [13]:
final_prediction = pd.DataFrame(test['Tour_ID'])
final_prediction['prediction'] = pred
final_prediction['prediction'] = final_prediction['prediction'].map({0: 'High Cost', 1: 'Higher Cost', 2: 'Highest Cost', 3: 'Low Cost', 4: 'Lower Cost', 5: 'Normal Cost'})

In [14]:
final_prediction.tail(20)

Unnamed: 0,Tour_ID,prediction
6149,tour_idl39b6q02,Normal Cost
6150,tour_idwhu9n1vb,Normal Cost
6151,tour_id2g7h25oe,Higher Cost
6152,tour_idb8zzhsz6,Higher Cost
6153,tour_ide2uj3gqh,Higher Cost
6154,tour_idv4rpovrd,Normal Cost
6155,tour_id6zyaxxy3,Normal Cost
6156,tour_idf4uv3kyc,Higher Cost
6157,tour_idxtxvnrki,Normal Cost
6158,tour_id2zlugaki,Normal Cost


In [15]:
test[['Tour_ID','country']].tail(20)

Unnamed: 0,Tour_ID,country
6149,tour_idl39b6q02,KENYA
6150,tour_idwhu9n1vb,UNITED KINGDOM
6151,tour_id2g7h25oe,UNITED KINGDOM
6152,tour_idb8zzhsz6,ITALY
6153,tour_ide2uj3gqh,UNITED KINGDOM
6154,tour_idv4rpovrd,KENYA
6155,tour_id6zyaxxy3,SOUTH AFRICA
6156,tour_idf4uv3kyc,UNITED STATES OF AMERICA
6157,tour_idxtxvnrki,ZIMBABWE
6158,tour_id2zlugaki,ZIMBABWE


In [17]:
final_prediction.to_csv( 'final_submission.csv' ,index=False,sep=";")