In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

from matplotlib.pyplot import hist 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


In [2]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

In [6]:
!pip install pycaret



In [20]:
train = pd.read_csv("C:/Users/USER/Downloads/train_users_2.csv/train_users_2.csv")
test = pd.read_csv("C:/Users/USER/Downloads/test_users.csv/test_users.csv")

In [21]:
X_train = train.drop(['date_first_booking', 'country_destination', 'id'], axis=1)
X_test = test.drop(['id', 'date_first_booking'], axis=1)

In [22]:
y_des = train['country_destination'].values
X=pd.concat((X_train, X_test), axis=0, ignore_index=True)
X.shape


(275547, 13)

In [23]:
X.fillna(method='pad').head()

Unnamed: 0,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,2010-06-28,20090319043255,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,2011-05-25,20090523174809,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,2010-09-28,20090609231247,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,2011-12-05,20091031060129,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,2010-09-14,20091208061105,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome


In [24]:
X.isna().sum()

date_account_created            0
timestamp_first_active          0
gender                          0
age                        116866
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6085
signup_app                      0
first_device_type               0
first_browser                   0
dtype: int64

In [25]:
X.loc[X.age > 90, 'age'] = -1
X.loc[X.age < 13, 'age'] = -1
X['age'].describe()

count    158681.000000
mean         35.268066
std          12.392166
min          -1.000000
25%          28.000000
50%          33.000000
75%          41.000000
max          90.000000
Name: age, dtype: float64

In [26]:
X.loc[X.age.isnull(), 'age']=X.age.mean()

In [27]:
X.isna().sum()

date_account_created          0
timestamp_first_active        0
gender                        0
age                           0
signup_method                 0
signup_flow                   0
language                      0
affiliate_channel             0
affiliate_provider            0
first_affiliate_tracked    6085
signup_app                    0
first_device_type             0
first_browser                 0
dtype: int64

In [28]:
dac = np.vstack(
    X.date_account_created.astype(str).apply(
        lambda x: list(map(int, x.split('-')))
    ).values
)
X['dac_year'] = dac[:, 0]
X['dac_month'] = dac[:, 1]
X['dac_day'] = dac[:, 2]
X = X.drop(['date_account_created'], axis=1)
X.head()

Unnamed: 0,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,dac_year,dac_month,dac_day
0,20090319043255,-unknown-,35.268066,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,6,28
1,20090523174809,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,2011,5,25
2,20090609231247,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,2010,9,28
3,20091031060129,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,2011,12,5
4,20091208061105,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,9,14


In [30]:
corr_matrix = X.corr()
corr_matrix.style.background_gradient(cmap = 'RdYlGn')

Unnamed: 0,timestamp_first_active,age,signup_flow,dac_year,dac_month,dac_day
timestamp_first_active,1.0,-0.067322,0.232152,0.998953,-0.223167,0.011501
age,-0.067322,1.0,-0.066375,-0.066523,-0.001844,-0.003767
signup_flow,0.232152,-0.066375,1.0,0.229515,0.028751,0.028629
dac_year,0.998953,-0.066523,0.229515,1.0,-0.254234,0.010919
dac_month,-0.223167,-0.001844,0.028751,-0.254234,1.0,-0.011562
dac_day,0.011501,-0.003767,0.028629,0.010919,-0.011562,1.0


In [31]:
oh_features = ['gender', 'signup_method', 'signup_flow', 'language',
                'affiliate_channel', 'affiliate_provider',
                'first_affiliate_tracked', 'signup_app',
                'first_device_type', 'first_browser']

In [32]:
for feature in oh_features:
    X_dummy = pd.get_dummies(X[feature], prefix=feature)
    X = X.drop([feature], axis=1)
    X = pd.concat((X, X_dummy), axis=1)
X.head()

Unnamed: 0,timestamp_first_active,age,dac_year,dac_month,dac_day,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,signup_method_basic,...,first_browser_Silk,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_UC Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser
0,20090319043255,35.268066,2010,6,28,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20090523174809,38.0,2011,5,25,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20090609231247,56.0,2010,9,28,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,20091031060129,42.0,2011,12,5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20091208061105,41.0,2010,9,14,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Univariate selection 

In [18]:
## apply selectkbest class to extract top 10 features
bestfeatures = SelectKBest(score_func = chi2, k = 10)
fit = bestfeatures.fit(X, y_des)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score'] 
featurescores

ValueError: Found input variables with inconsistent numbers of samples: [275547, 213451]

## Using Hyperparameter tuning to find out the best model to use for prediction 

In [137]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
   
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
            
        }
    }     
}

In [138]:
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv = 5, return_train_score = False)
    clf.fit(dtrain, train_label)
    scores.append({
        'model': model_name,
        'best_score ': clf.best_score_,
        'best_params': clf.best_params_
    })
df = pd.DataFrame(scores, columns = ['model', 'best_score', 'best_params'])
df

Traceback (most recent call last):
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 169, in fit
    X, y = self._validate_data(X, y, dtype=np.float64,
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 814, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\USER\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 616, in chec

ValueError: could not convert string to float: '-unknown-'

This is one of the way to find the best fit model to find the accuracy. Error showing as we want to convert sting value to float value