## Data analysis
### The search sessions with a missing country either come from a country that is completely missing from the data, or from one of the countries that are logged in the data. Determine which country it is the most likely to be.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.externals import joblib

### Load dataframe

In [2]:
url = 'city_search.json'
df = pd.read_json(url, orient='columns')

In [3]:
df.head(2)

Unnamed: 0,cities,session_id,unix_timestam p,unix_timestamp,user
0,"[New York NY, Newark NJ]",[X061RFWB06K9V],,[1442503708],"[[{'country': 'UK', 'joining_date': '2015-03-2..."
1,"[New York NY, Jersey City NJ, Philadelphia PA]",[5AZ2X2A9BHH5U],,[1441353991],"[[{'country': 'DE', 'joining_date': '2015-03-2..."


#### Merge defective timestamp column

In [4]:
df['timestamp'] = df['unix_timestam p'].combine_first(df['unix_timestamp'])
df.head(2)

Unnamed: 0,cities,session_id,unix_timestam p,unix_timestamp,user,timestamp
0,"[New York NY, Newark NJ]",[X061RFWB06K9V],,[1442503708],"[[{'country': 'UK', 'joining_date': '2015-03-2...",[1442503708]
1,"[New York NY, Jersey City NJ, Philadelphia PA]",[5AZ2X2A9BHH5U],,[1441353991],"[[{'country': 'DE', 'joining_date': '2015-03-2...",[1441353991]


### Data preparation

In [5]:
colNames = ['cities', 'user']
def extract_country(user):
    return  user[0][0]['country']

flattened_data_frame = pd.DataFrame(columns = ['country', 'cities', 'user'])
flattened_data_frame['cities'] = df['cities'].apply(lambda x: ', '.join(x))

flattened_data_frame['timestamp'] = df['timestamp'].apply(lambda x:  x[0])
flattened_data_frame['session_id'] = df.apply(lambda x: x['session_id'][0], axis=1)
flattened_data_frame['user'] = df['user'].apply(lambda x:  x[0][0]['user_id'])
flattened_data_frame['country'] = df.apply(lambda x: extract_country(x[colNames[1]]), axis=1)

In [6]:
flattened_data_frame.head(3)

Unnamed: 0,country,cities,user,timestamp,session_id
0,UK,"New York NY, Newark NJ",2024,1442503708,X061RFWB06K9V
1,DE,"New York NY, Jersey City NJ, Philadelphia PA",2853,1441353991,5AZ2X2A9BHH5U
2,UK,San Antonio TX,10958,1440843490,SHTB4IYAX4PX6


In [7]:
len(flattened_data_frame)

20022

In [8]:
# Split Pandas Dataframe 'Cities' column containing lists into multiple rows, duplicating the other column's values.

#Following function is taken from: https://gist.github.com/jlln/338b4b0b55bd6984f883 [[[
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s.lstrip()
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df
# ]]]

cities = splitDataFrameList(flattened_data_frame, 'cities', ',')
# rename the column
cities.rename(columns={'cities':'city'}, inplace=True)

### Explore data

In [9]:
cities.head(3)

Unnamed: 0,city,country,session_id,timestamp,user
0,New York NY,UK,X061RFWB06K9V,1442503708,2024
1,Newark NJ,UK,X061RFWB06K9V,1442503708,2024
2,New York NY,DE,5AZ2X2A9BHH5U,1441353991,2853


In [10]:
np.save('city_frequency', (cities['city'], cities['city'].value_counts()))

### Convert city column to a category, then use those category values for label encoding:

In [11]:
cities['city'] = cities['city'].astype('category')
cities["city"] = cities["city"].cat.codes

In [12]:
cities.head(3)

Unnamed: 0,city,country,session_id,timestamp,user
0,50,UK,X061RFWB06K9V,1442503708,2024
1,51,UK,X061RFWB06K9V,1442503708,2024
2,50,DE,5AZ2X2A9BHH5U,1441353991,2853


In [13]:
len(cities)

33016

In [14]:
data_to_train = cities[cities['country'] != '']
data_to_predict = cities[cities['country'] == '']

In [15]:
len(data_to_train)

28397

In [16]:
len(data_to_predict)

4619

# Create a country classifier

### Load and split the data

In [17]:
X = data_to_train[['city', 'user']]
y = data_to_train['country']
X_predict = data_to_predict.loc[:, ['city','user']]

np.save('X', X)
np.save('y', y)
np.save('X_predict', X_predict)

X = StandardScaler().fit_transform(X)
X_predict = StandardScaler().fit_transform(X_predict)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# Choose a model

## First approach: Support Vector Machine Classifier

In [18]:
# Fit Support Vector Machine Classifier
# from the docstring:
'''C-Support Vector Classification.

The implementation is based on libsvm. The fit time complexity
is more than quadratic with the number of samples which makes it hard
to scale to dataset with more than a couple of 10000 samples.

The multiclass support is handled according to a one-vs-one scheme.
.
.
.
decision_function_shape : 'ovo', 'ovr'
    Whether to return a one-vs-rest ('ovr') decision function of shape
    (n_samples, n_classes) as all other classifiers, or the original
    one-vs-one ('ovo') decision function of libsvm which has shape
    (n_samples, n_classes * (n_classes - 1) / 2).
'''
clf = svm.SVC(decision_function_shape='ovr')

#### Fit on the train data

In [19]:
clf.fit(X_train, y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Inspect clasiffier classes

In [20]:
clf.classes_

array(['DE', 'ES', 'FR', 'IT', 'UK', 'US'], dtype=object)

### Evaluate

In [21]:
y_score = clf.decision_function(X_test)

In [22]:
y_test_predicted = clf.predict(X_test)

In [23]:
 print ("Accuracy for test dataset: %s" % clf.score(X_test, y_test))

Accuracy for test dataset: 0.233098591549


### Parameter Tuning

#### Fit the whole datase (except the records with defective country)

In [24]:
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### Prediction

In [25]:
Y_predict = clf.predict(X_predict)

In [26]:
unique, counts = np.unique(Y_predict, return_counts=True)
print (np.asarray((unique, counts)).T)

[['DE' 1061]
 ['UK' 782]
 ['US' 2776]]


### For this classifier, from all predictions(4619), 60% of predictions(2776) are 'US'

#### Save classifier

In [29]:
joblib.dump(clf, 'classifier_svc.pkl') 

['classifier_svc.pkl']

## Second approach: KNeighborsClassifier


In [30]:
# Fit KNeighborsClassifier
# from the docstring:
'''Classifier implementing the k-nearest neighbors vote.

Parameters
----------
n_neighbors : int, optional (default = 5)
    Number of neighbors to use by default for :meth:`kneighbors` queries.
    
Regarding the Nearest Neighbors algorithms, if it is found that two
neighbors, neighbor `k+1` and `k`, have identical distances
but different labels, the results will depend on the ordering of the
training data.    
'''
clf_kn = KNeighborsClassifier(7)



# flattened_data_frame['cities'] = flattened_data_frame['cities'].astype('category')
# flattened_data_frame['session_id'] = flattened_data_frame['session_id'].astype('category')
# flattened_data_frame['timestamp'] = flattened_data_frame['timestamp'].astype('category')
# flattened_data_frame["cities"] = flattened_data_frame["cities"].cat.codes
# flattened_data_frame["session_id"] = flattened_data_frame["session_id"].cat.codes
# flattened_data_frame["timestamp"] = flattened_data_frame["timestamp"].cat.codes


# data_to_train = flattened_data_frame[flattened_data_frame['country'] != '']
# data_to_predict = flattened_data_frame[flattened_data_frame['country'] == '']
# X = data_to_train[['cities', 'user', 'timestamp', 'session_id']]
# y = data_to_train['country']

# X = StandardScaler().fit_transform(X)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

#### Fit on the train data

In [31]:
data_to_train.head(2)

Unnamed: 0,city,country,session_id,timestamp,user
0,50,UK,X061RFWB06K9V,1442503708,2024
1,51,UK,X061RFWB06K9V,1442503708,2024


In [32]:
clf_kn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

#### Inspect clasiffier classes

In [33]:
clf_kn.classes_

array(['DE', 'ES', 'FR', 'IT', 'UK', 'US'], dtype=object)

### Evaluate

In [34]:
score = clf_kn.score(X_test, y_test)
score


0.2272887323943662

### Prediction

In [35]:
Y_predict = clf.predict(X_predict)

In [36]:
unique, counts = np.unique(Y_predict, return_counts=True)
print (np.asarray((unique, counts)).T)

[['DE' 1061]
 ['UK' 782]
 ['US' 2776]]


### For this classifier, from all predictions(4619), 72% of predictions(3356) are 'US'

#### Save classifier

In [37]:
joblib.dump(clf_kn, 'classifier_knn.pkl') 

['classifier_knn.pkl']

## Second approach: OneVsRestClassifier from sklearn.multiclass

In [38]:
# Fit OneVsRestClassifier Classifier
# from the docstring:
'''One-vs-the-rest (OvR) multiclass/multilabel strategy

Also known as one-vs-all, this strategy consists in fitting one classifier
per class. For each classifier, the class is fitted against all the other
classes. In addition to its computational efficiency (only `n_classes`
classifiers are needed), one advantage of this approach is its
interpretability. Since each class is represented by one and one classifier
only, it is possible to gain knowledge about the class by inspecting its
corresponding classifier. This is the most commonly used strategy for
multiclass classification and is a fair default choice.

This strategy can also be used for multilabel learning, where a classifier
is used to predict multiple labels for instance, by fitting on a 2-d matrix
in which cell [i, j] is 1 if sample i has label j and 0 otherwise.

In the multilabel learning literature, OvR is also known as the binary
relevance method.
'''
clf = OneVsRestClassifier(svm.SVC(decision_function_shape='ovr'))

#### Fit on the train data

In [39]:
clf.fit(X_train, y_train) 

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

#### Inspect clasiffier classes

In [40]:
clf.classes_

array(['DE', 'ES', 'FR', 'IT', 'UK', 'US'],
      dtype='<U2')

### Evaluate

In [41]:
y_score = clf.decision_function(X_test)

### Parameter Tuning

#### Fit the whole datase (except the records with defective country)

In [None]:
clf.fit(X, y)

In [None]:
clf.score(X, y)

### Prediction

In [42]:
Y_predict = clf.predict(X_predict)

In [43]:
unique, counts = np.unique(Y_predict, return_counts=True)
print (np.asarray((unique, counts)).T)

[['DE' '886']
 ['ES' '896']
 ['FR' '101']
 ['IT' '626']
 ['UK' '980']
 ['US' '1130']]


#### Save classifier

In [44]:
joblib.dump(clf, 'classifier_ovr_multi.pkl') 

['classifier_ovr_multi.pkl']