# Загрузка данных

In [90]:
! pip install pydotplus

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [91]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
        
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import seaborn

In [92]:
train = pd.read_csv('/content/train.csv', sep=';')
test = pd.read_csv('/content/test.csv', sep=';')
# submission = pd.read_csv('/kaggle/input/hw-multiclass-classification/sub_baseline.csv')

In [93]:
'train: ', train.shape, 'test: ', test.shape

('train: ', (39481, 15), 'test: ', (9871, 14))

Результат нужно сохранить в **csv файл с двумя колонками**: 

listing_id - берете из датасета

interest_level - проставляете своим алгоритмом

Разделитель **запятая**, при сохранении с помощью result.to_csv() - не забывайте указывать index=None.

In [94]:
# submission.head()

## Анализ данных
## 1. Предобработка и генерация признаков

Для выполнения задания попробуйте придумать признаки на основе имеющихся данных, для работы с текстовыми полями вам может пригодится LabelEncoder/OneHotEncoder, re.sub(),re.findall() 

Ниже представлены имеющиеся у нас признаки. Можно попробовать сгенерировать из них новые. Например, можно сделать признак Стоимость за комнату, признаки на основе значений в features, выделить и закодировать адрес, так же можно попробовать оценить "успешность" менеджера, предположив, что чем больше объявлений у данного manager_id - тем он круче.

In [95]:
train.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address', 'interest_level'],
      dtype='object')

In [96]:
low = train[train.interest_level=='low'].sample(2900)
middle = train[train.interest_level=='medium'].sample(2900)
high = train[train.interest_level=='high'].sample(2900)

train = pd.concat([low,middle,high])

train.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address', 'interest_level'],
      dtype='object')

In [97]:
# Поработаем с ценой
train["price"].describe().round(0)

count       8700.0
mean        3399.0
std        11643.0
min          695.0
25%         2225.0
50%         2856.0
75%         3775.0
max      1070000.0
Name: price, dtype: float64

In [98]:
def price_lvl(price):
    if price>4100:
        return 2
    elif price>3150:
        return 1
    else:
        return 0

In [99]:
train["price_lvl"] = train.price.apply(price_lvl)
train["features"]

11046    ['Dining Room', 'Elevator', 'Pre-War', 'Laundr...
13493    ['Doorman', 'Fitness Center', 'Pool', 'Elevato...
25642                                          ['Doorman']
37226                                                   []
33484    ['Swimming Pool', 'Dining Room', 'Doorman', 'E...
                               ...                        
13023    ['Dining Room', 'Doorman', 'Elevator', 'Laundr...
14460    ['Roof Deck', 'Doorman', 'Elevator', 'Fitness ...
12851                                  ['Hardwood Floors']
31661    ['Cats Allowed', 'Dogs Allowed', 'No Fee', 'Re...
36781    ['Roof Deck', 'Dining Room', 'Balcony', 'Doorm...
Name: features, Length: 8700, dtype: object

In [100]:
train['features'] = train['features'].str.replace('[\'\[\]]', '').str.split(', ')
train['features_cnt'] = train['features'].apply(len)

train['photos']=train['photos'].str.replace('[\[\]\']', '').str.split(', ')
train['photos_cnt'] = train['photos'].apply(len)

train["created"] = pd.to_datetime(train["created"])
train["created_day"] = train["created"].dt.day
train["created_hour"] = train["created"].dt.hour

train["num_description_words"]=train["description"].fillna('').apply(lambda x: len(x.split(" ")))

  train['features'] = train['features'].str.replace('[\'\[\]]', '').str.split(', ')
  train['photos']=train['photos'].str.replace('[\[\]\']', '').str.split(', ')


In [101]:
target = 'interest_level'

In [102]:
train.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address', 'interest_level',
       'price_lvl', 'features_cnt', 'photos_cnt', 'created_day',
       'created_hour', 'num_description_words'],
      dtype='object')

In [104]:
features = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'features_cnt', 'photos_cnt', 'created_day', 'created_hour', 'num_description_words']

In [108]:
train.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,photos,price,street_address,interest_level,price_lvl,features_cnt,photos_cnt,created_day,created_hour,num_description_words
11046,1.0,2,593d0fb003995388a5417df9f80eef7c,2016-06-16 05:50:16,Spacious & Renovated 2 Bed 1 Bath in Kew Garde...,Metropolitan Ave.,"[Dining Room, Elevator, Pre-War, Laundry in Bu...",40.706,7170240,-73.8324,...,[https://photos.renthop.com/2/7170240_d9461063...,2525,118-80 Metropolitan Ave.,low,0,9,12,16,5,86
13493,1.0,0,508baf62cd9cc74dd4be6bb722d020f9,2016-05-19 01:52:37,,West 38th Street,"[Doorman, Fitness Center, Pool, Elevator, Gara...",40.7551,7035556,-73.9928,...,[],3395,320 West 38th Street,low,1,10,1,19,1,1
25642,2.0,2,0,2016-04-28 01:35:23,"Two bedroom in SOHO W/ private balcony,Feature...",Thompson Street,[Doorman],40.724,6934860,-74.0035,...,[https://photos.renthop.com/2/6934860_b1f45559...,14500,55 Thompson Street,low,2,1,7,28,1,39
37226,1.0,1,a97e3b0b02e9d7044157d53740755880,2016-04-13 04:55:13,I have an access to show several thousand avai...,E 9 Street,[],40.7283,6866354,-73.9857,...,[https://photos.renthop.com/2/6866354_8e241403...,2300,344 E 9 Street,low,0,1,6,13,4,54
33484,1.0,1,7a9ae106c9bf463c73963c4d40f332b9,2016-06-21 03:38:44,"> 1 Bedroom, 1 Bathroom apartment with a south...",E 86th St.,"[Swimming Pool, Dining Room, Doorman, Elevator...",40.7779,7189556,-73.9525,...,[https://photos.renthop.com/2/7189556_1e726b87...,4000,240 E 86th St.,low,1,14,7,21,3,113


In [109]:
train.interest_level.value_counts()

low       2900
medium    2900
high      2900
Name: interest_level, dtype: int64

In [110]:
train.interest_level.value_counts(normalize=True)

low       0.333333
medium    0.333333
high      0.333333
Name: interest_level, dtype: float64

In [111]:
train.describe()

Unnamed: 0,bathrooms,bedrooms,latitude,listing_id,longitude,price,price_lvl,features_cnt,photos_cnt,created_day,created_hour,num_description_words
count,8700.0,8700.0,8700.0,8700.0,8700.0,8700.0,8700.0,8700.0,8700.0,8700.0,8700.0,8700.0
mean,1.177011,1.557586,40.741207,7022226.0,-73.95234,3399.106,0.590805,5.507241,5.71069,15.010115,5.448736,92.105747
std,0.436693,1.125127,0.619608,123680.5,1.122418,11643.18,0.793066,3.958349,3.051062,8.331663,4.601515,57.126271
min,0.0,0.0,0.0,6811965.0,-74.2432,695.0,0.0,1.0,1.0,1.0,0.0,1.0
25%,1.0,1.0,40.7269,6917060.0,-73.9913,2225.0,0.0,2.0,4.0,8.0,2.0,53.0
50%,1.0,2.0,40.7494,7018256.0,-73.97725,2856.5,0.0,5.0,5.0,15.0,4.0,85.0
75%,1.0,2.0,40.7746,7124196.0,-73.9525,3775.0,1.0,8.0,7.0,22.0,6.0,121.0
max,5.0,6.0,42.3459,7724814.0,0.0,1070000.0,2.0,39.0,37.0,31.0,23.0,563.0


In [112]:
test.describe()

Unnamed: 0,bathrooms,bedrooms,latitude,listing_id,longitude,price
count,9871.0,9871.0,9871.0,9871.0,9871.0,9871.0
mean,1.209807,1.554351,40.751866,7022246.0,-73.974056,4142.879
std,0.497154,1.099811,0.068352,125500.9,0.137458,45228.6
min,0.0,0.0,39.8395,6811957.0,-86.1527,999.0
25%,1.0,1.0,40.7283,6915255.0,-73.9917,2500.0
50%,1.0,1.0,40.7518,7019807.0,-73.9781,3150.0
75%,1.0,2.0,40.7736,7124604.0,-73.9555,4100.0
max,5.5,6.0,44.6038,7731327.0,-70.9846,4490000.0


In [113]:
import seaborn

### Удаляем выбросы

In [114]:
train = train.drop(['created'], axis=1)
train.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address', 'interest_level',
       'price_lvl', 'features_cnt', 'photos_cnt', 'created_day',
       'created_hour', 'num_description_words'],
      dtype='object')

In [115]:
#Вычисляем строки со значениями-выбросами 
first_quartile = train.quantile(q=0.25)
third_quartile = train.quantile(q=0.75)
IQR = third_quartile - first_quartile
outliers = train[(train > (third_quartile + 1.5 * IQR)) | (train < (first_quartile - 1.5 * IQR))].count(axis=1)
outliers.sort_values(axis=0, ascending=False, inplace=True)

#Удаляем из датафрейма 1500 строк, подходящих под критерии выбросов
outliers = outliers.head(1500)
train.drop(outliers.index, inplace=True)
train.shape

  outliers = train[(train > (third_quartile + 1.5 * IQR)) | (train < (first_quartile - 1.5 * IQR))].count(axis=1)


(7200, 20)

## 2. Тестирование различных алгоритмов

Для выполнения этой части задания разбейте размеченные данные на train и test и напишите автоматическое тестирование разных алгоритмов классификации в цикле с вычислением метрики f1_score('macro')

Метрику на тесте по разным алгоритмам сохраняйте в датафрейм или выводите

In [116]:
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, roc_auc_score, classification_report, precision_score, recall_score
import joblib

In [117]:
train[features].head()

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,features_cnt,photos_cnt,created_day,created_hour,num_description_words
13493,1.0,0,40.7551,-73.9928,3395,10,1,19,1,1
37226,1.0,1,40.7283,-73.9857,2300,1,6,13,4,54
33484,1.0,1,40.7779,-73.9525,4000,14,7,21,3,113
17747,1.0,2,40.6223,-73.9633,2300,3,5,1,1,62
16984,0.0,2,40.7216,-73.9808,3250,1,1,18,3,9


In [118]:
mapper={
        'low':0,
        'medium':1,
        'high':2
       }

In [119]:
train['interest_level'] = train['interest_level'].apply(lambda x: mapper[x])

In [120]:
train['interest_level'].value_counts(normalize=True)

1    0.341944
2    0.333194
0    0.324861
Name: interest_level, dtype: float64

In [121]:
features

['bathrooms',
 'bedrooms',
 'latitude',
 'longitude',
 'price',
 'features_cnt',
 'photos_cnt',
 'created_day',
 'created_hour',
 'num_description_words']

In [122]:
scaler = StandardScaler()
scaled_train = scaler.fit_transform(train[features])

In [123]:
len(features)

10

In [124]:
pd.DataFrame(scaled_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0,7200.0
mean,-2.171103e-16,2.3684760000000003e-17,-9.349163e-14,2.513348e-13,6.315935e-17,-3.157968e-17,-9.079157e-17,-9.868649000000001e-17,3.157968e-17,3.94746e-17
std,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069,1.000069
min,-4.003514,-1.375377,-4.361907,-8.405899,-2.230992,-1.150702,-1.701829,-1.664405,-1.207655,-1.669034
25%,-0.271842,-0.3813282,-0.5663653,-0.6616396,-0.7463233,-0.8861477,-0.5306966,-0.8314917,-0.7307007,-0.7005208
50%,-0.271842,-0.3813282,0.004953152,-0.2008713,-0.1593612,-0.3570385,-0.140319,0.00142124,-0.2537464,-0.09282637
75%,-0.271842,0.6127207,0.5713036,0.6184586,0.5311825,0.70118,0.6404361,0.8343342,0.223208,0.5338585
max,7.191503,2.600819,3.596807,7.457936,10.77096,4.934054,10.39987,1.905222,4.27732,9.003599


In [125]:
X_train, X_test, y_train, y_test = train_test_split(scaled_train, train[target],
                                                    test_size=0.3, random_state=42)

In [126]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5040, 10), (2160, 10), (5040,), (2160,))

# LogisticRegression

In [127]:
lg = LogisticRegression(random_state = 42)
lg.fit(X_train, y_train)
predicts = lg.predict(X_test)
predicts_proba = lg.predict_proba(X_test)

In [128]:
accuracy_score(y_test, predicts)

0.5370370370370371

In [129]:
f1_score(y_test, predicts, average='macro')

0.5321710274088832

In [130]:
print(classification_report(y_test, predicts))

              precision    recall  f1-score   support

           0       0.60      0.64      0.62       724
           1       0.43      0.37      0.40       721
           2       0.56      0.60      0.58       715

    accuracy                           0.54      2160
   macro avg       0.53      0.54      0.53      2160
weighted avg       0.53      0.54      0.53      2160



# Linear SVC

In [131]:
scv = LinearSVC(random_state = 42)
scv.fit(X_train, y_train)
predicts = scv.predict(X_test)



In [132]:
print(classification_report(y_test, predicts))

              precision    recall  f1-score   support

           0       0.56      0.69      0.62       724
           1       0.42      0.24      0.30       721
           2       0.54      0.65      0.59       715

    accuracy                           0.52      2160
   macro avg       0.51      0.52      0.50      2160
weighted avg       0.51      0.52      0.50      2160



# KNeighborsClassifier

In [133]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
predicts = knn.predict(X_test)

In [134]:
print(classification_report(y_test, predicts))

              precision    recall  f1-score   support

           0       0.49      0.58      0.53       724
           1       0.36      0.33      0.34       721
           2       0.48      0.43      0.45       715

    accuracy                           0.45      2160
   macro avg       0.44      0.45      0.44      2160
weighted avg       0.44      0.45      0.44      2160



# DecisionTreeClassifier

In [135]:
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt.fit(X_train, y_train)
predicts = dt.predict(X_test)

In [136]:
print(classification_report(y_test, predicts))

              precision    recall  f1-score   support

           0       0.54      0.52      0.53       724
           1       0.37      0.37      0.37       721
           2       0.49      0.51      0.50       715

    accuracy                           0.47      2160
   macro avg       0.47      0.47      0.47      2160
weighted avg       0.47      0.47      0.47      2160



In [137]:
X_train, X_test, y_train, y_test = train_test_split(train[features], train[target],
                                                    test_size=0.3, random_state=42)

In [138]:
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(X_train, y_train)
predicts = dt.predict(X_test)

In [139]:
print(classification_report(y_test, predicts))

              precision    recall  f1-score   support

           0       0.50      0.67      0.57       724
           1       0.37      0.41      0.39       721
           2       0.57      0.30      0.39       715

    accuracy                           0.46      2160
   macro avg       0.48      0.46      0.45      2160
weighted avg       0.48      0.46      0.45      2160



In [140]:
from ipywidgets import Image
from io import StringIO
import pydotplus
from sklearn.tree import export_graphviz

dot_data = StringIO()
export_graphviz(dt, feature_names=features, 
                out_file=dot_data, filled=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(value=graph.create_png())

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x06\xd2\x00\x00\x01\xf1\x08\x02\x00\x00\x00\x7f\x01\…

При такой визуализации чем больше объектов одного класса, тем цвет вершины ближе к темно-оранжевому и, наоборот, чем больше объектов второго класса, тем ближе цвет к темно-фиолетовому(при бинарной темно-синему). Если объектов одного касса поровну, то корневая вершина дерева – белого цвета. Зеленая - больше объектов 3 класса.

In [141]:
from ipywidgets import Image
from io import StringIO
import pydotplus
from sklearn.tree import export_graphviz

dot_data = StringIO()
export_graphviz(dt, feature_names=features, 
                out_file=dot_data, filled=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(value=graph.create_png())

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x06\xd2\x00\x00\x01\xf1\x08\x02\x00\x00\x00\x7f\x01\…

Видим что 2(high) класс вообще не определяется.

# RandomForestClassifier

In [142]:
rf = RandomForestClassifier(n_estimators = 100, random_state=42)
rf.fit(X_train, y_train)
predicts = rf.predict(X_test)

In [143]:
print(classification_report(y_test, predicts))

              precision    recall  f1-score   support

           0       0.64      0.62      0.63       724
           1       0.43      0.44      0.44       721
           2       0.59      0.60      0.59       715

    accuracy                           0.55      2160
   macro avg       0.55      0.55      0.55      2160
weighted avg       0.55      0.55      0.55      2160



# XGBClassifier

In [144]:
xgb = XGBClassifier(seed=42)
xgb.fit(X_train, y_train)
predicts = xgb.predict(X_test)

In [145]:
print(classification_report(y_test, predicts))

              precision    recall  f1-score   support

           0       0.68      0.61      0.64       724
           1       0.43      0.45      0.44       721
           2       0.56      0.61      0.59       715

    accuracy                           0.55      2160
   macro avg       0.56      0.55      0.56      2160
weighted avg       0.56      0.55      0.56      2160



## 3. Подбор гиперпараметров

Для этого задания подберите гиперпараметры как минимум для 3 алгоритмов из протестированных выше. Используйте Grid Search или Random Search, в качестве метрики передавайте f1_score

Не забывайте разбивать выборку на фолды, например с помощью StratifiedShuffleSplit.

In [146]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
params = {
    'max_depth': [3, 4, None, 5],
    "min_samples_leaf": list(range(1, 5))
}

rf = RandomForestClassifier(n_estimators=100, random_state=42, 
                             n_jobs=-1, oob_score=True)



dt_random_search1 = RandomizedSearchCV(rf, params, n_jobs=-1, cv=skf, verbose=1)
dt_random_search1.fit(X_train, y_train)

print(dt_random_search1.best_estimator_)
print(dt_random_search1.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
RandomForestClassifier(min_samples_leaf=3, n_jobs=-1, oob_score=True,
                       random_state=42)
{'min_samples_leaf': 3, 'max_depth': None}


In [147]:
dt = DecisionTreeClassifier(random_state=42)

dt_random_search2 = RandomizedSearchCV(dt, params, n_jobs=-1, cv=skf, verbose=1)
dt_random_search2.fit(X_train, y_train)

print(dt_random_search2.best_estimator_)
print(dt_random_search2.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
DecisionTreeClassifier(min_samples_leaf=3, random_state=42)
{'min_samples_leaf': 3, 'max_depth': None}


In [148]:
xgb = XGBClassifier(n_estimators=100, random_state=42)

dt_random_search = RandomizedSearchCV(xgb, params, n_jobs=-1, cv=skf, verbose=1)
dt_random_search.fit(X_train, y_train)

print(dt_random_search.best_estimator_)
print(dt_random_search.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


6 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/xgboost/sklearn.py", line 728, in fit
    self._Booster = train(xgb_options, train_dmatrix, self.get_num_boosting_rounds(),
  File "/usr/local/lib/python3.8/dist-packages/xgboost/training.py", line 212, in train
    return _train_internal(params, dtrain,
  File "/usr/local/lib/python3.8/dist-packages/xgboost/training.py", line 74, in _train_internal
    bst.update(dtrain, 

XGBClassifier(max_depth=4, min_samples_leaf=1, objective='multi:softprob',
              random_state=42)
{'min_samples_leaf': 1, 'max_depth': 4}


## Итоговое решение и сабмит 

Далее должно приводиться ваше итоговое решение, с лучшей метрикой на лидерборде, которой вам удалось добится. Можете сделать два ноутбука если так удобнее.

Выше произведенные действия не гарантируют хороший результат на лидерборде, для него нужно поэксперементировать самостоятельно ;) 

**! ВАЖНО: ваш итоговый результат должен быть как минимум выше baseline.**

## Baseline

In [149]:
xgb = dt_random_search.best_estimator_
xgb.fit(train[features], train[target])

XGBClassifier(max_depth=4, min_samples_leaf=1, objective='multi:softprob',
              random_state=42)

## Подготовка submission

В плане предобработки\генерации признаков делаем все тоже самое. Потом предсказываем моделью значения.

In [150]:
test.columns

Index(['bathrooms', 'bedrooms', 'building_id', 'created', 'description',
       'display_address', 'features', 'latitude', 'listing_id', 'longitude',
       'manager_id', 'photos', 'price', 'street_address'],
      dtype='object')

In [151]:
test['features']=test['features'].str.replace('[\[\]\']', '').str.split(', ')
test['features_cnt']=test['features'].apply(len)
test['photos']=test['photos'].str.replace('[\[\]\']', '').str.split(', ')
test['photos_cnt']=test['photos'].apply(len)

test["price_lvl"] = test.price.apply(price_lvl)
test["created"] = pd.to_datetime(test["created"])
test["created_day"] = test["created"].dt.day
test["created_hour"] = test["created"].dt.hour

test["num_description_words"]=test["description"].fillna('').apply(lambda x: len(x.split(" ")))


  test['features']=test['features'].str.replace('[\[\]\']', '').str.split(', ')
  test['photos']=test['photos'].str.replace('[\[\]\']', '').str.split(', ')


In [153]:
test[target]=xgb.predict(test[features])

In [154]:
test[target].value_counts()

0    4928
1    3107
2    1836
Name: interest_level, dtype: int64

In [155]:
test[['listing_id', target]].to_csv('submission.csv', index=None)

 <img src='https://i.gifer.com/Xbb1.gif'><br>