In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
#links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.shape, tags.shape, ratings.shape

((9742, 3), (3683, 4), (100836, 4))

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Самые активные пользователи

In [6]:
ratings.userId.value_counts().head(3)

414    2698
599    2478
474    2108
Name: userId, dtype: int64

In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
movies_with_tags = pd.merge(movies, tags, on='movieId', how='left')
movies_with_tags.title.unique().shape, movies_with_tags.shape, tags.shape

((9737,), (11853, 6), (3683, 4))

In [9]:
#movies_with_tags.isna().any()
#movies_with_tags.loc[(movies_with_tags.userId.isna()) & (movies_with_tags.tag.isna()) & (movies_with_tags.timestamp.isna()), :]
#movies_with_tags.loc[movies_with_tags.movieId == 1357]
#ratings.loc[ratings.movieId == 1357]

### Вопрос: куда делись 5 фильмов? 9742  до объединения и 9737 после

### Собираем и объединяем теги для каждого фильма от разных пользователей

In [10]:
tag_strings = []
movies_list = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies_list.append(movie)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=9737.0), HTML(value='')))




In [11]:
movies_list[:10]

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)",
 "'Tis the Season for Love (2015)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)']

In [12]:
tag_strings[:10]

['nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'nan']

### Уберём nan

In [13]:
tag_strings = [str(s).replace('nan', '') for s in tag_strings]

In [14]:
len(movies_list), len(tag_strings)

(9737, 9737)

In [15]:
df = pd.DataFrame(movies_list, columns = ['title'])
df['tag'] = tag_strings
df.head(10)

Unnamed: 0,title,tag
0,'71 (2014),
1,'Hellboy': The Seeds of Creation (2004),
2,'Round Midnight (1986),
3,'Salem's Lot (2004),
4,'Til There Was You (1997),
5,'Tis the Season for Love (2015),
6,"'burbs, The (1989)",
7,'night Mother (1986),
8,(500) Days of Summer (2009),artistic Funny humorous inspiring intelligent ...
9,*batteries not included (1987),


### Объедним с предыдущим dataframe'ом и удалим лишнее

In [16]:
movies_with_tags = pd.DataFrame.merge(movies_with_tags, df, on='title')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag_x,timestamp,tag_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,pixar pixar fun
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,pixar pixar fun
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,pixar pixar fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,fantasy magicboardgame RobinWilliams game
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,fantasy magicboardgame RobinWilliams game


In [17]:
def change_string(s):
    return '|' + s.replace(' ', '|')

In [18]:
movies_with_tags['tag_y'] = movies_with_tags['tag_y'].apply(change_string) 
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag_x,timestamp,tag_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,|pixar|pixar|fun
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,|pixar|pixar|fun
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,|pixar|pixar|fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,|fantasy|magicboardgame|RobinWilliams|game
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,|fantasy|magicboardgame|RobinWilliams|game


In [19]:
movies_with_tags['tags_genres'] = movies_with_tags['genres'] + '|' + movies_with_tags['tag_y']
movies_with_tags.drop(['genres', 'tag_x', 'tag_y'], axis=1, inplace=True)
movies_with_tags.head()

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
0,1,Toy Story (1995),336.0,1139046000.0,Adventure|Animation|Children|Comedy|Fantasy||p...
1,1,Toy Story (1995),474.0,1137207000.0,Adventure|Animation|Children|Comedy|Fantasy||p...
2,1,Toy Story (1995),567.0,1525286000.0,Adventure|Animation|Children|Comedy|Fantasy||p...
3,2,Jumanji (1995),62.0,1528844000.0,Adventure|Children|Fantasy||fantasy|magicboard...
4,2,Jumanji (1995),62.0,1528844000.0,Adventure|Children|Fantasy||fantasy|magicboard...


In [20]:
def change_string(s):
    return s.replace('|', ' ')

In [21]:
movies_with_tags['tags_genres'] = movies_with_tags['tags_genres'].apply(change_string) 
movies_with_tags.head()

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
0,1,Toy Story (1995),336.0,1139046000.0,Adventure Animation Children Comedy Fantasy p...
1,1,Toy Story (1995),474.0,1137207000.0,Adventure Animation Children Comedy Fantasy p...
2,1,Toy Story (1995),567.0,1525286000.0,Adventure Animation Children Comedy Fantasy p...
3,2,Jumanji (1995),62.0,1528844000.0,Adventure Children Fantasy fantasy magicboard...
4,2,Jumanji (1995),62.0,1528844000.0,Adventure Children Fantasy fantasy magicboard...


### Добавим средние оценки юзеров, а потом и фильмов

In [22]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [23]:
df = ratings.groupby(['userId']).mean('rating').reset_index().drop(['timestamp', 'movieId'], axis=1)
df.head()

Unnamed: 0,userId,rating
0,1,4.366379
1,2,3.948276
2,3,2.435897
3,4,3.555556
4,5,3.636364


In [24]:
### Теперь средний рейтинг нужно обратно добавить в ratings, чтобы потом соединить с movies_with_tags по movieId

In [25]:
ratings = pd.DataFrame.merge(ratings, df, on='userId')

In [26]:
ratings.rename(columns={'rating_x': 'rating', 'rating_y': 'avg_user_rating' }, inplace=True)

In [27]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,avg_user_rating
0,1,1,4.0,964982703,4.366379
1,1,3,4.0,964981247,4.366379
2,1,6,4.0,964982224,4.366379
3,1,47,5.0,964983815,4.366379
4,1,50,5.0,964982931,4.366379


In [28]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,avg_user_rating
0,1,1,4.0,964982703,4.366379
1,1,3,4.0,964981247,4.366379
2,1,6,4.0,964982224,4.366379
3,1,47,5.0,964983815,4.366379
4,1,50,5.0,964982931,4.366379


In [29]:
ratings = pd.DataFrame.merge(ratings, movies_with_tags, on=['movieId'])
ratings.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,avg_user_rating,title,userId_y,timestamp_y,tags_genres
0,1,1,4.0,964982703,4.366379,Toy Story (1995),336.0,1139046000.0,Adventure Animation Children Comedy Fantasy p...
1,1,1,4.0,964982703,4.366379,Toy Story (1995),474.0,1137207000.0,Adventure Animation Children Comedy Fantasy p...
2,1,1,4.0,964982703,4.366379,Toy Story (1995),567.0,1525286000.0,Adventure Animation Children Comedy Fantasy p...
3,5,1,4.0,847434962,3.636364,Toy Story (1995),336.0,1139046000.0,Adventure Animation Children Comedy Fantasy p...
4,5,1,4.0,847434962,3.636364,Toy Story (1995),474.0,1137207000.0,Adventure Animation Children Comedy Fantasy p...


In [30]:
#переименуем столбцы
ratings.rename(columns={'userId_x': 'userId', 'timestamp_x': 'timestamp_rating'}, inplace=True)
ratings.drop(['userId_y', 'timestamp_y'], axis=1, inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp_rating,avg_user_rating,title,tags_genres
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...
1,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...
2,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...
4,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...


### Удаляю дубликаты

In [31]:
ratings.drop_duplicates(inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp_rating,avg_user_rating,title,tags_genres
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...
6,7,1,4.5,1106635946,3.230263,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...
9,15,1,2.5,1510577970,3.448148,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...
12,17,1,4.5,1305696483,4.209524,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...


### Средняя оценка за фильм

In [32]:
df = ratings.groupby('movieId').mean('rating').reset_index().drop(['userId', 'timestamp_rating', 'avg_user_rating'], axis=1)
df.head()

Unnamed: 0,movieId,rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [33]:
ratings = ratings.merge(df, on='movieId')
ratings.rename(columns={'rating_y': 'avg_movie_ratings'}, inplace=True)

In [34]:
ratings.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
1,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
2,7,1,4.5,1106635946,3.230263,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
3,15,1,2.5,1510577970,3.448148,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
4,17,1,4.5,1305696483,4.209524,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093


### Надём id самого активного пользователя

In [35]:
ratings.userId.value_counts().head(3)

414    2698
599    2478
474    2108
Name: userId, dtype: int64

In [36]:
ratings[ratings.userId == 414].head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
146,414,1,4.0,961438127,3.391957,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
251,414,3,4.0,961439278,3.391957,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615
336,414,6,3.0,961515642,3.391957,Heat (1995),Action Crime Thriller,3.946078
511,414,47,4.0,961681857,3.391957,Seven (a.k.a. Se7en) (1995),Mystery Thriller mystery twistending serialki...,3.975369
717,414,50,5.0,961681714,3.391957,"Usual Suspects, The (1995)",Crime Mystery Thriller mindfuck suspense thri...,4.237745


In [37]:
user_fav_movies = ratings[(ratings.userId == 414) & (ratings.rating_x > 3)]

In [38]:
user_not_watched_movies = ratings[ratings.userId != 414]

In [39]:
user_not_watched_movies.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
1,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
2,7,1,4.5,1106635946,3.230263,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
3,15,1,2.5,1510577970,3.448148,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
4,17,1,4.5,1305696483,4.209524,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093


### Алгоритм
1. Определить фильмы, которые пользователь 414 ещё не смотрел
2. Обучить модель на всём дата сете. То есть рассчитываем расстояние между каждым фильмом
3. Рассчитать расстояние от фильмов, который пользователь 414 ещё не смотрел, до фильмов на всём датасете
4. Определить фильмы с кратчайшим расстоянием

### Отберём теги фильмов пользователя 414, которым он поставил оценку (> 3)

In [40]:
user414_watched = [change_string(t_g) for t_g in user_fav_movies.tags_genres.values]
user414_watched[:5], len(user414_watched)

(['Adventure Animation Children Comedy Fantasy  pixar pixar fun',
  'Comedy Romance  moldy old',
  'Mystery Thriller  mystery twistending serialkiller',
  'Crime Mystery Thriller  mindfuck suspense thriller tricky twistending heist',
  'Adventure Comedy Crime Romance  crime offbeatcomedy quirky'],
 1459)

In [41]:
### удалим дубликаты

In [42]:
user414_watched = list(dict.fromkeys(user414_watched))
user414_watched[:10], len(user414_watched)

(['Adventure Animation Children Comedy Fantasy  pixar pixar fun',
  'Comedy Romance  moldy old',
  'Mystery Thriller  mystery twistending serialkiller',
  'Crime Mystery Thriller  mindfuck suspense thriller tricky twistending heist',
  'Adventure Comedy Crime Romance  crime offbeatcomedy quirky',
  'Action Drama War  beautifulscenery epic historical inspirational Medieval melgibson Oscar(BestCinematography) revenge swordfight Scotland',
  'Action Drama Romance War  ',
  'Comedy War  ',
  'Action Romance Western  ',
  'Comedy  cynical hilarious independentfilm quirky witty generationX'],
 845)

### Сделаем тоже, для тех фильмов, которые пользователь не смотрел

In [43]:
user414_not_watched = [change_string(t_g) for t_g in user_not_watched_movies.tags_genres.values]
user414_not_watched[:5], len(user414_not_watched)

(['Adventure Animation Children Comedy Fantasy  pixar pixar fun',
  'Adventure Animation Children Comedy Fantasy  pixar pixar fun',
  'Adventure Animation Children Comedy Fantasy  pixar pixar fun',
  'Adventure Animation Children Comedy Fantasy  pixar pixar fun',
  'Adventure Animation Children Comedy Fantasy  pixar pixar fun'],
 98138)

In [44]:
user414_not_watched = list(dict.fromkeys(user414_not_watched))
user414_not_watched[:10], len(user414_not_watched)

(['Adventure Animation Children Comedy Fantasy  pixar pixar fun',
  'Comedy Romance  moldy old',
  'Action Crime Thriller  ',
  'Mystery Thriller  mystery twistending serialkiller',
  'Crime Mystery Thriller  mindfuck suspense thriller tricky twistending heist',
  'Action Comedy Horror Thriller  ',
  'Adventure Comedy Crime Romance  crime offbeatcomedy quirky',
  'Action Drama War  beautifulscenery epic historical inspirational Medieval melgibson Oscar(BestCinematography) revenge swordfight Scotland',
  'Action Drama Romance War  ',
  'Comedy War  '],
 2266)

In [45]:
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(summary_vector)
# X_train_counts.todense()[:5]
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# np.unique(X_train_tfidf.toarray())
# X_train_tfidf.shape

### Определяем расстояние между фильмами на всём датасете

In [46]:
summary_vector = [change_string(t_g) for t_g in ratings.tags_genres.values]
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(summary_vector)

In [47]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [48]:
neigh = NearestNeighbors(n_neighbors=1, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=1)

In [49]:
X_train_tfidf.todense(), np.unique(X_train_tfidf.toarray()), len(np.unique(X_train_tfidf.toarray()))

(matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
 array([0.        , 0.02308876, 0.03753711, ..., 0.98772049, 0.98820538,
        1.        ]),
 9410)

### Посчитаем расстояние от фильмов, которые понравились пользователю 414 до фильмов, которые он не смотрел

In [76]:
res = []
for not_watched in tqdm_notebook(user414_not_watched):
    print(not_watched)
    predict = count_vect.transform([not_watched])
    X_train_tfidf = tfidf_transformer.transform(predict)
    res.append(neigh.kneighbors(X_train_tfidf, return_distance=True))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=2266.0), HTML(value='')))

Adventure Animation Children Comedy Fantasy  pixar pixar fun
Comedy Romance  moldy old
Action Crime Thriller  
Mystery Thriller  mystery twistending serialkiller
Crime Mystery Thriller  mindfuck suspense thriller tricky twistending heist
Action Comedy Horror Thriller  
Adventure Comedy Crime Romance  crime offbeatcomedy quirky
Action Drama War  beautifulscenery epic historical inspirational Medieval melgibson Oscar(BestCinematography) revenge swordfight Scotland
Action Drama Romance War  
Comedy War  
Action Romance Western  
Comedy  school AdamSandler stoplookingatmeswan
Comedy  cynical hilarious independentfilm quirky witty generationX
Adventure Comedy  
Comedy Drama  moviebusiness
Action Adventure Sci-Fi  classic spaceaction action scifi EPIC greatsoundtrack classicscifi engrossingadventure EPIC classic Nerd classicscifi spaceaction spaceepic oldiebutgoodie scifi StarWars ROBOTSANDANDROIDS space classicscifi scifi spaceadventure StarWars darthvader lukeskywalker spaceopera
Comedy Cr

KeyboardInterrupt: 

Правильно я понимаю, что лучше было переопределить датасет и исключить из него фильмы, которые пользотватель посмотрел, чтобы ускорить решение задачи?

In [51]:
# res = []
# for tag in tqdm_notebook(tag_strings):
#     test = tag
#     predict = count_vect.transform([test])
#     X_train_tfidf = tfidf_transformer.transform(predict)
#     res.append(neigh.kneighbors(X_train_tfidf, return_distance=True))

In [63]:
res[:5]

[(array([[0.]]), array([[0]], dtype=int64)),
 (array([[0.]]), array([[215]], dtype=int64)),
 (array([[0.]]), array([[267]], dtype=int64)),
 (array([[0.]]), array([[369]], dtype=int64)),
 (array([[0.]]), array([[572]], dtype=int64))]

In [64]:
res[0][0][0][0], res[0][1][0][0]

(0.0, 0)

In [65]:
movies = {}
for item in res:
    movies[item[0][0][0]] = item[1][0][0]

### Найдём топ 20 фильмов с кратчайшим расстоянием

In [66]:
list_keys = list(movies.keys())
list_keys.sort()
list_keys[:20]


[0.0]

In [67]:
rec_movies_ids = []
for key in movies:
    for item in list_keys[:20]:
        if item == key:
            rec_movies_ids.append(movies[key])
#     rec_movies_ids.append(movies[key])

In [68]:
rec_movies_ids

[100815]

### Что в итоге будем рекомендовать пользователю 414

In [58]:
ratings[ratings['movieId'].isin(rec_movies_ids)]['title'].unique()

array([], dtype=object)

### Часть 2. Предсказание

In [59]:
from sklearn.tree import DecisionTreeClassifier

In [60]:
ratings.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
1,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
2,7,1,4.5,1106635946,3.230263,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
3,15,1,2.5,1510577970,3.448148,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093
4,17,1,4.5,1305696483,4.209524,Toy Story (1995),Adventure Animation Children Comedy Fantasy p...,3.92093


In [61]:
clf = DecisionTreeClassifier(max_depth=5)

In [62]:
Xcut = X[X.columns[2:4]]

NameError: name 'X' is not defined

In [None]:
clf.fit(Xcut, y)