## Load Modules and Packages

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

# Seed NumPy's global RNG so later np.random draws in this notebook are
# repeatable on a fresh "Restart & Run All".
np.random.seed(0)

# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
# deprecated the bare "seaborn" name, which newer releases no longer accept.
# Prefer the new name and fall back to the old one only on older matplotlib.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
%matplotlib inline

## Load Datasets

In [2]:
# Ratings table (title, rating, user rating score, ...), parsed with the
# pure-Python CSV engine.
df_ratings = pd.read_csv(
    "./Netflix Shows.csv",
    sep=",",
    engine="python",
)

# Interest table (group, grand_tot_interests, interest1, interest2, ...).
df_people_interest = pd.read_csv(
    "./kaggle_Interests_group.csv",
    sep=",",
)

In [3]:
# Preview the first rows of the ratings table to check its columns and values.
df_ratings.head()

Unnamed: 0,title,rating,ratingLevel,ratingDescription,release year,user rating score,user rating size
0,White Chicks,PG-13,"crude and sexual humor, language and some drug...",80,2004,82.0,80
1,Lucky Number Slevin,R,"strong violence, sexual content and adult lang...",100,2006,,82
2,Grey's Anatomy,TV-14,Parents strongly cautioned. May be unsuitable ...,90,2016,98.0,80
3,Prison Break,TV-14,Parents strongly cautioned. May be unsuitable ...,90,2008,98.0,80
4,How I Met Your Mother,TV-PG,Parental guidance suggested. May not be suitab...,70,2014,94.0,80


In [4]:
# Preview the first rows of the interest table to check its columns and values.
df_people_interest.head()

Unnamed: 0,group,grand_tot_interests,interest1,interest2,interest3,interest4,interest5,interest6,interest7,interest8,...,interest208,interest209,interest210,interest211,interest212,interest213,interest214,interest215,interest216,interest217
0,C,17,,,,,,,,,...,,,,,,,,,,
1,C,43,1.0,,,,1.0,,,,...,,,1.0,,,,,1.0,1.0,
2,C,27,,,,,,,,,...,,,1.0,,,,,1.0,1.0,
3,C,34,,,,,,,,,...,,,,1.0,,,,1.0,1.0,
4,C,36,,,,,1.0,,,,...,,,1.0,,,,,1.0,1.0,


## Data Wrangling

In [5]:
# Columns of the ratings table that the rest of the notebook uses.
rating_columns = ["title", "rating", "user rating score", "ratingDescription"]

# .loc slices by label and includes the end label, so :200 keeps labels 0..200.
df_ratings = df_ratings.loc[:200, rating_columns]

# .iloc slices by position and excludes the end, so :201 keeps positions 0..200;
# column positions 2..8 are kept.
df_people_interest = df_people_interest.iloc[:201, 2:9]

In [6]:
# Number of rows in the interest table; one random draw is made per row.
size = len(df_people_interest)


def _random_binary_column(_column):
    """Return `size` random integers in {0, 1}; the input column's values are ignored."""
    return np.random.randint(2, size=size)


# Replace every column with freshly drawn 0/1 values (column by column).
df_people_interest = df_people_interest.apply(_random_binary_column, axis=0)

In [7]:
# Index-aligned left join: each ratings row gets the interest columns that share its index label.
df_people_rating_set = df_ratings.join(df_people_interest, how="left")

In [8]:
# Replace missing user rating scores with the mean score of the ratings table.
# The mean is taken directly from the single column, which yields a scalar and
# avoids the deprecated positional `[0]` lookup on a label-indexed Series.
mean_user_rating = df_ratings["user rating score"].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is not guaranteed to update the
# parent frame (it has no effect under pandas Copy-on-Write).
df_people_rating_set["user rating score"] = df_people_rating_set["user rating score"].fillna(mean_user_rating)

In [9]:
# Check column dtypes and non-null counts of the joined frame after the fillna step.
df_people_rating_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              201 non-null    object 
 1   rating             201 non-null    object 
 2   user rating score  201 non-null    float64
 3   ratingDescription  201 non-null    int64  
 4   interest1          201 non-null    int64  
 5   interest2          201 non-null    int64  
 6   interest3          201 non-null    int64  
 7   interest4          201 non-null    int64  
 8   interest5          201 non-null    int64  
 9   interest6          201 non-null    int64  
 10  interest7          201 non-null    int64  
dtypes: float64(1), int64(8), object(2)
memory usage: 17.4+ KB


In [10]:
# Preview the joined frame: the selected rating columns followed by the interest columns.
df_people_rating_set.head()

Unnamed: 0,title,rating,user rating score,ratingDescription,interest1,interest2,interest3,interest4,interest5,interest6,interest7
0,White Chicks,PG-13,82.0,80,0,0,0,1,0,0,0
1,Lucky Number Slevin,R,87.699187,100,1,1,0,1,1,1,0
2,Grey's Anatomy,TV-14,98.0,90,1,0,1,1,1,0,0
3,Prison Break,TV-14,98.0,90,0,0,1,0,0,1,0
4,How I Met Your Mother,TV-PG,94.0,70,1,1,1,0,0,1,0


In [11]:
# Columns excluded from the clustering features.
non_feature_columns = ["title", "rating", "ratingDescription"]

# Drop them in place, then expose the same frame as X. X is an alias, not a
# copy, so df_people_rating_set itself no longer has these columns either.
df_people_rating_set.drop(columns=non_feature_columns, inplace=True)
X = df_people_rating_set

In [12]:
# Fit the scaler on the feature frame, then standardise it. X is rebound to the
# scaled result; df_people_rating_set keeps its unscaled values.
scalar = StandardScaler().fit(X)
X = scalar.transform(X)

### K-Means Modeling

In [13]:
# Fit a 5-cluster k-means model on the scaled features, with n_init=100.
# fit() returns the estimator itself, so the fitted model is bound directly and
# then shown as the cell's output.
model = KMeans(n_clusters=5, n_init=100).fit(X)
model

KMeans(n_clusters=5, n_init=100)

In [14]:
# Cluster index assigned to each row of X, in row order.
print(model.labels_)

[1 3 0 1 3 3 0 0 1 3 3 1 1 0 1 1 4 1 2 0 0 0 4 2 3 1 0 3 4 4 3 0 0 0 3 3 1
 4 0 0 4 1 3 3 4 4 0 0 0 4 4 3 1 4 2 4 4 1 1 3 4 0 0 0 4 4 1 4 4 4 4 4 0 1
 1 4 1 3 0 3 3 1 1 0 3 1 1 2 3 3 3 0 4 0 1 0 0 3 2 3 1 4 0 0 2 2 3 2 3 3 1
 2 4 4 4 4 3 4 3 3 4 3 1 1 1 0 4 4 4 0 3 4 2 3 3 4 3 1 0 0 0 4 3 1 4 0 1 2
 1 3 0 3 1 2 1 1 4 3 2 1 3 1 4 1 1 1 0 4 3 2 1 4 1 0 4 3 2 0 4 2 1 4 0 4 3
 2 1 4 3 1 3 4 2 3 4 4 0 1 3 1 4]


In [15]:
# Centroids, one row per cluster. Values are in the standardised feature space
# because the model was fit on the StandardScaler output.
print(model.cluster_centers_)

[[ 0.3068329  -0.06493261 -1.11055542 -0.22519406 -0.06493261  1.07763181
  -0.32522502  0.01547835]
 [ 0.10326131  0.00635193 -1.11055542  0.0461665  -0.036206   -0.92796073
   0.08873287 -0.23435299]
 [-2.46741177 -0.12605052  0.56528271  0.24717433 -0.23717401  0.07483554
   0.02488332  0.80720404]
 [ 0.31514917  0.11552228  0.90045034  0.15535848 -0.01492704 -0.05596397
   0.11186676 -1.0884772 ]
 [ 0.25579905 -0.01492704  0.90045034 -0.09515382  0.18509524  0.03472369
   0.0648957   0.9187147 ]]


In [16]:
# Attach each row's cluster label. The labels line up row-for-row because X was
# derived from this same frame.
df_people_rating_set["k_clusters"] = model.labels_

In [17]:
# Preview the feature columns together with the new k_clusters label.
df_people_rating_set.head()

Unnamed: 0,user rating score,interest1,interest2,interest3,interest4,interest5,interest6,interest7,k_clusters
0,82.0,0,0,0,1,0,0,0,1
1,87.699187,1,1,0,1,1,1,0,3
2,98.0,1,0,1,1,1,0,0,0
3,98.0,0,0,1,0,0,1,0,1
4,94.0,1,1,1,0,0,1,0,3


In [18]:
# Mean of every remaining (unscaled) column within each cluster, to profile the clusters.
df_people_rating_set.groupby("k_clusters").mean()

Unnamed: 0_level_0,user rating score,interest1,interest2,interest3,interest4,interest5,interest6,interest7
k_clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,90.432114,0.475,0.0,0.375,0.475,1.0,0.325,0.55
1,88.618924,0.510638,0.0,0.510638,0.489362,0.0,0.531915,0.425532
2,65.722222,0.444444,0.833333,0.611111,0.388889,0.5,0.5,0.944444
3,90.506186,0.565217,1.0,0.565217,0.5,0.434783,0.543478,0.0
4,89.977561,0.5,1.0,0.44,0.6,0.48,0.52,1.0


### Scatter Plot

In [22]:
# Create the figure for this section's plot, with figsize=(40, 20).
fig = plt.figure(figsize=(40, 20))

