In [1]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import pandas as pd
file = pd.read_csv('H:\Downloads\movies.csv')

In [3]:
import datetime
import re
from sklearn.preprocessing import OneHotEncoder

## Drop the column of homepages

In [4]:
file = file.drop(['homepage'],axis = 1) # Drop the rows where the blank for homepage is empty
file.isnull().sum()

budget                    0
genres                    0
id                        0
keywords                  0
original_language         0
original_title            0
overview                  3
popularity                0
production_companies      0
production_countries      0
release_date              1
revenue                   0
runtime                   2
spoken_languages          0
status                    0
tagline                 844
title                     0
vote_average              0
vote_count                0
dtype: int64

## Fill in the realise date and run time

In [5]:
file['release_date'] = file['release_date'].fillna(method = 'ffill')
file['runtime'] = file['runtime'].fillna(method = 'ffill')

In [6]:
file.isnull().sum()

budget                    0
genres                    0
id                        0
keywords                  0
original_language         0
original_title            0
overview                  3
popularity                0
production_companies      0
production_countries      0
release_date              0
revenue                   0
runtime                   0
spoken_languages          0
status                    0
tagline                 844
title                     0
vote_average              0
vote_count                0
dtype: int64

## Use the title to substitute the missing value of overview and tagline

In [8]:
file['overview'].loc[file['overview'].isna()] = file['title']
file['tagline'].loc[file['tagline'].isna()] = file['title']
file.isnull().sum()

budget                  0
genres                  0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
tagline                 0
title                   0
vote_average            0
vote_count              0
dtype: int64

## Seperate the date into year, month and day

In [9]:
file['year'] = file['month'] = file['day'] = 0
for i in file['release_date']:
    file['year'].loc[file['release_date']==i] = datetime.datetime.strptime(i, '%d-%m-%y').year
    file['month'].loc[file['release_date']==i] = datetime.datetime.strptime(i, '%d-%m-%y').month
    file['day'].loc[file['release_date']==i] = datetime.datetime.strptime(i, '%d-%m-%y').day

## Drop the original title

In [10]:
file = file.drop(['original_title'],axis = 1) # Drop the rows where the blank for homepage is empty
file.isnull().sum()

budget                  0
genres                  0
id                      0
keywords                0
original_language       0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
tagline                 0
title                   0
vote_average            0
vote_count              0
year                    0
month                   0
day                     0
dtype: int64

## Hot-encode the orginal language and status

In [11]:
enc = OneHotEncoder()

In [12]:
ol_mapping = {'en': 1, 'ja': 2, 'fr': 3, 'zh': 4, 'es': 5, 'de': 6, 'hi': 7, 'ru': 8, 'ko': 9, 'te': 10, 'cn': 11,
       'it': 12, 'nl': 13, 'ta': 14, 'sv': 15, 'th': 16, 'da': 17, 'xx': 18, 'hu': 19, 'cs': 20, 'pt': 21, 'is': 22,
       'tr': 23, 'nb': 24, 'af': 25, 'pl': 26, 'he': 27, 'ar': 28, 'vi': 29, 'ky': 30, 'id': 31, 'ro': 32, 'fa': 33,
       'no': 34, 'sl': 35, 'ps': 36, 'el': 37}
file['original_language'] = file['original_language'].map(ol_mapping)

In [13]:
status_mapping = {'Released': 1, 'Post Production': 2, 'Rumored': 3}
file['status'] = file['status'].map(status_mapping)

In [14]:
onehot_features = ['original_language', 'status']
enc.fit(file[onehot_features])

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [15]:
enc_res = enc.transform(file[onehot_features])
file = pd.concat([file, pd.DataFrame(enc_res.toarray())], axis=1)

## One-hot-code the genres manually

In [16]:
file['genres_index'] = '0'
for i in file.genres:
        if i != '[]':
            a = i.split(', {')
            for k in range(1, len(a)):
                a[k] = '{' + a[k]
            a[0] = a[0][1:]
            a[len(a)-1] = a[len(a)-1][:-1]
            b = ['' for i in range(len(a))]
            for q in range(len(a)):
                b[q] = int(''.join(re.findall(r'"id": ([0-9]*), "name"', a[q])))
            b = ','.join(str(m) for m in b)
            file['genres_index'].loc[file['genres']==i] = b

In [17]:
genres_list = []
for i in file.genres_index:
    a = i.split(',')
    for j in range(len(a)):
        if int(a[j]) not in genres_list and a != '0':
            genres_list.append(int(a[j]))

In [18]:
for i in range(len(genres_list)):
    new_label = 'genre_' + str(i+1)
    file[new_label] = 0

In [19]:
for i in file.genres_index:
    a = i.split(',')
    for j in a:
        for k in range(len(genres_list)):
            if int(j) == genres_list[k] and int(j) != 0:
                file['genre_'+str(k+1)].loc[file['genres_index']==i] = 1

In [20]:
for i in file:
    print(i)

budget
genres
id
keywords
original_language
overview
popularity
production_companies
production_countries
release_date
revenue
runtime
spoken_languages
status
tagline
title
vote_average
vote_count
year
month
day
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
genres_index
genre_1
genre_2
genre_3
genre_4
genre_5
genre_6
genre_7
genre_8
genre_9
genre_10
genre_11
genre_12
genre_13
genre_14
genre_15
genre_16
genre_17
genre_18
genre_19
genre_20
genre_21


## One-hot-code the keywords manually

In [25]:
file['keywords_index'] = '0'
for i in file.keywords:
        if i != '[]':
            a = i.split(', {')
            for k in range(1, len(a)):
                a[k] = '{' + a[k]
            a[0] = a[0][1:]
            a[len(a)-1] = a[len(a)-1][:-1]
            b = ['' for i in range(len(a))]
            for q in range(len(a)):
                b[q] = int(''.join(re.findall(r'"id": ([0-9]*), "name"', a[q])))
            b = ','.join(str(m) for m in b)
            file['keywords_index'].loc[file['keywords']==i] = b

In [26]:
keywords_list = []
for i in file.keywords_index:
    a = i.split(',')
    for j in range(len(a)):
        if int(a[j]) not in keywords_list and a[j] != '0':
            keywords_list.append(int(a[j]))

In [27]:
for i in range(len(keywords_list)):
    file['keywords_'+str(i+1)] = 0

In [28]:
for i in file.keywords_index:
    a = i.split(',')
    for j in a:
        for k in range(len(keywords_list)):
            if int(j) == keywords_list[k] and j != '0':
                file['keywords_'+str(k+1)].loc[file['keywords_index']==i] = 1

## One-hot-code the production companies manually

In [29]:
file['production_companies_index'] = '0'
for i in file.production_companies:
        if i != '[]':
            a = i.split(', {')
            for k in range(1, len(a)):
                a[k] = '{' + a[k]
            a[0] = a[0][1:]
            a[len(a)-1] = a[len(a)-1][:-1]
            b = ['' for i in range(len(a))]
            for q in range(len(a)):
                b[q] = int(''.join(re.findall(r'"id": ([0-9]*)}', a[q])))
            b = ','.join(str(m) for m in b)
            file['production_companies_index'].loc[file['production_companies']==i] = b

In [31]:
production_companies_list = []
for i in file.production_companies_index:
    a = i.split(',')
    for j in range(len(a)):
        if int(a[j]) not in production_companies_list and a[j] != '0':
            production_companies_list.append(int(a[j]))

In [32]:
for i in range(len(production_companies_list)):
    file['production_companies_'+str(i+1)] = 0
for i in file.production_companies_index:
    a = i.split(',')
    for j in a:
        for k in range(len(production_companies_list)):
            if int(j) == production_companies_list[k] and j != '0':
                file['production_companies_'+str(k+1)].loc[file['production_companies_index']==i] = 1

## One-hot-code the production countries manually

In [48]:
file['production_countries_index'] = '0'
for i in file.production_countries:
        if i != '[]':
            a = i.split(', {')
            for k in range(1, len(a)):
                a[k] = '{' + a[k]
            a[0] = a[0][1:]
            a[len(a)-1] = a[len(a)-1][:-1]
            b = ['' for i in range(len(a))]
            for q in range(len(a)):
                b[q] = ''.join(re.findall(r'"iso_3166_1": "(.*)", "', a[q]))
            b = ','.join(str(m) for m in b)
            file['production_countries_index'].loc[file['production_countries']==i] = b

KeyboardInterrupt: 

In [None]:
production_countries_list = []
for i in file.production_countries_index:
    a = i.split(',')
    for j in range(len(a)):
        if a[j] not in production_countries_list and a[j] != '0':
            production_countries_list.append(a[j])

In [35]:
for i in range(len(production_countries_list)):
    file['production_countries_'+str(1+i)] = 0

In [38]:
for i in file.production_countries_index:
    a = i.split(',')
    for j in a:
        for k in range(len(production_countries_list)):
            if j == production_countries_list[k] and j != '0':
                file['production_countries_'+str(k+1)].loc[file['production_countries_index']==i] = 1

## One-hot-code the spoken languages manually

In [49]:
file['spoken_languages_index'] = '0'
for i in file.spoken_languages:
        if i != '[]':
            a = i.split(', {')
            for k in range(1, len(a)):
                a[k] = '{' + a[k]
            a[0] = a[0][1:]
            a[len(a)-1] = a[len(a)-1][:-1]
            b = ['' for i in range(len(a))]
            for q in range(len(a)):
                b[q] = ''.join(re.findall(r'"iso_639_1": "(.*)", "', a[q]))
            b = ','.join(str(m) for m in b)
            file['spoken_languages_index'].loc[file['spoken_languages']==i] = b

In [50]:
spoken_languages_list = []
for i in file.spoken_languages_index:
    a = i.split(',')
    for j in range(len(a)):
        if a[j] not in spoken_languages_list and a[j] != '0':
            spoken_languages_list.append(a[j])

In [51]:
for i in range(len(spoken_languages_list)):
    file['spoken_languages_'+str(1+i)] = 0

In [52]:
for i in file.spoken_languages_index:
    a = i.split(',')
    for j in a:
        for k in range(len(spoken_languages_list)):
            if j == spoken_languages_list[k] and j != '0':
                file['spoken_languages_'+str(k+1)].loc[file['spoken_languages_index']==i] = 1

In [53]:
file.to_csv(r'preprossessed_data.csv')

In [2]:
from sklearn.utils import check_random_state

random_state = check_random_state(None)

In [3]:
random_state.permutation(1400)[:3]

array([772, 439, 141])

In [5]:
file = file[['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']]

In [7]:
n = file.values.shape[0]

In [9]:
s = random_state.permutation(n)[:3]
s

array([2399, 3380, 2741])

In [12]:
file.values[s].copy()

array([[1.7500000e+07, 7.1804030e+00, 7.0181890e+06, 9.3000000e+01,
        5.8000000e+00, 1.6000000e+01],
       [7.0000000e+06, 2.1276864e+01, 6.0389420e+06, 1.1200000e+02,
        7.4000000e+00, 6.7700000e+02],
       [0.0000000e+00, 1.2963328e+01, 0.0000000e+00, 8.4000000e+01,
        5.7000000e+00, 1.5700000e+02]])

In [33]:
labels = []

In [34]:
def update_labels_error(dataset, centers):
    labels = assign_points(dataset, centers)
    new_means = defaultdict(list)
    error = 0
    for assignment, point in zip(labels, dataset):
        new_means[assignment].append(point)

    for points in new_means.values():
        newCenter = np.mean(points, axis=0)
        error += np.sqrt(np.sum(np.square(points - newCenter)))

    return labels, error

In [37]:
def assign_points(dataset, centers):
    labels = []
    for point in dataset:
        shortest = float("inf")  # positive infinity
        shortest_index = 0
        for i in range(len(centers)):
            val = distance(point, centers[i])
            if val < shortest:
                shortest = val
                shortest_index = i
        labels.append(shortest_index)
    return labels

In [38]:
labels, errors = update_labels_error(file.values, centers)

In [39]:
labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [40]:
errors

nan

In [41]:
def update_centers(dataset, labels):
    new_means = defaultdict(list)
    centers = []
    for assignment, point in zip(labels, dataset):
        new_means[assignment].append(point)

    for points in new_means.values():
        newCenter = np.mean(points, axis=0)
        centers.append(newCenter)

    return np.array(centers)

In [42]:
centers = update_centers(file.values, labels)

In [43]:
centers

array([[4.97789382e+07, 3.15988905e+01, 1.40817610e+08,            nan,
        6.22403988e+00, 1.08661189e+03],
       [5.53056135e+05, 5.03428125e+00, 3.03643271e+05, 9.71264706e+01,
        5.64095588e+00, 8.56558824e+01],
       [5.37388507e+06, 1.47089821e+01, 1.81643587e+07, 1.06042177e+02,
        6.44122449e+00, 3.48408163e+02]])

In [44]:
n

4803

In [47]:
df = pd.DataFrame([['1', 113],
                   ['2', 113],
                   ['3', 301],
                   ['4', 122],
                   ['5', 113]], columns=['num', 'num_letter'])
df

Unnamed: 0,num,num_letter
0,1,113
1,2,113
2,3,301
3,4,122
4,5,113


In [48]:
df['label'] = -1
df

Unnamed: 0,num,num_letter,label
0,1,113,-1
1,2,113,-1
2,3,301,-1
3,4,122,-1
4,5,113,-1


In [52]:
a = [1,2,2,3,4]
for i in range(len(df)):
    df['label'].loc[df.index == i] = a[i] 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFram

In [53]:
df

Unnamed: 0,num,num_letter,label
0,1,113,1
1,2,113,2
2,3,301,2
3,4,122,3
4,5,113,4


In [54]:
df1 = pd.DataFrame([['1', 113],
                   ['2', 113],
                   ['3', 301],
                   ['4', 122],
                   ['5', 113]], columns=['num', 'num_letter'])

In [2]:
file = pd.read_csv('H:\Downloads\movies.csv')
file_2 = file[['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']]

In [8]:
import numpy as np

np.empty((3, 4800))

array([[1.04231233e-311, 1.04231195e-311, 2.12199579e-314, ...,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        0.00000000e+000, 0.00000000e+000, 1.01463693e-317]])

In [9]:
np.log(3)

1.0986122886681098