In [1]:
# Author: Xiang Zhang (zhan6668)
# Description: This IPython notebook pre-process the movie data for Avatar-Project1-Phase3

import os, sys, re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats

In [70]:
# Design a function to rename the titles
def renameTitle(x):
    title = x
    year = ''
    if "(" in x:
        #print(x.split(" ("))
        title = x.rsplit(" (",1)[0]
        year = x.rsplit(" (",1)[-1][:-1]
    return [title, year]

# Design a function to rename the genres
def renameGenres(x):
    genre_list = [x]
    if "|" in x:
        genre_list = x.split("|")
    return genre_list

In [2]:
# 1. links.csv
df_link = pd.read_csv('./GroupLens-MovieLens-25m/links.csv', index_col=0)

In [4]:
df_link

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,114709,862.0
2,113497,8844.0
3,113228,15602.0
4,114885,31357.0
5,113041,11862.0
...,...,...
209157,6671244,499546.0
209159,297986,63407.0
209163,6755366,553036.0
209169,249603,162892.0


In [63]:
# 2. movies.csv
df_movie = pd.read_csv('./GroupLens-MovieLens-25m/movies.csv', index_col=0)

In [71]:
mv_title_list = []
mv_year_list = []
for mv in df_movie['title'].tolist():
    mv = mv.replace(u'\xa0', u' ')
    [title, year] = renameTitle(mv)
    mv_title_list.append(title)
    mv_year_list.append(year)

In [74]:
df_movie['movie_title'] = mv_title_list
df_movie['movie_year'] = mv_year_list

In [78]:
mv_genre_list = []
for mv in df_movie['genres'].tolist():
    genre_list = renameGenres(mv)
    mv_genre_list.append(genre_list)

In [80]:
df_movie['movie_genres'] = mv_genre_list

In [81]:
df_movie

Unnamed: 0_level_0,title,genres,movie_title,movie_year,movie_genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995,"[Adventure, Children, Fantasy]"
3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,"[Comedy, Romance]"
4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995,"[Comedy, Drama, Romance]"
5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,[Comedy]
...,...,...,...,...,...
209157,We (2018),Drama,We,2018,[Drama]
209159,Window of the Soul (2001),Documentary,Window of the Soul,2001,[Documentary]
209163,Bad Poems (2018),Comedy|Drama,Bad Poems,2018,"[Comedy, Drama]"
209169,A Girl Thing (2001),(no genres listed),A Girl Thing,2001,[(no genres listed)]


In [75]:
df_movie

Unnamed: 0_level_0,title,genres,movie_title,movie_year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995
...,...,...,...,...
209157,We (2018),Drama,We,2018
209159,Window of the Soul (2001),Documentary,Window of the Soul,2001
209163,Bad Poems (2018),Comedy|Drama,Bad Poems,2018
209169,A Girl Thing (2001),(no genres listed),A Girl Thing,2001


In [91]:
df_movie_new = df_movie.join(df_link, on='movieId')[['imdbId','movie_title', 'movie_year', 'movie_genres']]

In [110]:
def renameIMDbId(x):
    x_len = len(str(x))
    if x_len == 7:
        x = 'tt' + str(x)
    elif x_len<7:
        x = 'tt' + (7-x_len)*'0' + str(x)
    else:
        x = 'tt' + str(x)
        #print('larger than 7')
        #print(x)
    return x

In [111]:
df_movie_new['imdbId'].apply(renameIMDbId)

movieId
1         tt0114709
2         tt0113497
3         tt0113228
4         tt0114885
5         tt0113041
            ...    
209157    tt6671244
209159    tt0297986
209163    tt6755366
209169    tt0249603
209171    tt0055323
Name: imdbId, Length: 62423, dtype: object

In [113]:
df_movie_new['imdbId'] = df_movie_new['imdbId'].apply(renameIMDbId)

In [116]:
# 3. read in IMDb ratings
df_imdb_rating = pd.read_csv('./IMDb/title.ratings.tsv', sep='\t', index_col=0)

In [122]:
df_movie_new = df_movie_new.join(df_imdb_rating['averageRating'], on='imdbId')

In [123]:
df_movie_new.set_index('imdbId').to_csv('movies_info.csv')

In [199]:
df_movie_new

Unnamed: 0_level_0,imdbId,movie_title,movie_year,movie_genres,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,tt0114709,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",8.3
2,tt0113497,Jumanji,1995,"[Adventure, Children, Fantasy]",7.0
3,tt0113228,Grumpier Old Men,1995,"[Comedy, Romance]",6.7
4,tt0114885,Waiting to Exhale,1995,"[Comedy, Drama, Romance]",6.0
5,tt0113041,Father of the Bride Part II,1995,[Comedy],6.1
...,...,...,...,...,...
209157,tt6671244,We,2018,[Drama],5.7
209159,tt0297986,Window of the Soul,2001,[Documentary],7.9
209163,tt6755366,Bad Poems,2018,"[Comedy, Drama]",7.7
209169,tt0249603,A Girl Thing,2001,[(no genres listed)],6.2


In [2]:
# 4. now we turn to human - actor/actress/director
df_name = pd.read_csv('./IMDb/name.basics.tsv',sep='\t',index_col=0)

In [131]:
df_name

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0031983,tt0072308,tt0053137"
nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0117057,tt0038355,tt0071877"
nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0059956,tt0057345,tt0054452,tt0049189"
nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0072562,tt0080455,tt0077975"
nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0060827,tt0050976,tt0050986"
...,...,...,...,...,...
nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department",tt2455546
nm9993716,Essias Loberg,\N,\N,,\N
nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744
nm9993718,Aayush Nair,\N,\N,cinematographer,\N


In [9]:
print(df_name.shape[0])
print(df_name[df_name['primaryProfession'].isnull()].shape[0])

10701883
2246089


In [138]:
# identify whether this person is an actor/actress/director
label_list = []
counter = 0
for row in df_name.iterrows():
    #idx = row[0]
    local_label_list = []
    if pd.isnull(row[1]['primaryProfession']):
        local_label_list.append('')
    else:
        if 'actor' in row[1]['primaryProfession']:
            local_label_list.append('actor')
        if 'actress' in row[1]['primaryProfession']:
            local_label_list.append('actress')
        if 'director' in row[1]['primaryProfession']:
            local_label_list.append('director')
        if len(local_label_list) > 1:
            #print(local_label_list)
            counter += 1
        
    label_list.append(local_label_list)

In [139]:
counter

167821

In [142]:
df_name['professionLabel'] = label_list

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,professionLabel
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0031983,tt0072308,tt0053137",[actor]
nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0117057,tt0038355,tt0071877",[actress]
nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0059956,tt0057345,tt0054452,tt0049189",[actress]
nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0072562,tt0080455,tt0077975",[actor]
nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0060827,tt0050976,tt0050986","[actor, director]"
...,...,...,...,...,...,...
nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department",tt2455546,[]
nm9993716,Essias Loberg,\N,\N,,\N,[]
nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744,[]
nm9993718,Aayush Nair,\N,\N,cinematographer,\N,[]


In [151]:
df_name_simp = df_name[['primaryName', 'knownForTitles','professionLabel']]

In [159]:
df_name_simp_1work = df_name_simp[df_name_simp['professionLabel'].str.len()==1]

In [165]:
df_name_simp_1work = df_name_simp_1work[df_name_simp_1work['knownForTitles'].str.len()>2]

In [188]:
link_list = []
for row in df_name_simp_1work.iterrows():
    profess = row[1]['professionLabel'][0]
    if profess == 'actor' or profess == 'actress':
        link_list.append('acted_in')
    elif profess == 'director':
        link_list.append('directed')
    else:
        link_list.append('null')

In [190]:
len(link_list)

5096620

In [192]:
df_name_simp_1work['link'] = link_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [196]:
df_name_simp_1work = df_name_simp_1work[df_name_simp_1work['link']!='null']

In [198]:
df_name_simp_1work.to_csv('./input_Neo4j/workers_1label.csv')

In [207]:
df_name_simp_1work[df_name_simp_1work['link']=='acted_in'].to_csv('./input_Neo4j/workers_1label_act.csv')

In [208]:
df_name_simp_1work[df_name_simp_1work['link']=='directed'].to_csv('./input_Neo4j/workers_1label_direct.csv')

In [200]:
# now look at multi-label
df_name_simp[df_name_simp['professionLabel'].str.len()>2]

Unnamed: 0_level_0,primaryName,knownForTitles,professionLabel
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nm5510978,Brigitte Toubia,"tt2692346,tt4933766","[actor, actress, director]"
nm7434057,Lisa Conlin,"tt7061882,tt4832800","[actor, actress, director]"


In [203]:
# manually revise after checking
df_name_simp.loc['nm5510978']['professionLabel'] = ['actress', 'director']
df_name_simp.loc['nm7434057']['professionLabel'] = ['actress', 'director']

In [204]:
df_name_simp[df_name_simp['professionLabel'].str.len()>2]

Unnamed: 0_level_0,primaryName,knownForTitles,professionLabel
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [205]:
df_name_simp[df_name_simp['professionLabel'].str.len()==2]

Unnamed: 0_level_0,primaryName,knownForTitles,professionLabel
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nm0000005,Ingmar Bergman,"tt0083922,tt0060827,tt0050976,tt0050986","[actor, director]"
nm0000008,Marlon Brando,"tt0047296,tt0070849,tt0078788,tt0068646","[actor, director]"
nm0000010,James Cagney,"tt0035575,tt0042041,tt0029870,tt0031867","[actor, director]"
nm0000024,John Gielgud,"tt0071877,tt0082031,tt0117631,tt0045943","[actor, director]"
nm0000032,Charlton Heston,"tt0070723,tt0063442,tt0052618,tt0049833","[actor, director]"
...,...,...,...
nm9993023,Tajda Meze,tt7708648,"[actress, director]"
nm9993125,Alejandro Bordier,"tt8742880,tt9121640,tt8866550","[actor, director]"
nm9993142,Rebekah Lee,"tt11343630,tt10962838,tt8141238","[actress, director]"
nm9993379,Fanny Hozleiter,tt8743182,"[actress, director]"


In [223]:
df_name_simp[df_name_simp['professionLabel'].str.len()==2].to_csv('./input_Neo4j/workers_2labels.csv')

In [224]:
df_name_simp_2work = df_name_simp[df_name_simp['professionLabel'].str.len()==2]

In [209]:
df_movie_new

Unnamed: 0_level_0,imdbId,movie_title,movie_year,movie_genres,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,tt0114709,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",8.3
2,tt0113497,Jumanji,1995,"[Adventure, Children, Fantasy]",7.0
3,tt0113228,Grumpier Old Men,1995,"[Comedy, Romance]",6.7
4,tt0114885,Waiting to Exhale,1995,"[Comedy, Drama, Romance]",6.0
5,tt0113041,Father of the Bride Part II,1995,[Comedy],6.1
...,...,...,...,...,...
209157,tt6671244,We,2018,[Drama],5.7
209159,tt0297986,Window of the Soul,2001,[Documentary],7.9
209163,tt6755366,Bad Poems,2018,"[Comedy, Drama]",7.7
209169,tt0249603,A Girl Thing,2001,[(no genres listed)],6.2


In [215]:
df_crew = pd.read_csv('./IMDb/title.crew.tsv', sep='\t',index_col=0)

In [216]:
df_crew

Unnamed: 0_level_0,directors,writers
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000001,nm0005690,\N
tt0000002,nm0721526,\N
tt0000003,nm0721526,\N
tt0000004,nm0721526,\N
tt0000005,nm0005690,\N
...,...,...
tt9916848,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
tt9916850,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
tt9916852,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
tt9916856,nm10538645,nm6951431


In [219]:
df_movie_new.join(df_crew['directors'], on='imdbId').set_index('imdbId').to_csv('./input_Neo4j/movies_info.csv')

In [221]:
df_movie_new = df_movie_new.join(df_crew['directors'], on='imdbId')

In [222]:
df_movie_new

Unnamed: 0_level_0,imdbId,movie_title,movie_year,movie_genres,averageRating,directors
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,tt0114709,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",8.3,nm0005124
2,tt0113497,Jumanji,1995,"[Adventure, Children, Fantasy]",7.0,nm0002653
3,tt0113228,Grumpier Old Men,1995,"[Comedy, Romance]",6.7,nm0222043
4,tt0114885,Waiting to Exhale,1995,"[Comedy, Drama, Romance]",6.0,nm0001845
5,tt0113041,Father of the Bride Part II,1995,[Comedy],6.1,nm0796124
...,...,...,...,...,...,...
209157,tt6671244,We,2018,[Drama],5.7,nm1415482
209159,tt0297986,Window of the Soul,2001,[Documentary],7.9,"nm1065588,nm0142504"
209163,tt6755366,Bad Poems,2018,"[Comedy, Drama]",7.7,nm2520391
209169,tt0249603,A Girl Thing,2001,[(no genres listed)],6.2,nm0003022


In [2]:
# 0320 change the format of movie_genres
df_movie_new = pd.read_csv('./input_Neo4j/movies_info.csv',index_col=0)

In [16]:
def reformat_genres(x):
    return ",".join(eval(x))

In [18]:
df_movie_new['movie_genres'] = df_movie_new['movie_genres'].apply(reformat_genres)

In [19]:
df_movie_new

Unnamed: 0_level_0,movie_title,movie_year,movie_genres,averageRating,directors
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0114709,Toy Story,1995,"Adventure,Animation,Children,Comedy,Fantasy",8.3,nm0005124
tt0113497,Jumanji,1995,"Adventure,Children,Fantasy",7.0,nm0002653
tt0113228,Grumpier Old Men,1995,"Comedy,Romance",6.7,nm0222043
tt0114885,Waiting to Exhale,1995,"Comedy,Drama,Romance",6.0,nm0001845
tt0113041,Father of the Bride Part II,1995,Comedy,6.1,nm0796124
...,...,...,...,...,...
tt6671244,We,2018,Drama,5.7,nm1415482
tt0297986,Window of the Soul,2001,Documentary,7.9,"nm1065588,nm0142504"
tt6755366,Bad Poems,2018,"Comedy,Drama",7.7,nm2520391
tt0249603,A Girl Thing,2001,(no genres listed),6.2,nm0003022


In [20]:
df_movie_new.to_csv('./input_Neo4j/movies_info_0321.csv')

In [225]:
df_name_simp_2work

Unnamed: 0_level_0,primaryName,knownForTitles,professionLabel
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nm0000005,Ingmar Bergman,"tt0083922,tt0060827,tt0050976,tt0050986","[actor, director]"
nm0000008,Marlon Brando,"tt0047296,tt0070849,tt0078788,tt0068646","[actor, director]"
nm0000010,James Cagney,"tt0035575,tt0042041,tt0029870,tt0031867","[actor, director]"
nm0000024,John Gielgud,"tt0071877,tt0082031,tt0117631,tt0045943","[actor, director]"
nm0000032,Charlton Heston,"tt0070723,tt0063442,tt0052618,tt0049833","[actor, director]"
...,...,...,...
nm9993023,Tajda Meze,tt7708648,"[actress, director]"
nm9993125,Alejandro Bordier,"tt8742880,tt9121640,tt8866550","[actor, director]"
nm9993142,Rebekah Lee,"tt11343630,tt10962838,tt8141238","[actress, director]"
nm9993379,Fanny Hozleiter,tt8743182,"[actress, director]"


In [226]:
# 5. customer rating
df_cratings = pd.read_csv('./GroupLens-MovieLens-25m/ratings.csv')

In [229]:
df_cratings.shape

(25000095, 4)

In [232]:
df_cratings = df_cratings.join(df_movie_new, on='movieId')[['userId','imdbId','rating']]

In [233]:
df_cratings

Unnamed: 0,userId,imdbId,rating
0,1,tt0110912,5.0
1,1,tt0111495,3.5
2,1,tt0108394,5.0
3,1,tt0114787,5.0
4,1,tt0045152,3.5
...,...,...,...
25000090,162541,tt0382932,4.5
25000091,162541,tt0389790,2.5
25000092,162541,tt0952640,2.0
25000093,162541,tt0468569,4.0


In [234]:
def renameCustId(x):
    x_len = len(str(x))
    if x_len == 6:
        x = 'c' + str(x)
    elif x_len<6:
        x = 'c' + (6-x_len)*'0' + str(x)
    else:
        x = 'c' + str(x)
        #print('larger than 6')
        #print(x)
    return x

In [236]:
df_cratings['userId'] = df_cratings['userId'].apply(renameCustId)

In [239]:
df_cratings.set_index('userId').to_csv('./input_Neo4j/customer_ratings.csv')

In [241]:
df_cratings.set_index('userId')

Unnamed: 0_level_0,imdbId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
c000001,tt0110912,5.0
c000001,tt0111495,3.5
c000001,tt0108394,5.0
c000001,tt0114787,5.0
c000001,tt0045152,3.5
...,...,...
c162541,tt0382932,4.5
c162541,tt0389790,2.5
c162541,tt0952640,2.0
c162541,tt0468569,4.0


In [243]:
df_movie_new.set_index('imdbId')

Unnamed: 0_level_0,movie_title,movie_year,movie_genres,averageRating,directors
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0114709,Toy Story,1995,"[Adventure, Animation, Children, Comedy, Fantasy]",8.3,nm0005124
tt0113497,Jumanji,1995,"[Adventure, Children, Fantasy]",7.0,nm0002653
tt0113228,Grumpier Old Men,1995,"[Comedy, Romance]",6.7,nm0222043
tt0114885,Waiting to Exhale,1995,"[Comedy, Drama, Romance]",6.0,nm0001845
tt0113041,Father of the Bride Part II,1995,[Comedy],6.1,nm0796124
...,...,...,...,...,...
tt6671244,We,2018,[Drama],5.7,nm1415482
tt0297986,Window of the Soul,2001,[Documentary],7.9,"nm1065588,nm0142504"
tt6755366,Bad Poems,2018,"[Comedy, Drama]",7.7,nm2520391
tt0249603,A Girl Thing,2001,[(no genres listed)],6.2,nm0003022


In [247]:
df_df_name_simp_1work_performer = df_name_simp_1work[df_name_simp_1work['link']=='acted_in']

In [253]:
df_df_name_simp_1work_performer.shape[0]

3487727

In [255]:
df_df_name_simp_1work_performer['professionLabel_2'] = ['performer']*3487727

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [261]:
df_df_name_simp_1work_performer.to_csv('./input_Neo4j/workers_1label_act.csv')

In [263]:
df_df_name_simp_1work_performer

Unnamed: 0_level_0,primaryName,knownForTitles,professionLabel,link,professionLabel_2
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0000001,Fred Astaire,"tt0050419,tt0031983,tt0072308,tt0053137",[actor],acted_in,performer
nm0000002,Lauren Bacall,"tt0037382,tt0117057,tt0038355,tt0071877",[actress],acted_in,performer
nm0000003,Brigitte Bardot,"tt0059956,tt0057345,tt0054452,tt0049189",[actress],acted_in,performer
nm0000004,John Belushi,"tt0078723,tt0072562,tt0080455,tt0077975",[actor],acted_in,performer
nm0000006,Ingrid Bergman,"tt0034583,tt0038787,tt0038109,tt0077711",[actress],acted_in,performer
...,...,...,...,...,...
nm9993694,Chinmay Mishra,"tt9083282,tt10140990,tt8737752,tt9097592",[actor],acted_in,performer
nm9993698,Sebi John,tt8736744,[actor],acted_in,performer
nm9993699,Dani Jacob,tt8736744,[actor],acted_in,performer
nm9993701,Sanjai Kuriakose,tt8736744,[actor],acted_in,performer


In [257]:
df_name_simp_1work_director = df_name_simp_1work[df_name_simp_1work['link']=='directed']

In [258]:
df_name_simp_1work_director.shape

(632346, 4)

In [259]:
df_name_simp_1work_director['professionLabel_2'] = ['director']*632346

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [262]:
df_name_simp_1work_director.to_csv('./input_Neo4j/workers_1label_direct.csv')

In [264]:
df_name_simp_1work_director

Unnamed: 0_level_0,primaryName,knownForTitles,professionLabel,link,professionLabel_2
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0000019,Federico Fellini,"tt0056801,tt0047528,tt0053779,tt0050783",[director],directed,director
nm0000040,Stanley Kubrick,"tt0072684,tt0120663,tt0062622,tt0066921",[director],directed,director
nm0000041,Akira Kurosawa,"tt0080979,tt0089881,tt0051808,tt0047478",[director],directed,director
nm0000076,François Truffaut,"tt0075860,tt0070460,tt0053198,tt0055032",[director],directed,director
nm0000083,Alan Miller,"tt0424773,tt5969604,tt0320978,tt0969216",[director],directed,director
...,...,...,...,...,...
nm9993456,Fernando Balmas,tt0131531,[director],directed,director
nm9993573,Lakisha Louissaint,tt10299418,[director],directed,director
nm9993679,Art Jones,tt8744074,[director],directed,director
nm9993708,Eli Bevins,"tt11772858,tt11772904,tt11702702,tt11772940",[director],directed,director


In [246]:
df_name_simp_2work

Unnamed: 0_level_0,primaryName,knownForTitles,professionLabel
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nm0000005,Ingmar Bergman,"tt0083922,tt0060827,tt0050976,tt0050986","[actor, director]"
nm0000008,Marlon Brando,"tt0047296,tt0070849,tt0078788,tt0068646","[actor, director]"
nm0000010,James Cagney,"tt0035575,tt0042041,tt0029870,tt0031867","[actor, director]"
nm0000024,John Gielgud,"tt0071877,tt0082031,tt0117631,tt0045943","[actor, director]"
nm0000032,Charlton Heston,"tt0070723,tt0063442,tt0052618,tt0049833","[actor, director]"
...,...,...,...
nm9993023,Tajda Meze,tt7708648,"[actress, director]"
nm9993125,Alejandro Bordier,"tt8742880,tt9121640,tt8866550","[actor, director]"
nm9993142,Rebekah Lee,"tt11343630,tt10962838,tt8141238","[actress, director]"
nm9993379,Fanny Hozleiter,tt8743182,"[actress, director]"


In [2]:
# 0322: get the subset of customer ratings
df_movie_new = pd.read_csv('./input_Neo4j/movies_info_0321.csv',index_col=0)

In [3]:
df_movie_new

Unnamed: 0_level_0,movie_title,movie_year,movie_genres,averageRating,directors
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0114709,Toy Story,1995,"Adventure,Animation,Children,Comedy,Fantasy",8.3,nm0005124
tt0113497,Jumanji,1995,"Adventure,Children,Fantasy",7.0,nm0002653
tt0113228,Grumpier Old Men,1995,"Comedy,Romance",6.7,nm0222043
tt0114885,Waiting to Exhale,1995,"Comedy,Drama,Romance",6.0,nm0001845
tt0113041,Father of the Bride Part II,1995,Comedy,6.1,nm0796124
...,...,...,...,...,...
tt6671244,We,2018,Drama,5.7,nm1415482
tt0297986,Window of the Soul,2001,Documentary,7.9,"nm1065588,nm0142504"
tt6755366,Bad Poems,2018,"Comedy,Drama",7.7,nm2520391
tt0249603,A Girl Thing,2001,(no genres listed),6.2,nm0003022


In [8]:
df_movie_new[df_movie_new['averageRating'].isnull()]

Unnamed: 0_level_0,movie_title,movie_year,movie_genres,averageRating,directors
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0118114,Wallace & Gromit: The Best of Aardman Animation,1996,"Adventure,Animation,Comedy",,
tt0125877,Low Life,1994,Drama,,nm0478286
tt0038426,Costa Brava,1946,Drama,,nm0281541
tt0115548,Anna,1996,Drama,,nm0887814
tt0087690,"Master, The",1984,Action,,
...,...,...,...,...,...
tt7725112,L'Isola di Medea,2016,(no genres listed),,nm5398596
tt6552104,The Fellowship of the Farmers,,(no genres listed),,nm8794249
tt7342916,Solace,2017,"Animation,Sci-Fi",,nm8496177
tt7575076,"China, o Império do Centro",1987,Documentary,,nm1098834


In [9]:
df_movie_new.loc['tt0154827']

movie_title      Man of the Century
movie_year                     1999
movie_genres                 Comedy
averageRating                   NaN
directors                       NaN
Name: tt0154827, dtype: object

In [6]:
df_ratings = pd.read_csv('./input_Neo4j/customer_ratings.csv',index_col=0)

In [10]:
df_ratings

Unnamed: 0_level_0,imdbId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
c000001,tt0110912,5.0
c000001,tt0111495,3.5
c000001,tt0108394,5.0
c000001,tt0114787,5.0
c000001,tt0045152,3.5
...,...,...
c162541,tt0382932,4.5
c162541,tt0389790,2.5
c162541,tt0952640,2.0
c162541,tt0468569,4.0


In [9]:
len(df_ratings['imdbId'].unique())

59047

In [11]:
np.intersect1d(df_movie_new.index, df_ratings['imdbId'].unique()).shape

(59047,)

In [14]:
df_ratings['imdbId'].value_counts()

tt0109830    81491
tt0111161    81482
tt0110912    79672
tt0102926    74127
tt0133093    72674
             ...  
tt0103292        1
tt1486196        1
tt0094068        1
tt0140428        1
tt0045887        1
Name: imdbId, Length: 59047, dtype: int64

In [18]:
(df_ratings['imdbId'].value_counts()<=3).value_counts()

False    36193
True     22854
Name: imdbId, dtype: int64

In [20]:
# partition to 10 pieces
df_ratings.iloc[:2500000].to_csv('./input_Neo4j/customer_ratings_part1.csv')

In [21]:
df_ratings.iloc[:2500000]

Unnamed: 0_level_0,imdbId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
c000001,tt0110912,5.0
c000001,tt0111495,3.5
c000001,tt0108394,5.0
c000001,tt0114787,5.0
c000001,tt0045152,3.5
...,...,...
c016658,tt0113762,2.5
c016658,tt0063803,3.0
c016658,tt0099691,0.5
c016658,tt0093624,2.5


In [23]:
df_ratings.iloc[2500000:5000000].to_csv('./input_Neo4j/customer_ratings_part2.csv')
df_ratings.iloc[5000000:7500000].to_csv('./input_Neo4j/customer_ratings_part3.csv')
df_ratings.iloc[7500000:10000000].to_csv('./input_Neo4j/customer_ratings_part4.csv')
df_ratings.iloc[10000000:12500000].to_csv('./input_Neo4j/customer_ratings_part5.csv')
df_ratings.iloc[12500000:15000000].to_csv('./input_Neo4j/customer_ratings_part6.csv')
df_ratings.iloc[15000000:17500000].to_csv('./input_Neo4j/customer_ratings_part7.csv')
df_ratings.iloc[17500000:20000000].to_csv('./input_Neo4j/customer_ratings_part8.csv')
df_ratings.iloc[20000000:22500000].to_csv('./input_Neo4j/customer_ratings_part9.csv')
df_ratings.iloc[22500000:].to_csv('./input_Neo4j/customer_ratings_part10.csv')