## Reading in the data

In [1]:
## Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix

import math



In [2]:
anp_df = pd.read_pickle(r'anp.pickle') #feather.read_dataframe('data_science_case/anp.feather')
face_df = pd.read_pickle(r'face.pickle') #feather.read_dataframe('data_science_case/face.feather')
image_df = pd.read_pickle(r'image_data.pickle') #feather.read_dataframe('data_science_case/image_data.feather')
metrics_df = pd.read_pickle(r'image_metrics.pickle') #feather.read_dataframe('data_science_case/image_metrics.feather')
object_labels_df = pd.read_pickle(r'object_labels.pickle') #feather.read_dataframe('data_science_case/object_labels.feather')
survey_df = pd.read_pickle(r'survey.pickle') #feather.read_dataframe('data_science_case/survey.feather')
celebrity_df = pd.read_pickle(r'celebrity.pickle') #feather.read_dataframe('data_science_case/survey.feather')


## Data aggregatie

In [3]:
#####################################
## OBJECTS DATA
#####################################
df = object_labels_df.groupby("data_amz_label").count().reset_index() ## 2101 possible objects , dit is te veel
df[df['image_id']>500].reset_index() ## 48 objecten hebben meer dan 500 observaties, laten we 48 kolommen maken
list_met_columns = df[df['image_id']>500].data_amz_label.tolist() ## een lijst met alle kollommen die we willen
df = object_labels_df[object_labels_df.data_amz_label.isin(list_met_columns)] ## Filter alle labels die niet vaker voor komen dan 500 keer resulteert in 108k regels
df = df[['image_id','data_amz_label']] ## selecting only relevant columns (every confidence is >0.7)
df_object = df.pivot_table(index='image_id', columns='data_amz_label', aggfunc=len, fill_value=0) ## Pivot count (looks like len)
df_object = df_object.reset_index()
df_object.head()

data_amz_label,image_id,Alcohol,Animal,Art,Beverage,Blossom,Bottle,Bowl,Brochure,Cake,...,Plant,Portrait,Poster,Potted Plant,Selfie,Smile,Text,Tree,Vehicle,Water
0,1000004599066965477_545497348,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000059425592054064_703978203,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,1000080765059521113_31736205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000097452173278518_206726006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000118559875482297_555477511,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
#####################################
## FACE --> EMO
#####################################
df = face_df[['image_id','face_id','face_emo','emo_confidence']] ## selecting the emotion's
emo_face =pd.pivot_table(df, index= ['image_id'],columns ='face_emo',values = 'emo_confidence').fillna(0) ## Gemiddelde emotie per foto
emo_face = emo_face.reset_index()
emo_face.head()


#####################################
## FACE --> PEOPLE en GENDER
#####################################
f = lambda x: x["image_id"].split("_")[1]  ## Maak functie aan

df_people_in_pic = face_df[['image_id','face_id']].drop_duplicates()
df_people_in_pic['user_id']= df_people_in_pic.apply(f, axis=1)
df_people_in_pic.drop(columns = 'image_id',axis=1,inplace=True)
df_people_in_pic = df_people_in_pic.drop_duplicates()

df_aantal_people_in_pic = df_people_in_pic.groupby(['user_id']).count().reset_index()
df_aantal_people_in_pic.columns = ['user_id', 'aantal_personen']
df_aantal_people_in_pic


df = face_df[['image_id','face_id','face_gender']].drop_duplicates()
df['user_id']= df.apply(f, axis=1)
df.drop(columns = 'image_id',axis=1,inplace=True)
df = df.drop_duplicates()



df = df[df['face_gender'] == 'Male']  ## Select only males
df = df.groupby('user_id').count().reset_index()
df.columns = ['user_id', 'aantal_mannen', 'mag_weg']
df_aantal_mannen_in_pic = df[['user_id', 'aantal_mannen']]
people = pd.merge(df_aantal_people_in_pic, df_aantal_mannen_in_pic, how='left', on='user_id').fillna(0)
people['aantal_vrouwen'] = people.aantal_personen - people.aantal_mannen 
people['ratio_mannen']  = people.aantal_mannen / people.aantal_personen 
people['ratio_vrouwen']  = people.aantal_vrouwen / people.aantal_personen 

people.head()



#####################################
## FACE --> PEOPLE en GENDER
#####################################
df = face_df[['image_id','face_id','face_age_range_high','face_age_range_low']].drop_duplicates()
df['gem_leeftijd'] = (df.face_age_range_high + df.face_age_range_low ) / 2 
df = df[['image_id','face_id','gem_leeftijd']]

def roundup(x):
    return str(int((x+5)/10)*10) 

df['nearest_decade_leeftijd'] = df.gem_leeftijd.apply(roundup)
df = df[['image_id','face_id','nearest_decade_leeftijd']]
leeftijd_mensen_in_foto = df.pivot_table(index='image_id', columns='nearest_decade_leeftijd', aggfunc=len, fill_value=0) ## Pivot count (looks like len)
leeftijd_mensen_in_foto = leeftijd_mensen_in_foto.reset_index()
leeftijd_mensen_in_foto.columns = ['_'.join(col) for col in leeftijd_mensen_in_foto.columns]
leeftijd_mensen_in_foto.rename(columns={'image_id_': 'image_id'}, inplace=True)
leeftijd_mensen_in_foto.head() 

Unnamed: 0,image_id,face_id_0,face_id_10,face_id_20,face_id_30,face_id_40,face_id_50,face_id_60,face_id_70,face_id_80
0,1000126179441391393_30837828,0,2,0,3,3,0,0,0,0
1,1000135251972767429_31447990,0,0,0,1,0,0,0,0,0
2,1000159024449036821_265063047,0,0,0,2,0,0,0,0,0
3,1000185226634350811_276232195,0,0,0,0,1,0,0,0,0
4,1000204070831242247_143854846,0,0,0,1,0,0,0,0,0


In [5]:
#####################################
## ANP --> ANP
#####################################
# anp_df.anp_label.value_counts() ## 3864 different lengts, laten we enkel pakken met count groter dan 500
df = anp_df[['image_id','anp_label','anp_sentiment']]
df_count = df.groupby("anp_label").count().reset_index() 
list_met_columns = df_count[df_count['image_id']>500].anp_label.tolist() ## een lijst met alle kollommen die we willen (81)

df = df[df.anp_label.isin(list_met_columns)] ## Filter alle labels die niet vaker voor komen dan 500 keer resulteert in 108k regels
anp = df.pivot_table(index='image_id', columns='anp_label', fill_value=0) ## Pivot mean is default
anp = anp.reset_index()
anp.columns = ['_'.join(col) for col in anp.columns]
anp.rename(columns={'image_id_': 'image_id'}, inplace=True)
anp.head()


#####################################
## ANP --> Sentiment
#####################################
# anp_df.emotion_label.value_counts() ## 24 different lengts
df = anp_df[['image_id','emotion_label','emotion_score']]
emotion_score = df.pivot_table(index='image_id', columns='emotion_label', fill_value=0) ## Pivot mean is default
emotion_score = emotion_score.reset_index()
emotion_score.columns = ['_'.join(col) for col in emotion_score.columns]
emotion_score.rename(columns={'image_id_': 'image_id'}, inplace=True)
emotion_score.head()




Unnamed: 0,image_id,emotion_score_acceptance,emotion_score_admiration,emotion_score_amazement,emotion_score_anger,emotion_score_annoyance,emotion_score_anticipation,emotion_score_apprehension,emotion_score_boredom,emotion_score_disgust,...,emotion_score_joy,emotion_score_loathing,emotion_score_pensiveness,emotion_score_rage,emotion_score_sadness,emotion_score_serenity,emotion_score_surprise,emotion_score_terror,emotion_score_trust,emotion_score_vigilance
0,1000004599066965477_545497348,0.0,0.0,0.196933,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1126,0.0,0.0,0.0,0.0,0.0
1,1000059425592054064_703978203,0.0,0.0,0.3468,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.11555,0.0,0.2026,0.0,0.0,0.0
2,1000065308809420330_46329534,0.0,0.0,0.171525,0.0,0.0987,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000080765059521113_31736205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.21565,0.0,0.0,0.0,0.1234,0.0,0.0,0.0,0.0,0.0
4,1000097452173278518_206726006,0.0,0.0,0.1981,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4013,0.0,0.0,0.0,0.09745,0.0,0.0,0.0,0.0,0.0


In [6]:
#####################################
## IMAGE
#####################################

# image_df.image_id.is_unique ## Image is unique
## DROP COLOMMEN DIE GEEN VOORSPELKRACHT HEBBEN
# image_df.drop(columns = ['image_id','image_link','image_url','user_full_name','user_name', 'user_website','user_profile_pic', 'user_bio'], inplace=True )

##FILTER PLAT SLAAN

# image_df.image_filter.value_counts() ## ongv. 30
df = image_df[['user_id','image_filter']]
# df_count = df.groupby("image_filter").count().reset_index() 
df_filter = df.pivot_table(index='user_id', columns='image_filter', aggfunc= len ,fill_value=0)
df_filter = df_filter.reset_index()
df_filter.columns = [str(col) + '_filter' for col in df_filter.columns]
df_filter.rename(columns={'user_id_filter': 'user_id'}, inplace=True)
## Waar kunnen we gemiddelde van nemen

df = image_df[['user_id','image_height','image_width','data_memorability']]

df_mean_vars = df.groupby('user_id').mean().reset_index()


df = image_df[['user_id','user_followed_by','user_follows','user_posted_photos']]
df['aantal_fotos'] = 1 
df_sum_vars = df.groupby('user_id').mean().reset_index() ## Moet ook mean zijn, want haalt niet uit

df_sum_vars.shape  #159
df_mean_vars.shape #159
df_filter.shape #159

temp_df = pd.merge(df_sum_vars, df_mean_vars, how='inner', on='user_id')
image_df_user = pd.merge(temp_df, df_filter, how='inner', on='user_id')
image_df_user.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,user_id,user_followed_by,user_follows,user_posted_photos,aantal_fotos,image_height,image_width,data_memorability,1977_filter,Aden_filter,...,Slumber_filter,Stinson_filter,Sutro_filter,Toaster_filter,Unknown_filter,Valencia_filter,Vesper_filter,Walden_filter,Willow_filter,X-Pro II_filter
0,1097967773,53.0,154.0,282.0,1,636.730496,632.056738,0.843239,0,18,...,5,6,0,0,1,19,0,0,0,5
1,11520833,201.0,299.0,312.0,1,617.157051,617.166667,0.819178,2,0,...,0,0,9,3,1,10,0,3,0,9
2,1223036636,169.0,159.0,48.0,1,640.0,640.0,0.869481,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1224423305,1.0,18.0,41.0,1,432.365854,593.170732,0.790601,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1249457735,66.0,93.0,30.0,1,645.333333,640.0,0.804556,0,1,...,0,0,1,0,0,0,0,0,6,2


In [7]:
#####################################
## Metrix
#####################################
metrics_df['user_id']=metrics_df.apply(f, axis=1)
metrics_df = metrics_df.groupby('image_id').max() ## op een of andere manier werkt drop_duplicates niet vaagg...
metrics_df = metrics_df.reset_index()
df = metrics_df[['user_id','comment_count','like_count']]
metrics_df_user = df.groupby('user_id').mean().reset_index()
metrics_df_user.head()

Unnamed: 0,user_id,comment_count,like_count
0,1097967773,0.731183,5.892473
1,11520833,0.356557,5.540984
2,1223036636,1.978723,11.340426
3,1224423305,0.0,0.052632
4,1249457735,0.241379,5.689655


In [8]:
#####################################
## IMAGE
#####################################
# survey_df.info()

survey_df.id.is_unique ## Image is unique

True

In [9]:
#####################################
## CELEB
#####################################
# celebrity_df.image_id.is_unique
# celebrity_df.head(20)
# celebrity_df.face_celebrity_name.value_counts() #2177 counts
# GRootste is 20 en gaat heel snel naar 4/5 hier zit geen voorspelende waarde in!

In [10]:
#####################################
## Analyse op aantallen
#####################################

emo_face.shape#(15957, 7)   NIet elke foto heeft mensen erin
df_object.shape#(31051, 48) Niet elke foto heeft een getraind object, gelukkig wel veel gevonden
emotion_score.shape#(41292, 24) Voor paar fotos niet gevuld
leeftijd_mensen_in_foto.shape#(15957, 9)   NIet elke foto heeft mensen erin
# people.shape#(15957, 6)   NIet elke foto heeft mensen erin
metrics_df.shape#(35803, 4) ##RAAR dat dit niet voor elke gevuld is.... Denk dat de data die we hebben gekregen beetje corrupt is, zaten ook duplicates tussen
image_df.shape#(41206, 18) ## dit zijn alle fotos
anp.shape#(22377, 82) ## We hebben hierin gesneden dus dit kan kloppen

(22377, 82)

In [11]:
#####################################
## User id uit elke extracten
#####################################
f = lambda x: x["image_id"].split("_")[1]  ## Maak functie aan

emo_face['user_id'] = emo_face.apply(f, axis=1)
df_object['user_id'] =df_object.apply(f, axis=1)
emotion_score['user_id']= emotion_score.apply(f, axis=1)
leeftijd_mensen_in_foto['user_id']= leeftijd_mensen_in_foto.apply(f, axis=1)
anp['user_id']=anp.apply(f, axis=1)


In [12]:
#####################################
## Aggegreren op user_id
#####################################
emo_user_face =emo_face.loc[:, emo_face.columns != 'image_id'].groupby('user_id').mean().reset_index()
object_user = df_object.loc[:, df_object.columns != 'image_id'].groupby('user_id').sum().reset_index()
emo_user =emotion_score.loc[:, emotion_score.columns != 'image_id'].groupby('user_id').mean().reset_index()
leeftijd_mensen_in_foto_user = leeftijd_mensen_in_foto.loc[:, leeftijd_mensen_in_foto.columns != 'image_id'].groupby('user_id').sum().reset_index()
anp_user =anp.loc[:, anp.columns != 'image_id'].groupby('user_id').mean().reset_index()
# people ## deze is al op niveau van user
# image_df_user ## deze is ook al op niveau

In [13]:
emo_user['user_id'] = emo_user['user_id'].astype('int64')
object_user['user_id'] = object_user['user_id'].astype('int64')
leeftijd_mensen_in_foto_user['user_id'] = leeftijd_mensen_in_foto_user['user_id'].astype('int64')
anp_user['user_id'] = anp_user['user_id'].astype('int64')
people['user_id'] = people['user_id'].astype('int64')
image_df_user['user_id'] = image_df_user['user_id'].astype('int64')
metrics_df_user['user_id'] = metrics_df_user['user_id'].astype('int64')
emo_user_face['user_id'] = emo_user_face['user_id'].astype('int64')

temp = pd.merge(survey_df, emo_user, how='left', left_on='insta_user_id', right_on='user_id')
temp2 = pd.merge(temp, object_user, how='left', left_on='insta_user_id', right_on='user_id')
temp3 = pd.merge(temp2, leeftijd_mensen_in_foto_user, how='left', left_on='insta_user_id', right_on='user_id')
temp4 = pd.merge(temp3, anp_user, how='left', left_on='insta_user_id', right_on='user_id')
temp5 = pd.merge(temp4, people, how='left', left_on='insta_user_id', right_on='user_id')
temp6 = pd.merge(temp5, image_df_user, how='left', left_on='insta_user_id', right_on='user_id')
temp7 = pd.merge(temp6, metrics_df_user, how='left', left_on='insta_user_id', right_on='user_id')
platte_df = pd.merge(temp7, emo_user_face, how='left', left_on='insta_user_id', right_on='user_id')

platte_df.shape



Unnamed: 0,index,id,gender,born,education,employed,income,A_2,N_1,P_1,...,comment_count,like_count,user_id_y,ANGRY,CALM,CONFUSED,DISGUSTED,HAPPY,SAD,SURPRISED
0,0,920bf027f7d13dbdc7b66b3d3324903c,Male,1975,College graduate,Employed for wages,"$30,000 to $39,999",4,5,5,...,0.220472,3.590551,619868600.0,5.615921,3.951672,10.626774,1.330124,50.596171,9.203222,7.032123
1,1,b433b2bfe49e28d0b7c45925b53084e0,Male,1978,College graduate,Employed for wages,"$20,000 to $29,999",8,0,9,...,0.666667,19.5,187920300.0,4.132142,0.932478,0.90472,0.241971,79.140063,0.670505,2.205702
2,4,f4f54676f75f47c17dc434cf68845328,Female,1990,High school graduate,Employed for wages,"$80,000 to $89,999",7,3,8,...,1.509908,14.562748,33420910.0,4.030452,3.263048,4.08035,0.957494,64.020145,7.488966,13.042668
3,5,a27a5fc47a59f35761705330253a58e3,Male,1997,High school graduate,Employed for wages,"$20,000 to $29,999",7,5,7,...,0.575,4.275,2143581000.0,2.797384,5.104221,12.292257,0.0,37.738075,10.228524,11.990256
4,6,0a1002b2232a4ecbde604462f6d84bf9,Female,1993,College graduate,A student,"$20,000 to $29,999",7,5,7,...,0.684932,17.328767,263042300.0,4.683774,1.57594,5.049674,0.86768,78.760601,5.323909,4.480929


In [15]:
## Alle kolommmen bekijken
from IPython.display import display
pd.options.display.max_columns = None
display(platte_df.head())

Unnamed: 0,index,id,gender,born,education,employed,income,A_2,N_1,P_1,E_1,A_1,H_1,M_1,R_1,M_2,E_2,LON,H_2,P_2,N_2,A_3,N_3,E_3,H_3,R_2,M_3,R_3,P_3,HAP,participate,insta_user_id,completed,start_q,end_q,network_id,P,E,R,M,A,PERMA,N_EMO,P_EMO,imagecount,private_account,user_id_x,emotion_score_acceptance,emotion_score_admiration,emotion_score_amazement,emotion_score_anger,emotion_score_annoyance,emotion_score_anticipation,emotion_score_apprehension,emotion_score_boredom,emotion_score_disgust,emotion_score_distraction,emotion_score_ecstasy,emotion_score_fear,emotion_score_grief,emotion_score_interest,emotion_score_joy,emotion_score_loathing,emotion_score_pensiveness,emotion_score_rage,emotion_score_sadness,emotion_score_serenity,emotion_score_surprise,emotion_score_terror,emotion_score_trust,emotion_score_vigilance,user_id_y,Alcohol,Animal,Art,Beverage,Blossom,Bottle,Bowl,Brochure,Cake,Canine,Car,Cat,Clothing,Collage,Couch,Crowd,Cup,Dessert,Dog,Drink,Electronics,Face,Female,Flora,Flower,Flyer,Food,Furniture,Glass,Goggles,Human,Mammal,Musical Instrument,Outdoors,Paper,People,Person,Pet,Plant,Portrait,Poster,Potted Plant,Selfie,Smile,Text,Tree,Vehicle,Water,user_id_x.1,face_id_0,face_id_10,face_id_20,face_id_30,face_id_40,face_id_50,face_id_60,face_id_70,face_id_80,user_id_y.1,anp_sentiment_bad_hair,anp_sentiment_bad_sign,anp_sentiment_baked_goods,anp_sentiment_beautiful_baby,anp_sentiment_big_glasses,anp_sentiment_classic_toy,anp_sentiment_colorful_hair,anp_sentiment_comic_life,anp_sentiment_comic_sans,anp_sentiment_compact_disc,anp_sentiment_crazy_face,anp_sentiment_cute_baby,anp_sentiment_cute_cat,anp_sentiment_cute_couple,anp_sentiment_cute_dog,anp_sentiment_cute_guy,anp_sentiment_cute_kitty,anp_sentiment_dark_chocolate,anp_sentiment_double_exposition,anp_sentiment_double_portrait,anp_sentiment_dyed_hair,anp_sentiment_dynamic_light,anp_sentiment_early_days,anp_sentiment_fake_lomo,anp_sentiment_fake_vintage,anp_sentiment_fitting_room,anp_sentiment_frozen_food,anp_sentiment_funny_cats,anp_sentiment_funny_dog,anp_sentiment_funny_quotes,anp_sentiment_funny_signs,anp_sentiment_funny_stuff,anp_sentiment_good_day,anp_sentiment_good_food,anp_sentiment_grand_baby,anp_sentiment_grilled_chicken,anp_sentiment_grumpy_cat,anp_sentiment_hairless_cat,anp_sentiment_happy_baby,anp_sentiment_happy_birthday,anp_sentiment_happy_couple,anp_sentiment_healthy_food,anp_sentiment_hot_drink,anp_sentiment_hot_guys,anp_sentiment_hot_men,anp_sentiment_hot_sauce,anp_sentiment_hot_site,anp_sentiment_impossible_project,anp_sentiment_inspirational_quotes,anp_sentiment_inspiring_quotes,anp_sentiment_late_dinner,anp_sentiment_mad_magazine,anp_sentiment_mixed_breed,anp_sentiment_natural_hair,anp_sentiment_old_friends,anp_sentiment_plastic_bullet,anp_sentiment_plastic_surgery,anp_sentiment_raw_food,anp_sentiment_raw_milk,anp_sentiment_real_food,anp_sentiment_real_talk,anp_sentiment_rounded_corners,anp_sentiment_sexy_boy,anp_sentiment_sexy_man,anp_sentiment_short_sale,anp_sentiment_short_stories,anp_sentiment_silly_faces,anp_sentiment_sleepy_dog,anp_sentiment_small_dog,anp_sentiment_smiling_baby,anp_sentiment_sour_cream,anp_sentiment_sticky_notes,anp_sentiment_straight_hair,anp_sentiment_sweet_baby,anp_sentiment_sweet_tea,anp_sentiment_temporary_tattoos,anp_sentiment_true_story,anp_sentiment_ugly_sweater,anp_sentiment_visual_identity,anp_sentiment_visual_journal,anp_sentiment_wise_words,user_id_x.2,aantal_personen,aantal_mannen,aantal_vrouwen,ratio_mannen,ratio_vrouwen,user_id_y.2,user_followed_by,user_follows,user_posted_photos,aantal_fotos,image_height,image_width,data_memorability,1977_filter,Aden_filter,Amaro_filter,Apollo_filter,Ashby_filter,Brannan_filter,Brooklyn_filter,Charmes_filter,Clarendon_filter,Crema_filter,Dogpatch_filter,Earlybird_filter,Gingham_filter,Ginza_filter,Gotham_filter,Hefe_filter,Helena_filter,Hudson_filter,Inkwell_filter,Juno_filter,Kelvin_filter,Lark_filter,Lo-fi_filter,Ludwig_filter,Maven_filter,Mayfair_filter,Moon_filter,Nashville_filter,Normal_filter,Perpetua_filter,Poprocket_filter,Reyes_filter,Rise_filter,Sierra_filter,Skyline_filter,Slumber_filter,Stinson_filter,Sutro_filter,Toaster_filter,Unknown_filter,Valencia_filter,Vesper_filter,Walden_filter,Willow_filter,X-Pro II_filter,user_id_x.3,comment_count,like_count,user_id_y.3,ANGRY,CALM,CONFUSED,DISGUSTED,HAPPY,SAD,SURPRISED
0,0,920bf027f7d13dbdc7b66b3d3324903c,Male,1975,College graduate,Employed for wages,"$30,000 to $39,999",4,5,5,3,4,5,6,6,5,5,6,5,6,3,3,4,4,4,4,4,4,2,3.0,Yes,619868570,True,2016-12-05 14:01:26,2016-12-05 14:02:52,124bd2ceb8,4.333333,4.0,4.666667,5.0,3.666667,4.25,4.0,4.333333,465.0,public,619868570,0.000179,0.000179,0.165485,0.016453,0.000179,0.000432,0.000179,0.007084,0.00052,0.000179,0.002174,0.010112,0.004191,0.060328,0.080558,0.000179,0.001136,0.01451,0.0681,0.016096,0.005229,0.015892,0.010134,0.000179,619868600.0,0.0,13.0,6.0,2.0,1.0,2.0,14.0,56.0,1.0,7.0,7.0,3.0,5.0,11.0,14.0,0.0,9.0,1.0,7.0,2.0,12.0,5.0,2.0,1.0,1.0,68.0,3.0,21.0,9.0,12.0,168.0,11.0,98.0,28.0,9.0,168.0,180.0,8.0,15.0,23.0,74.0,12.0,20.0,1.0,40.0,4.0,33.0,14.0,619868600.0,2.0,3.0,15.0,70.0,75.0,19.0,21.0,4.0,6.0,619868600.0,-0.010818,-0.030909,0.0,0.012788,0.000498,0.000675,0.0,0.003351,0.003221,0.000312,-0.00797,0.004632,0.0,0.002468,0.004935,0.008636,0.002537,0.0,-5.6e-05,-5.6e-05,0.0,0.004104,-4.3e-05,-0.001961,-0.019784,0.003074,-0.001056,0.004052,0.003377,0.019922,0.002364,0.006753,0.004173,0.0,0.029,3.5e-05,-0.003403,0.0,0.014909,0.0,0.0,0.0,0.0,0.000515,0.000515,0.000221,0.0,-0.00045,0.15639,0.058442,0.0,-0.001571,0.0,-0.000113,0.002351,0.000359,0.000727,-0.000814,-0.000394,-0.000104,-0.00487,0.000403,0.007818,0.069506,-0.004636,-0.001974,-0.007965,-0.010528,-0.0029,0.009602,0.0,-0.000234,0.0,0.003848,0.002532,0.000203,0.059424,0.0,-0.000303,0.0,0.034727,619868600.0,95.0,80.0,15.0,0.842105,0.157895,619868570,612.0,1987.0,469.0,1,606.774194,611.32043,0.796987,0,1,4,0,0,1,0,0,37,3,0,1,20,0,0,2,0,2,2,7,0,7,1,8,0,1,8,0,347,0,0,0,1,1,0,1,0,2,0,2,0,0,0,3,3,619868600.0,0.220472,3.590551,619868600.0,5.615921,3.951672,10.626774,1.330124,50.596171,9.203222,7.032123
1,1,b433b2bfe49e28d0b7c45925b53084e0,Male,1978,College graduate,Employed for wages,"$20,000 to $29,999",8,0,9,7,7,8,8,9,8,9,0,9,9,0,8,0,6,8,9,8,9,9,9.0,Yes,187920333,True,2016-12-05 14:01:52,2016-12-05 14:03:07,2a0f882bcd,9.0,7.333333,9.0,8.0,7.666667,8.25,0.0,9.0,6.0,public,187920333,0.0,0.0,0.144367,0.019433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0417,0.206483,0.0,0.0,0.0,0.067425,0.0,0.0,0.0,0.0,0.0,187920300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,187920300.0,1.0,1.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,187920300.0,-0.119,0.0,0.0,0.246167,0.003833,0.0,0.0,0.0,0.0,0.0,0.0,0.178333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000833,0.0,0.0,0.0,0.0,0.0,0.0,0.013,0.0,0.026,0.0,0.0,0.1595,0.0,0.0,0.0,0.191333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.008667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.004167,0.0,0.0,0.0,0.0,0.0,-0.038333,0.0,0.0,0.369667,0.0,0.0,0.0,0.0,0.0,0.0,0.123667,0.0,0.0,0.0,0.0,187920300.0,7.0,2.0,5.0,0.285714,0.714286,187920333,68.0,170.0,6.0,1,630.666667,630.666667,0.859884,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,2,1,0,0,0,0,187920300.0,0.666667,19.5,187920300.0,4.132142,0.932478,0.90472,0.241971,79.140063,0.670505,2.205702
2,4,f4f54676f75f47c17dc434cf68845328,Female,1990,High school graduate,Employed for wages,"$80,000 to $89,999",7,3,8,7,7,7,8,8,8,8,3,8,8,3,8,4,7,7,8,7,7,8,7.0,Yes,33420910,True,2016-12-05 14:02:42,2016-12-05 14:03:52,f3ff34e4cf,8.0,7.333333,7.666667,7.666667,7.333333,7.5625,3.333333,8.0,767.0,public,33420910,0.001145,0.0,0.162936,0.023056,0.000129,0.0,0.0,0.004578,0.00031,0.0,0.009873,0.006486,0.002041,0.05944,0.096236,0.0,0.00016,0.004283,0.061758,0.02028,0.003931,0.006559,0.008099,0.0,33420910.0,33.0,32.0,8.0,43.0,6.0,35.0,37.0,16.0,19.0,18.0,10.0,6.0,6.0,17.0,19.0,5.0,30.0,45.0,19.0,42.0,9.0,52.0,6.0,4.0,6.0,24.0,108.0,30.0,9.0,6.0,237.0,25.0,0.0,26.0,7.0,237.0,237.0,24.0,50.0,55.0,36.0,28.0,8.0,51.0,21.0,22.0,15.0,11.0,33420910.0,41.0,29.0,25.0,180.0,56.0,20.0,11.0,0.0,2.0,33420910.0,-0.006447,-0.012894,-0.001016,0.070016,0.000519,0.001937,7e-06,0.00233,0.001679,0.000379,-0.004749,0.048307,0.002573,0.018014,0.01544,0.001287,0.001323,-0.004059,-0.000147,-0.000205,-0.001129,0.00856,-6.8e-05,-0.009203,-0.012379,0.004808,-0.008813,0.000176,0.004226,0.001409,0.001409,0.003874,0.008704,0.012935,0.071289,0.000614,-0.007984,-0.002275,0.075151,0.01814,0.005851,0.021255,0.001698,0.00023,7.7e-05,0.001228,0.000576,-0.000117,0.009061,0.0,-5.4e-05,-0.001366,-9.9e-05,-5.9e-05,0.006129,0.001312,0.00019,-0.00679,-0.001027,-0.001788,-0.000395,0.00035,0.008153,0.006041,-0.002072,-0.000343,-0.010384,-0.006176,-0.004537,0.055074,-0.051454,-0.000488,-0.000163,0.014047,0.007043,0.00053,0.005862,-0.006605,-0.000158,-0.000237,0.003018,33420910.0,141.0,75.0,66.0,0.531915,0.468085,33420910,271.0,247.0,769.0,1,636.625815,629.100391,0.809493,2,1,3,0,0,16,0,0,6,3,1,1,0,1,0,61,1,8,10,5,2,5,97,12,0,12,0,4,455,0,0,0,4,9,0,1,0,12,0,8,11,1,1,1,13,33420910.0,1.509908,14.562748,33420910.0,4.030452,3.263048,4.08035,0.957494,64.020145,7.488966,13.042668
3,5,a27a5fc47a59f35761705330253a58e3,Male,1997,High school graduate,Employed for wages,"$20,000 to $29,999",7,5,7,4,7,6,7,4,6,4,6,8,6,7,3,6,6,6,4,7,3,5,6.0,Yes,2143580844,True,2016-12-05 14:02:54,2016-12-05 14:04:08,a48e51bacd,6.0,4.666667,3.666667,6.666667,5.666667,5.375,6.0,6.0,43.0,public,2143580844,0.0,0.0,0.150024,0.029102,0.0,0.0,0.0,0.006609,0.0,0.0,0.0101,0.008334,0.005233,0.051228,0.040426,0.0,0.0,0.013472,0.077951,0.008584,0.00733,0.010144,0.017384,0.0,2143581000.0,0.0,8.0,2.0,2.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,7.0,0.0,1.0,2.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,0.0,0.0,10.0,7.0,0.0,0.0,3.0,10.0,10.0,7.0,1.0,1.0,4.0,0.0,1.0,0.0,3.0,1.0,0.0,0.0,2143581000.0,0.0,0.0,1.0,2.0,3.0,4.0,0.0,0.0,2.0,2143581000.0,-0.01275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008857,0.000857,-0.009393,0.0,0.061071,0.020357,0.0,0.0,0.083714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.032643,0.0,-0.008714,0.008357,0.0,0.005571,0.002786,0.005571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000607,0.000607,0.000607,0.0,-0.001857,0.015929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.006714,0.0,-0.000857,0.0,0.0,0.0,0.031857,-0.010929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0025,-0.0025,0.0,2143581000.0,12.0,10.0,2.0,0.833333,0.166667,2143580844,13.0,13.0,43.0,1,606.116279,602.139535,0.798614,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,32,1,0,1,0,0,0,0,0,0,0,0,0,0,0,4,0,2143581000.0,0.575,4.275,2143581000.0,2.797384,5.104221,12.292257,0.0,37.738075,10.228524,11.990256
4,6,0a1002b2232a4ecbde604462f6d84bf9,Female,1993,College graduate,A student,"$20,000 to $29,999",7,5,7,4,6,7,7,7,6,7,5,6,2,2,8,2,7,6,6,7,5,8,,Yes,263042348,True,2016-12-05 14:02:36,2016-12-05 14:04:43,aae4bbb89a,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public,263042348,0.003263,0.0,0.17027,0.017107,0.0,0.0,0.0,0.002404,0.001027,0.0,0.0,0.00496,0.006137,0.058268,0.101779,0.0,0.0,0.005789,0.046979,0.008362,0.007489,0.010326,0.007973,0.0,263042300.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,4.0,0.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,2.0,0.0,2.0,2.0,1.0,1.0,0.0,2.0,47.0,1.0,1.0,0.0,2.0,47.0,47.0,0.0,2.0,18.0,11.0,1.0,3.0,18.0,3.0,2.0,4.0,1.0,263042300.0,0.0,6.0,25.0,98.0,22.0,2.0,1.0,0.0,0.0,263042300.0,-0.007933,0.0,0.0,0.032822,0.001533,0.0,0.0,0.0172,0.0,0.0,-0.005844,0.0,0.012667,0.025333,0.0,0.019,0.0,-0.001378,-0.000289,-0.000867,0.0,0.0,0.0,-0.010067,-0.040622,0.023667,0.0,0.0,0.0,0.001733,0.001733,0.0,0.0,0.021222,0.021267,0.000178,0.0,0.0,0.051022,0.0,0.0,0.019022,0.0,0.000756,0.0,0.0,0.000378,0.0,0.009911,0.008333,-0.000133,-0.002689,0.0,-0.000578,0.0,0.0,0.001867,-0.008356,-0.002022,-0.002133,-0.000556,0.001378,0.020067,0.019822,-0.0034,-0.003378,-0.040889,0.0,0.0,0.0,-0.017467,-0.0012,0.0,0.0,0.004333,0.0,0.041222,-0.009289,-0.003111,0.0,0.004244,263042300.0,84.0,14.0,70.0,0.166667,0.833333,263042348,316.0,347.0,73.0,1,628.109589,628.109589,0.822348,0,1,13,0,0,0,0,0,0,0,0,2,2,0,0,0,0,2,0,0,0,1,2,0,1,10,0,0,23,0,0,1,4,2,0,1,0,1,1,0,2,3,0,0,1,263042300.0,0.684932,17.328767,263042300.0,4.683774,1.57594,5.049674,0.86768,78.760601,5.323909,4.480929


In [16]:
platte_df.shape

(161, 282)

## Einde data aggregatie

In [None]:
# Merge them based on the image_id so that we have a large data frame containing all the elements

image_anp_frame = pd.merge(image_df, anp_df, how='inner', on='image_id')
im_anp_obj_frame = pd.merge(image_anp_frame, object_labels_df, how='inner', on='image_id')
im_anp_obj_face_frame = pd.merge(im_anp_obj_frame, face_df, how='inner', on='image_id')
data1 = pd.merge(im_anp_obj_frame, face_df, how='inner', on='image_id')
data = pd.merge(data1, metrics_df,how='inner', on='image_id' )

In [None]:
##Visualize the data
pd.options.display.max_rows  ## This is for showing all the text in the column (otherwise it will be half....)
pd.set_option('display.max_colwidth', -1)
data.head()

In [None]:
data.shape ## Koppeling is goed

In [None]:
metrics_df.shape

In [None]:
survey_df.shape

In [None]:
anp_df.shape

In [None]:
face_df.shape

In [None]:
image_df.shape

In [None]:
object_labels_df.shape

## We hebben 2 datasets nu data & survey_df

In [None]:
data.head()

In [None]:
data.columns.values.tolist()
data.user_id

In [None]:
# object_labels_df.groupby("image_id").count() ## inner voor object_labels is goed ( niet uniek op image_id)
# image_df.groupby("image_id").count() ## inner moeten we gebruiken ( deze is uniek op image_id)
# face_df.groupby("image_id").count() ## inner voor face_df ( niet uniek op image _id)
# anp_df.groupby('image_id').count() ## inner voor face_df ( niet uniek op image _id)
# metrics_df.groupby('image_id').count() niet uniek inner join
survey_df

In [None]:
survey_df.columns.values.tolist()
survey_df.insta_user_id[survey_df.insta_user_id == 263042348]

In [None]:
survey_df = survey_df.drop_duplicates(subset='insta_user_id', keep="first")

## First EDA and cleansing

In [None]:
## Overzichtelijk wat onze attributen zijn
data.info()

## Export data

In [None]:
#data.to_pickle('DFdata.pkl')

In [None]:
#survey_df.to_pickle('DFsurvey.pkl')

In [None]:
#print (pd.merge(data, survey_df, left_on='user_id', right_on='insta_user_id', how='left').drop('insta_user_id', axis=1))
survey_df['insta_user_id']=survey_df['insta_user_id'].apply(int)
data['user_id']=data['user_id'].apply(int)

In [None]:
permaMeta = pd.merge(data, survey_df, left_on='user_id', right_on='insta_user_id', how='outer')

In [None]:
permaMeta

In [None]:
# merged preprocessed data set
permaMeta = permaMeta[permaMeta['image_height'].isnull() == False]

In [None]:
permaMeta.info()

## Training

In [None]:
import sklearn

In [None]:
permaMeta = permaMeta.drop_duplicates(subset=None, keep='first', inplace=False)

In [None]:
#permaMeta["participate"] = permaMeta["participate"].cat.codes
columns = permaMeta.columns
objectsCol = permaMeta.select_dtypes(include=['object']).columns

for column in columns:
    if column not in ['image_id', 'image_link', 'image_url', 'user_full_name', 'user_name', 'user_bio']:
        try:
            permaMeta[column] = permaMeta[column].cat.codes
        except:
            continue
        
for Object in objectsCol:
    try:
        permaMeta[Object] = permaMeta[Object].astype('category').cat.codes
    except:
        continue
        