## Reading in the data

In [1]:
## Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix

import math



In [2]:
anp_df = pd.read_pickle(r'anp.pickle') #feather.read_dataframe('data_science_case/anp.feather')
face_df = pd.read_pickle(r'face.pickle') #feather.read_dataframe('data_science_case/face.feather')
image_df = pd.read_pickle(r'image_data.pickle') #feather.read_dataframe('data_science_case/image_data.feather')
metrics_df = pd.read_pickle(r'image_metrics.pickle') #feather.read_dataframe('data_science_case/image_metrics.feather')
object_labels_df = pd.read_pickle(r'object_labels.pickle') #feather.read_dataframe('data_science_case/object_labels.feather')
survey_df = pd.read_pickle(r'survey.pickle') #feather.read_dataframe('data_science_case/survey.feather')
celebrity_df = pd.read_pickle(r'celebrity.pickle') #feather.read_dataframe('data_science_case/survey.feather')


## Data aggregatie

In [108]:
#####################################
## OBJECTS DATA
#####################################
df = object_labels_df.groupby("data_amz_label").count().reset_index() ## 2101 possible objects , dit is te veel
df[df['image_id']>500].reset_index() ## 48 objecten hebben meer dan 500 observaties, laten we 48 kolommen maken
list_met_columns = df[df['image_id']>500].data_amz_label.tolist() ## een lijst met alle kollommen die we willen
df = object_labels_df[object_labels_df.data_amz_label.isin(list_met_columns)] ## Filter alle labels die niet vaker voor komen dan 500 keer resulteert in 108k regels
df = df[['image_id','data_amz_label']] ## selecting only relevant columns (every confidence is >0.7)
df_object = df.pivot_table(index='image_id', columns='data_amz_label', aggfunc=len, fill_value=0) ## Pivot count (looks like len)
df_object = df_object.reset_index()
df_object.head()

data_amz_label,image_id,Alcohol,Animal,Art,Beverage,Blossom,Bottle,Bowl,Brochure,Cake,...,Plant,Portrait,Poster,Potted Plant,Selfie,Smile,Text,Tree,Vehicle,Water
0,1000004599066965477_545497348,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000059425592054064_703978203,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,1000080765059521113_31736205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000097452173278518_206726006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000118559875482297_555477511,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [115]:
#####################################
## FACE --> EMO
#####################################
df = face_df[['image_id','face_id','face_emo','emo_confidence']] ## selecting the emotion's
emo_face =pd.pivot_table(df, index= ['image_id'],columns ='face_emo',values = 'emo_confidence').fillna(0) ## Gemiddelde emotie per foto
emo_face = emo_face.reset_index()
emo_face.head()


#####################################
## FACE --> PEOPLE en GENDER
#####################################
df_people_in_pic = face_df[['image_id','face_id']].drop_duplicates()
df_aantal_people_in_pic = df_people_in_pic.groupby(['image_id']).count().reset_index()
df_aantal_people_in_pic.columns = ['image_id', 'aantal_personen']
df_aantal_people_in_pic
df = face_df[['image_id','face_id','face_gender']].drop_duplicates()
df = df[df['face_gender'] == 'Male']  ## Select only males
df = df.groupby('image_id').count().reset_index()
df.columns = ['image_id', 'aantal_mannen', 'mag_weg']
df_aantal_mannen_in_pic = df[['image_id', 'aantal_mannen']]
people = pd.merge(df_aantal_people_in_pic, df_aantal_mannen_in_pic, how='left', on='image_id').fillna(0)
people['aantal_vrouwen'] = people.aantal_personen - people.aantal_mannen 
people['ratio_mannen']  = people.aantal_mannen / people.aantal_personen 
people['ratio_vrouwen']  = people.aantal_vrouwen / people.aantal_personen 

people.head()



#####################################
## FACE --> PEOPLE en GENDER
#####################################
df = face_df[['image_id','face_id','face_age_range_high','face_age_range_low']].drop_duplicates()
df['gem_leeftijd'] = (df.face_age_range_high + df.face_age_range_low ) / 2 
df = df[['image_id','face_id','gem_leeftijd']]

def roundup(x):
    return str(int((x+5)/10)*10) 

df['nearest_decade_leeftijd'] = df.gem_leeftijd.apply(roundup)
df = df[['image_id','face_id','nearest_decade_leeftijd']]
leeftijd_mensen_in_foto = df.pivot_table(index='image_id', columns='nearest_decade_leeftijd', aggfunc=len, fill_value=0) ## Pivot count (looks like len)
leeftijd_mensen_in_foto = leeftijd_mensen_in_foto.reset_index()
leeftijd_mensen_in_foto.columns = ['_'.join(col) for col in leeftijd_mensen_in_foto.columns]
leeftijd_mensen_in_foto.rename(columns={'image_id_': 'image_id'}, inplace=True)
leeftijd_mensen_in_foto.head() 

Unnamed: 0,image_id,face_id_0,face_id_10,face_id_20,face_id_30,face_id_40,face_id_50,face_id_60,face_id_70,face_id_80
0,1000126179441391393_30837828,0,2,0,3,3,0,0,0,0
1,1000135251972767429_31447990,0,0,0,1,0,0,0,0,0
2,1000159024449036821_265063047,0,0,0,2,0,0,0,0,0
3,1000185226634350811_276232195,0,0,0,0,1,0,0,0,0
4,1000204070831242247_143854846,0,0,0,1,0,0,0,0,0


face_emo,image_id,ANGRY,CALM,CONFUSED,DISGUSTED,HAPPY,SAD,SURPRISED
0,1000126179441391393_30837828,2.69576,2.688435,1.401609,0.0,93.731052,4.116577,4.390028
1,1000135251972767429_31447990,0.0,0.0,0.0,0.0,98.248589,0.289029,20.180923
2,1000159024449036821_265063047,0.0,0.0,13.949105,0.0,5.337354,0.0,47.158062
3,1000185226634350811_276232195,0.0,0.0,4.370034,6.414753,0.0,0.0,24.538427
4,1000204070831242247_143854846,0.0,0.0,10.848862,0.0,93.67659,26.206802,0.0


In [54]:
#####################################
## ANP --> ANP
#####################################
# anp_df.anp_label.value_counts() ## 3864 different lengts, laten we enkel pakken met count groter dan 500
df = anp_df[['image_id','anp_label','anp_sentiment']]
df_count = df.groupby("anp_label").count().reset_index() 
list_met_columns = df_count[df_count['image_id']>500].anp_label.tolist() ## een lijst met alle kollommen die we willen (81)

df = df[df.anp_label.isin(list_met_columns)] ## Filter alle labels die niet vaker voor komen dan 500 keer resulteert in 108k regels
anp = df.pivot_table(index='image_id', columns='anp_label', fill_value=0) ## Pivot mean is default
anp = anp.reset_index()
anp.columns = ['_'.join(col) for col in anp.columns]
anp.rename(columns={'image_id_': 'image_id'}, inplace=True)
anp.head()


#####################################
## ANP --> Sentiment
#####################################
# anp_df.emotion_label.value_counts() ## 24 different lengts
df = anp_df[['image_id','emotion_label','emotion_score']]
emotion_score = df.pivot_table(index='image_id', columns='emotion_label', fill_value=0) ## Pivot mean is default
emotion_score = emotion_score.reset_index()
emotion_score.columns = ['_'.join(col) for col in emotion_score.columns]
emotion_score.rename(columns={'image_id_': 'image_id'}, inplace=True)
emotion_score.head()




Unnamed: 0,image_id,emotion_score_acceptance,emotion_score_admiration,emotion_score_amazement,emotion_score_anger,emotion_score_annoyance,emotion_score_anticipation,emotion_score_apprehension,emotion_score_boredom,emotion_score_disgust,...,emotion_score_joy,emotion_score_loathing,emotion_score_pensiveness,emotion_score_rage,emotion_score_sadness,emotion_score_serenity,emotion_score_surprise,emotion_score_terror,emotion_score_trust,emotion_score_vigilance
0,1000004599066965477_545497348,0.0,0.0,0.196933,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1126,0.0,0.0,0.0,0.0,0.0
1,1000059425592054064_703978203,0.0,0.0,0.3468,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.11555,0.0,0.2026,0.0,0.0,0.0
2,1000065308809420330_46329534,0.0,0.0,0.171525,0.0,0.0987,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000080765059521113_31736205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.21565,0.0,0.0,0.0,0.1234,0.0,0.0,0.0,0.0,0.0
4,1000097452173278518_206726006,0.0,0.0,0.1981,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4013,0.0,0.0,0.0,0.09745,0.0,0.0,0.0,0.0,0.0


In [6]:
#####################################
## IMAGE
#####################################

image_df.image_id.is_unique ## Image is unique

True

In [113]:
#####################################
## Metrix
#####################################
metrics_df = metrics_df.groupby('image_id').max() ## op een of andere manier werkt drop_duplicates niet vaagg...
metrics_df = metrics_df.reset_index()
metrics_df.head()

Unnamed: 0,image_id,comment_count,comment_count_time_created,like_count,like_count_time_created
0,1000004599066965477_545497348,4.0,20-06-2017 00:36:18,43.0,20-06-2017 00:36:18
1,1000059425592054064_703978203,1.0,19-06-2017 23:32:56,8.0,19-06-2017 23:32:56
2,1000065308809420330_46329534,0.0,19-06-2017 23:18:22,8.0,19-06-2017 23:18:22
3,1000080765059521113_31736205,1.0,19-06-2017 21:01:13,13.0,19-06-2017 21:01:13
4,1000097452173278518_206726006,2.0,19-06-2017 22:35:11,27.0,19-06-2017 22:35:11


In [8]:
#####################################
## IMAGE
#####################################
# survey_df.info()

survey_df.id.is_unique ## Image is unique

True

In [None]:
#####################################
## CELEB
#####################################
# celebrity_df.image_id.is_unique
# celebrity_df.head(20)
# celebrity_df.face_celebrity_name.value_counts() #2177 counts
# GRootste is 20 en gaat heel snel naar 4/5 hier zit geen voorspelende waarde in!

In [15]:
#####################################
## Analyse op aantallen
#####################################

emo_face.shape#(15957, 7)   NIet elke foto heeft mensen erin
df_object.shape#(31051, 48) Niet elke foto heeft een getraind object, gelukkig wel veel gevonden
emotion_score.shape#(41292, 24) Voor paar fotos niet gevuld
leeftijd_mensen_in_foto.shape#(15957, 9)   NIet elke foto heeft mensen erin
people.shape#(15957, 6)   NIet elke foto heeft mensen erin
metrics_df.shape#(35803, 4) ##RAAR dat dit niet voor elke gevuld is.... Denk dat de data die we hebben gekregen beetje corrupt is, zaten ook duplicates tussen
image_df.shape#(41206, 18) ## dit zijn alle fotos
anp.shape#(22377, 82) ## We hebben hierin gesneden dus dit kan kloppen

(41206, 18)

In [126]:
#####################################
## User id uit elke extracten
#####################################
f = lambda x: x["image_id"].split("_")[1]  ## Maak functie aan


emo_face['user_id'] = emo_face.apply(f, axis=1)
df_object['user_id'] =df_object.apply(f, axis=1)
emotion_score['user_id']= emotion_score.apply(f, axis=1)
leeftijd_mensen_in_foto['user_id']= leeftijd_mensen_in_foto.apply(f, axis=1)
people['user_id']= people.apply(f, axis=1)
metrics_df['user_id']=metrics_df.apply(f, axis=1)
image_df['user_id']=image_df.apply(f, axis=1)
anp['user_id']=anp.apply(f, axis=1)


In [146]:
#####################################
## Aggegreren op user_id
#####################################
emo_user =emo_face.loc[:, emo_face != 'image_id'].groupby('user_id').mean()
object_user = df_object.loc[:, df_object.columns != 'image_id'].groupby('user_id').sum()
emo_user =emotion_score.loc[:, emotion_score.columns != 'image_id'].groupby('user_id').mean()
leeftijd_mensen_in_foto_user = leeftijd_mensen_in_foto.loc[:, leeftijd_mensen_in_foto.columns != 'image_id'].groupby('user_id').sum()
anp_user =anp.loc[:, anp.columns != 'image_id'].groupby('user_id').mean()

In [135]:
#####################TO DO###########################


- emo user kan maar gekoppeld worden aan 24 user_ids
-leeftijd_mensen_in_foto_user ratio in plaats van sum


People , image_df, metrics_df



data_amz_label,Alcohol,Animal,Art,Beverage,Blossom,Bottle,Bowl,Brochure,Cake,Canine,...,Portrait,Poster,Potted Plant,Selfie,Smile,Text,Tree,Vehicle,Water,user_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,545497348
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,703978203
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31736205
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,206726006
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,555477511
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30837828
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31447990
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,265063047
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,276232195
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1097967773


## Einde data aggregatie

In [None]:
# Merge them based on the image_id so that we have a large data frame containing all the elements

image_anp_frame = pd.merge(image_df, anp_df, how='inner', on='image_id')
im_anp_obj_frame = pd.merge(image_anp_frame, object_labels_df, how='inner', on='image_id')
im_anp_obj_face_frame = pd.merge(im_anp_obj_frame, face_df, how='inner', on='image_id')
data1 = pd.merge(im_anp_obj_frame, face_df, how='inner', on='image_id')
data = pd.merge(data1, metrics_df,how='inner', on='image_id' )

In [None]:
##Visualize the data
pd.options.display.max_rows  ## This is for showing all the text in the column (otherwise it will be half....)
pd.set_option('display.max_colwidth', -1)
data.head()

In [None]:
data.shape ## Koppeling is goed

In [None]:
metrics_df.shape

In [None]:
survey_df.shape

In [None]:
anp_df.shape

In [None]:
face_df.shape

In [None]:
image_df.shape

In [None]:
object_labels_df.shape

## We hebben 2 datasets nu data & survey_df

In [None]:
data.head()

In [None]:
data.columns.values.tolist()
data.user_id

In [None]:
# object_labels_df.groupby("image_id").count() ## inner voor object_labels is goed ( niet uniek op image_id)
# image_df.groupby("image_id").count() ## inner moeten we gebruiken ( deze is uniek op image_id)
# face_df.groupby("image_id").count() ## inner voor face_df ( niet uniek op image _id)
# anp_df.groupby('image_id').count() ## inner voor face_df ( niet uniek op image _id)
# metrics_df.groupby('image_id').count() niet uniek inner join
survey_df

In [None]:
survey_df.columns.values.tolist()
survey_df.insta_user_id[survey_df.insta_user_id == 263042348]

In [None]:
survey_df = survey_df.drop_duplicates(subset='insta_user_id', keep="first")

## First EDA and cleansing

In [None]:
## Overzichtelijk wat onze attributen zijn
data.info()

## Export data

In [None]:
#data.to_pickle('DFdata.pkl')

In [None]:
#survey_df.to_pickle('DFsurvey.pkl')

In [None]:
#print (pd.merge(data, survey_df, left_on='user_id', right_on='insta_user_id', how='left').drop('insta_user_id', axis=1))
survey_df['insta_user_id']=survey_df['insta_user_id'].apply(int)
data['user_id']=data['user_id'].apply(int)

In [None]:
permaMeta = pd.merge(data, survey_df, left_on='user_id', right_on='insta_user_id', how='outer')

In [None]:
permaMeta

In [None]:
# merged preprocessed data set
permaMeta = permaMeta[permaMeta['image_height'].isnull() == False]

In [None]:
permaMeta.info()

## Training

In [None]:
import sklearn

In [None]:
permaMeta = permaMeta.drop_duplicates(subset=None, keep='first', inplace=False)

In [None]:
#permaMeta["participate"] = permaMeta["participate"].cat.codes
columns = permaMeta.columns
objectsCol = permaMeta.select_dtypes(include=['object']).columns

for column in columns:
    if column not in ['image_id', 'image_link', 'image_url', 'user_full_name', 'user_name', 'user_bio']:
        try:
            permaMeta[column] = permaMeta[column].cat.codes
        except:
            continue
        
for Object in objectsCol:
    try:
        permaMeta[Object] = permaMeta[Object].astype('category').cat.codes
    except:
        continue
        