In [220]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from pprint import pprint
import scipy as sp

In [221]:
# Directory (relative to this notebook) that holds the two input CSV files.
PATH = '../'

# Both inputs are semicolon-separated.
images = pd.read_csv(PATH + "image_frame.csv", sep=";")
survey = pd.read_csv(PATH + "survey_frame.csv", sep=";")

# Inner join (the pandas default): an image row is kept only when its
# `user_id` matches a survey row's `insta_user_id`.
both = images.merge(survey, left_on="user_id", right_on="insta_user_id")

In [222]:
# Columns of the merged frame that are carried into the selfie table.
selfies_columns = [
    'PERMA', 'image_id',
    'face_sunglasses', 'face_beard', 'face_mustache', 'eyeglasses',
    'user_followed_by', 'user_follows', 'user_posted_photos',
    'insta_user_id', 'gender', 'born',
    'emotion_score', 'education', 'income',
]

In [223]:
# Lookup from the survey's income bracket label to a single numeric amount.
# Closed brackets use (roughly) their midpoint; the open-ended top bracket
# is mapped to its lower bound.
income_dict = dict([
    ('Less than $10,000', 5000),
    ('$10,000 to $19,999', 15000),
    ('$20,000 to $29,999', 25000),
    ('$30,000 to $39,999', 35000),
    ('$40,000 to $49,999', 45000),
    ('$50,000 to $59,999', 55000),
    ('$60,000 to $69,999', 65000),
    ('$70,000 to $79,999', 75000),
    ('$80,000 to $89,999', 85000),
    ('$90,000 to $99,999', 95000),
    ('$100,000 to $149,999', 125000),
    ('$150,000 or more', 150000),
])

In [224]:
# Build the selfie table from the merged frame: keep rows labelled 'Selfie'
# whose respondent disclosed an income, restricted to `selfies_columns`.
# A single .loc selection followed by .copy() gives an independent frame, so
# the column assignments below never write into a slice of `both`
# (which is what triggers pandas' SettingWithCopyWarning).
is_selfie = both['data_amz_label'] == 'Selfie'
income_disclosed = both['income'] != "I'd rather not disclose this information"
selfies = both.loc[is_selfie & income_disclosed, selfies_columns].copy()

# Boolean indicator columns; the comparisons already produce booleans, so no
# np.where(..., True, False) wrapper is needed.
selfies['female'] = selfies['gender'] == 'Female'

selfies['college'] = selfies['education'] == 'College graduate'
selfies['high_school'] = selfies['education'] == 'High school graduate'
selfies['post_graduate'] = selfies['education'] == 'Post graduate degree'

# Translate income bracket labels to numbers with a vectorised lookup instead
# of a row-wise apply. Labels that are not keys of `income_dict` (including
# missing values) still raise KeyError rather than silently turning into NaN.
income_amount = selfies['income'].map(income_dict)
unknown_income = selfies.loc[income_amount.isnull(), 'income'].unique()
if len(unknown_income) > 0:
    raise KeyError(
        'income labels missing from income_dict: {!r}'.format(list(unknown_income))
    )
selfies['income'] = income_amount.astype('int64')

# The raw categorical columns are now encoded by the indicators above; drop
# them and then remove rows that have become exact duplicates.
selfies = selfies.drop(['gender', 'education'], axis=1).drop_duplicates()

In [225]:
# Collapse the selfie table to one row per image_id: emotion_score is
# averaged over the image's rows, every other column takes its maximum.
image_aggregations = {'emotion_score': 'mean'}
for max_column in [
    'face_sunglasses',
    'face_beard',
    'face_mustache',
    'eyeglasses',
    'PERMA',
    'user_followed_by',
    'user_follows',
    'user_posted_photos',
    'insta_user_id',
    'born',
    'income',
    'female',
    'college',
    'high_school',
    'post_graduate',
]:
    image_aggregations[max_column] = 'max'

rows_by_image = selfies.groupby('image_id', as_index=False)
selfies = rows_by_image.agg(image_aggregations)

In [226]:
# Persist the per-image selfie table as a semicolon-separated UTF-8 file in
# the working directory. The frame's index is written as well (pandas'
# default), so the file starts with an unnamed index column.
selfies.to_csv(
    'selfies_frame.csv',
    sep=';',
    encoding='utf-8',
)

In [227]:
# Roll the per-image table up to one row per insta_user_id:
#   - face/eyewear columns -> mean over the user's images
#   - image_id             -> count of the user's images
#   - user-level columns   -> max
#   - emotion_score        -> mean
user_aggregations = {}
for mean_column in ['face_sunglasses', 'face_beard', 'face_mustache', 'eyeglasses']:
    user_aggregations[mean_column] = 'mean'
user_aggregations['image_id'] = 'count'
for max_column in [
    'PERMA',
    'user_followed_by',
    'user_follows',
    'user_posted_photos',
    'born',
    'income',
    'female',
    'college',
    'high_school',
    'post_graduate',
]:
    user_aggregations[max_column] = 'max'
user_aggregations['emotion_score'] = 'mean'

images_by_user = selfies.groupby('insta_user_id', as_index=False)
users = images_by_user.agg(user_aggregations)

In [228]:
# Print the per-user table's column dtypes and non-null counts.
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 0 to 90
Data columns (total 17 columns):
insta_user_id         91 non-null int64
face_sunglasses       91 non-null float64
user_posted_photos    91 non-null float64
born                  91 non-null int64
face_mustache         91 non-null float64
post_graduate         91 non-null bool
user_followed_by      91 non-null float64
face_beard            91 non-null float64
image_id              91 non-null int64
eyeglasses            91 non-null float64
college               91 non-null bool
female                91 non-null bool
income                91 non-null int64
emotion_score         91 non-null float64
PERMA                 91 non-null int64
high_school           91 non-null bool
user_follows          91 non-null float64
dtypes: bool(4), float64(8), int64(5)
memory usage: 10.3 KB


In [237]:
# These three columns came out of the aggregation as float64 (see the
# previous info() output); cast each one to integer, then re-print the
# summary to confirm the new dtypes.
for count_column in ('user_follows', 'user_followed_by', 'user_posted_photos'):
    users[count_column] = users[count_column].astype(int)
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 0 to 90
Data columns (total 17 columns):
insta_user_id         91 non-null int64
face_sunglasses       91 non-null float64
user_posted_photos    91 non-null int64
born                  91 non-null int64
face_mustache         91 non-null float64
post_graduate         91 non-null bool
user_followed_by      91 non-null int64
face_beard            91 non-null float64
image_id              91 non-null int64
eyeglasses            91 non-null float64
college               91 non-null bool
female                91 non-null bool
income                91 non-null int64
emotion_score         91 non-null float64
PERMA                 91 non-null int64
high_school           91 non-null bool
user_follows          91 non-null int64
dtypes: bool(4), float64(5), int64(8)
memory usage: 10.3 KB


In [238]:
# Persist the per-user table as a semicolon-separated UTF-8 file in the
# working directory. The frame's index is written as well (pandas' default),
# so the file starts with an unnamed index column.
users.to_csv(
    'users_frame.csv',
    sep=';',
    encoding='utf-8',
)