In [1]:
# Dependencies and Setup
%matplotlib inline
%config InlineBackend.figure_format='svg'
from IPython.display import display,HTML
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from prettypandas import PrettyPandas
sns.set_style("ticks")
sns.set_context(context="notebook",font_scale=1)

In [2]:
# Read OKCupid Data file
df = pd.read_csv("profiles.csv")
print("The dataset contains {} records".format(len(df)))

# Show dataframe and print dataframe info
males = df[df["sex"]=="m"] # male users
females = df[df["sex"]=="f"] # female users
print("{} males ({:.1%}), {} females ({:.1%})".format(
    len(males),len(males)/len(df),
    len(females),len(females)/len(df)))

df = df.drop(['essay0', 'essay1', 'essay2', 
         'essay3', 'essay4', 'essay4', 
         'essay5', 'essay6', 'essay7', 
         'essay8', 'essay9'], axis=1)
df.head()

The dataset contains 59946 records
35829 males (59.8%), 24117 females (40.2%)


Unnamed: 0,age,body_type,diet,drinks,drugs,education,ethnicity,height,income,job,...,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,22,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,-1,transportation,...,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
1,35,average,mostly other,often,sometimes,working on space camp,white,70.0,80000,hospitality / travel,...,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
2,38,thin,anything,socially,,graduated from masters program,,68.0,-1,,...,"san francisco, california",,straight,has cats,,m,pisces but it doesn&rsquo;t matter,no,"english, french, c++",available
3,23,thin,vegetarian,socially,,working on college/university,white,71.0,20000,student,...,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
4,29,athletic,,socially,never,graduated from college/university,"asian, black, other",66.0,-1,artistic / musical / writer,...,"san francisco, california",,straight,likes dogs and likes cats,,m,aquarius,no,english,single


In [3]:
# Changed all -1 values in income to 0
df = df.replace({-1: None})
df.head()

Unnamed: 0,age,body_type,diet,drinks,drugs,education,ethnicity,height,income,job,...,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,22,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,,transportation,...,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
1,35,average,mostly other,often,sometimes,working on space camp,white,70.0,80000.0,hospitality / travel,...,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
2,38,thin,anything,socially,,graduated from masters program,,68.0,,,...,"san francisco, california",,straight,has cats,,m,pisces but it doesn&rsquo;t matter,no,"english, french, c++",available
3,23,thin,vegetarian,socially,,working on college/university,white,71.0,20000.0,student,...,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
4,29,athletic,,socially,never,graduated from college/university,"asian, black, other",66.0,,artistic / musical / writer,...,"san francisco, california",,straight,likes dogs and likes cats,,m,aquarius,no,english,single


In [4]:
# Create dataframe that gets rid of rows where income is not reported
income = df[df.income.notna()]
income.head()

Unnamed: 0,age,body_type,diet,drinks,drugs,education,ethnicity,height,income,job,...,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
1,35,average,mostly other,often,sometimes,working on space camp,white,70.0,80000,hospitality / travel,...,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
3,23,thin,vegetarian,socially,,working on college/university,white,71.0,20000,student,...,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
11,28,average,mostly anything,socially,never,graduated from college/university,white,72.0,40000,banking / financial / real estate,...,"daly city, california",,straight,likes cats,christianity and very serious about it,m,leo but it doesn&rsquo;t matter,no,"english (fluently), sign language (poorly)",seeing someone
13,30,skinny,mostly anything,socially,never,graduated from high school,white,66.0,30000,sales / marketing / biz dev,...,"san francisco, california",,straight,has dogs and likes cats,christianity but not too serious about it,f,,no,english,single
14,29,thin,mostly anything,socially,never,working on college/university,"hispanic / latin, white",62.0,50000,other,...,"san leandro, california","doesn&rsquo;t have kids, but wants them",straight,likes dogs and has cats,catholicism,f,taurus,no,english,single


# Religion & Job/Occupation: Cephra

In [5]:
# Create database for religion analysis
religion_df = df
religion_df['religion'].unique()

array(['agnosticism and very serious about it',
       'agnosticism but not too serious about it', nan, 'atheism',
       'christianity', 'christianity but not too serious about it',
       'atheism and laughing about it',
       'christianity and very serious about it', 'other', 'catholicism',
       'catholicism but not too serious about it',
       'catholicism and somewhat serious about it',
       'agnosticism and somewhat serious about it',
       'catholicism and laughing about it',
       'agnosticism and laughing about it', 'agnosticism',
       'atheism and somewhat serious about it',
       'buddhism but not too serious about it',
       'other but not too serious about it', 'buddhism',
       'other and laughing about it',
       'judaism but not too serious about it',
       'buddhism and laughing about it',
       'other and somewhat serious about it',
       'other and very serious about it',
       'hinduism but not too serious about it',
       'atheism but not too ser

In [6]:
# Clean up religion column
rel_vals_to_replace = {'agnosticism and very serious about it': 'agnosticism', 
                   'agnosticism but not too serious about it': 'agnosticism',
                   'agnosticism and somewhat serious about it': 'agnosticism',
                   'agnosticism and laughing about it': 'agnosticism',
                   'atheism and laughing about it': 'atheism',
                   'atheism and somewhat serious about it': 'atheism',
                   'atheism but not too serious about it': 'atheism',
                   'atheism and very serious about it': 'atheism',
                   'buddhism but not too serious about it': 'buddhism',
                   'buddhism and somewhat serious about it': 'buddhism',
                   'buddhism and very serious about it': 'buddhism',
                   'buddhism and laughing about it': 'buddhism',
                   'christianity and very serious about it': 'christianity',
                   'christianity but not too serious about it': 'christianity',
                   'christianity and somewhat serious about it': 'christianity',
                   'christianity and laughing about it': 'christianity',
                   'catholicism but not too serious about it': 'catholicism',
                   'catholicism and somewhat serious about it': 'catholicism',
                   'catholicism and very serious about it': 'catholicism',
                   'catholicism and laughing about it': 'catholicism',
                   'other and somewhat serious about it': 'other',
                   'other and very serious about it': 'other',
                   'other but not too serious about it': 'other',
                   'other and laughing about it': 'other',
                   'judaism and laughing about it': 'judaism',
                   'judaism and very serious about it': 'judaism',
                   'judaism and somewhat serious about it': 'judaism',
                   'judaism but not too serious about it': 'judaism',
                   'hinduism and somewhat serious about it': 'hinduism',
                   'hinduism and very serious about it': 'hinduism',
                   'hinduism but not too serious about it': 'hinduism',
                   'hinduism and laughing about it': 'hinduism',
                   'islam but not too serious about it': 'islam',
                   'islam and laughing about it': 'islam',
                   'islam and somewhat serious about it': 'islam',
                   'islam and very serious about it': 'islam',
                  }
religion_df['religion'] = religion_df['religion'].map(rel_vals_to_replace)
religion_df.religion.unique()

array(['agnosticism', nan, 'christianity', 'atheism', 'catholicism',
       'buddhism', 'other', 'judaism', 'hinduism', 'islam'], dtype=object)

In [7]:
# Drop all rows that have none for religion
religion_df = df.replace({'NaN': None})
religion_df = df[df.religion.notna()]
religion_df

Unnamed: 0,age,body_type,diet,drinks,drugs,education,ethnicity,height,income,job,...,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,22,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,,transportation,...,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism,m,gemini,sometimes,english,single
1,35,average,mostly other,often,sometimes,working on space camp,white,70.0,80000,hospitality / travel,...,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
8,24,,strictly anything,socially,,graduated from college/university,white,67.0,,,...,"belvedere tiburon, california",doesn&rsquo;t have kids,straight,likes dogs and likes cats,christianity,f,gemini but it doesn&rsquo;t matter,when drinking,english,single
9,37,athletic,mostly anything,not at all,never,working on two-year college,white,65.0,,student,...,"san mateo, california",,straight,likes dogs and likes cats,atheism,m,cancer but it doesn&rsquo;t matter,no,english (fluently),single
11,28,average,mostly anything,socially,never,graduated from college/university,white,72.0,40000,banking / financial / real estate,...,"daly city, california",,straight,likes cats,christianity,m,leo but it doesn&rsquo;t matter,no,"english (fluently), sign language (poorly)",seeing someone
13,30,skinny,mostly anything,socially,never,graduated from high school,white,66.0,30000,sales / marketing / biz dev,...,"san francisco, california",,straight,has dogs and likes cats,christianity,f,,no,english,single
15,39,fit,strictly anything,socially,,graduated from college/university,white,65.0,,medicine / health,...,"san francisco, california",doesn&rsquo;t have kids,straight,likes dogs and has cats,atheism,f,aquarius but it doesn&rsquo;t matter,no,"english (fluently), spanish (okay)",single
19,33,athletic,mostly anything,socially,never,graduated from masters program,white,72.0,,science / tech / engineering,...,"san francisco, california",doesn&rsquo;t have kids,straight,likes dogs and likes cats,catholicism,m,pisces and it&rsquo;s fun to think about,no,english (fluently),single
21,22,athletic,,socially,never,working on college/university,,67.0,,student,...,"san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,catholicism,f,taurus but it doesn&rsquo;t matter,no,"english (fluently), spanish (fluently), italia...",single
22,30,fit,mostly anything,socially,never,graduated from college/university,white,69.0,,executive / management,...,"san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism,m,sagittarius but it doesn&rsquo;t matter,no,"english, spanish (poorly)",single


In [8]:
# Create dataframe showing relationship between religion and jobs
rel_v_job = religion_df.groupby(['religion', 'job']).count().unstack(level=1)
rel_v_job

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,age,age,...,status,status,status,status,status,status,status,status,status,status
job,artistic / musical / writer,banking / financial / real estate,clerical / administrative,computer / hardware / software,construction / craftsmanship,education / academia,entertainment / media,executive / management,hospitality / travel,law / legal services,...,military,other,political / government,rather not say,retired,sales / marketing / biz dev,science / tech / engineering,student,transportation,unemployed
religion,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
agnosticism,513.0,166.0,87.0,726.0,83.0,341.0,304.0,248.0,162.0,144.0,...,15.0,744.0,74.0,37.0,16.0,416.0,632.0,587.0,28.0,34.0
atheism,409.0,106.0,64.0,764.0,59.0,236.0,190.0,143.0,85.0,93.0,...,12.0,561.0,50.0,31.0,16.0,227.0,642.0,522.0,23.0,32.0
buddhism,180.0,48.0,22.0,119.0,29.0,136.0,52.0,68.0,22.0,29.0,...,5.0,211.0,16.0,10.0,8.0,77.0,116.0,125.0,11.0,5.0
catholicism,166.0,191.0,100.0,219.0,83.0,260.0,135.0,158.0,114.0,111.0,...,23.0,493.0,69.0,36.0,13.0,391.0,224.0,322.0,49.0,23.0
christianity,182.0,206.0,69.0,217.0,103.0,218.0,129.0,200.0,76.0,95.0,...,40.0,536.0,51.0,35.0,27.0,397.0,255.0,418.0,39.0,37.0
hinduism,15.0,22.0,1.0,68.0,,9.0,10.0,21.0,3.0,8.0,...,,13.0,2.0,4.0,1.0,28.0,53.0,41.0,,
islam,7.0,3.0,1.0,7.0,2.0,1.0,5.0,4.0,5.0,2.0,...,1.0,17.0,1.0,,,1.0,7.0,15.0,2.0,1.0
judaism,181.0,93.0,18.0,205.0,15.0,214.0,113.0,118.0,47.0,105.0,...,5.0,305.0,28.0,14.0,17.0,231.0,179.0,185.0,7.0,7.0
other,639.0,108.0,74.0,345.0,137.0,297.0,192.0,195.0,143.0,81.0,...,7.0,937.0,63.0,70.0,50.0,282.0,299.0,377.0,47.0,46.0


# Astrological Sign & Job/Occupation: Cephra

In [9]:
sign_df = df
sign_df['sign'].unique()

array(['gemini', 'cancer', 'pisces but it doesn&rsquo;t matter', 'pisces',
       'aquarius', 'taurus', 'virgo', 'sagittarius',
       'gemini but it doesn&rsquo;t matter',
       'cancer but it doesn&rsquo;t matter',
       'leo but it doesn&rsquo;t matter', nan,
       'aquarius but it doesn&rsquo;t matter',
       'aries and it&rsquo;s fun to think about',
       'libra but it doesn&rsquo;t matter',
       'pisces and it&rsquo;s fun to think about', 'libra',
       'taurus but it doesn&rsquo;t matter',
       'sagittarius but it doesn&rsquo;t matter',
       'scorpio and it matters a lot',
       'gemini and it&rsquo;s fun to think about',
       'leo and it&rsquo;s fun to think about',
       'cancer and it&rsquo;s fun to think about',
       'libra and it&rsquo;s fun to think about',
       'aquarius and it&rsquo;s fun to think about',
       'virgo but it doesn&rsquo;t matter',
       'scorpio and it&rsquo;s fun to think about',
       'capricorn but it doesn&rsquo;t matter', 'sc

In [10]:
# Clean up astrology ('sign') column
sign_vals_to_replace = {'pisces but it doesn&rsquo;t matter':'pisces',
                       'gemini but it doesn&rsquo;t matter': 'gemini',
                       'cancer but it doesn&rsquo;t matter': 'cancer',
                       'leo but it doesn&rsquo;t matter': 'leo',
                       'aquarius but it doesn&rsquo;t matter': 'aquarius',
                       'aries and it&rsquo;s fun to think about': 'aries',
                       'libra but it doesn&rsquo;t matter': 'libra',
                       'pisces and it&rsquo;s fun to think about': 'pisces',
                       'taurus but it doesn&rsquo;t matter': 'taurus',
                       'sagittarius but it doesn&rsquo;t matter': 'sagittarius',
                       'scorpio and it matters a lot': 'scorpio',
                       'gemini and it&rsquo;s fun to think about': 'gemini',
                       'leo and it&rsquo;s fun to think about': 'leo',
                       'cancer and it&rsquo;s fun to think about': 'cancer',
                       'libra and it&rsquo;s fun to think about': 'libra',
                       'aquarius and it&rsquo;s fun to think about': 'aquarius',
                       'virgo but it doesn&rsquo;t matter': 'virgo',
                       'scorpio and it&rsquo;s fun to think about': 'scorpio',
                       'capricorn but it doesn&rsquo;t matter': 'capricorn',
                       'capricorn and it&rsquo;s fun to think about': 'capricorn',
                       'aries but it doesn&rsquo;t matter': 'aries',
                       'scorpio but it doesn&rsquo;t matter': 'scorpio',
                       'sagittarius and it&rsquo;s fun to think about': 'sagittarius',
                       'libra and it matters a lot': 'libra',
                       'taurus and it&rsquo;s fun to think about': 'taurus',
                       'leo and it matters a lot': 'leo',
                       'virgo and it&rsquo;s fun to think about': 'virgo',
                       'cancer and it matters a lot': 'cancer',
                       'pisces and it matters a lot': 'pisces',
                       'aries and it matters a lot': 'aries',
                       'capricorn and it matters a lot': 'capricorn',
                       'aquarius and it matters a lot': 'aquarius',
                       'sagittarius and it matters a lot': 'sagittarius',
                       'gemini and it matters a lot': 'gemini',
                       'taurus and it matters a lot': 'taurus',
                       'virgo and it matters a lot': 'virgo'
                  }
sign_df['sign'] = sign_df['sign'].map(sign_vals_to_replace)
sign_df.sign.unique()

array([nan, 'pisces', 'gemini', 'cancer', 'leo', 'aquarius', 'aries',
       'libra', 'taurus', 'sagittarius', 'scorpio', 'virgo', 'capricorn'],
      dtype=object)

In [None]:
religion

In [8]:
#%% Investigate heights vs sex vs age
g=df.groupby(["sex","age"])["income"].mean()
fig,(ax1,ax2)=plt.subplots(ncols=2,sharex=True,figsize=(10,3))
ax1.plot(g["m"],color="b")
ax1.set_xlim(18,50)
ax1.set_ylim(0,100000)
ax1.set(title="Average income vs age for males",
        ylabel="income",
        xlabel="age")
ax2.plot(g["f"],color="r")
ax2.set_xlim(18,50)
ax2.set_ylim(0,100000)
ax2.set(title="Average income vs age for females",
        ylabel="income",
        xlabel="age");
for ax in (ax1,ax2):
    sns.despine(ax=ax)
fig.tight_layout()

DataError: No numeric types to aggregate

In [None]:
fig,(ax1,ax2) = plt.subplots(ncols=2,figsize=(10,3),sharey=True,sharex=True)
sns.distplot(m["income"], ax=ax1,
             bins=range(20000,1000000),
             kde=False,
             color="b")
ax1.set_title("income distribution for males")
sns.distplot(f["income"], ax=ax2,
             bins=range(20000,1000000),
             kde=False,
             color="r")
ax2.set_title("income distribution for females")
ax1.set_ylabel("Number of users in income group")
for ax in (ax1,ax2):
    sns.despine(ax=ax)
fig.tight_layout()



# New Assignments for Team
 - Jarvis to focus on lat/lng & plotting geolocation (Geography)
 - Ryan to focus on jobs vs. drugs, drinks, smoking, sexual orientation (Substance Use)
 - Tati to focus on jobs vs. age, income, education (Economics)
 - Cephra to focus on religion, astrological sign, offspring (Spiritual)

#age	body_type	diet	drinks	drugs	education	ethnicity	height	income	job	last_online	location	offspring	orientation	pets	religion	sex	sign	smokes	speaks	status


#age vs drinks vs drug
#age vs religion
#location vs sign
#age vs smoke
#height vs income
#income vs drinks vs drug
#income vs education
#body_type vs location
#ethnicity vs job or income
    


