In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from matplotlib import pyplot as plt

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 80)
pd.set_option('display.max_colwidth', 50)

## Exploratory Analysis of Profile's DataFrame

In [3]:
profile_df = pd.read_csv('profiles.csv')
profile_df.head()

Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,ethnicity,height,income,job,last_online,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,22,a little extra,strictly anything,socially,never,working on college/university,about me:<br />\n<br />\ni would love to think...,currently working as an international agent fo...,making people laugh.<br />\nranting about a go...,"the way i look. i am a six foot half asian, ha...","books:<br />\nabsurdistan, the republic, of mi...",food.<br />\nwater.<br />\ncell phone.<br />\n...,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet!<br />\nyou...,"asian, white",75.0,-1,transportation,2012-06-28-20-30,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
1,35,average,mostly other,often,sometimes,working on space camp,i am a chef: this is what that means.<br />\n1...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories.<br /...,,,i am very open and will share just about anyth...,,white,70.0,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
2,38,thin,anything,socially,,graduated from masters program,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement<br />\nconversation<br />\ncreation<b...,,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ...",,68.0,-1,,2012-06-27-09-10,"san francisco, california",,straight,has cats,,m,pisces but it doesn&rsquo;t matter,no,"english, french, c++",available
3,23,thin,vegetarian,socially,,working on college/university,i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,"bataille, celine, beckett. . .<br />\nlynch, j...",,cats and german philosophy,,,you feel so inclined.,white,71.0,20000,student,2012-06-28-14-22,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
4,29,athletic,,socially,never,graduated from college/university,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at:<br />\nhttp://bag...,i smile a lot and my inquisitive nature,"music: bands, rappers, musicians<br />\nat the...",,,,,,"asian, black, other",66.0,-1,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",,straight,likes dogs and likes cats,,m,aquarius,no,english,single


In [4]:
profile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   body_type    54650 non-null  object 
 2   diet         35551 non-null  object 
 3   drinks       56961 non-null  object 
 4   drugs        45866 non-null  object 
 5   education    53318 non-null  object 
 6   essay0       54458 non-null  object 
 7   essay1       52374 non-null  object 
 8   essay2       50308 non-null  object 
 9   essay3       48470 non-null  object 
 10  essay4       49409 non-null  object 
 11  essay5       49096 non-null  object 
 12  essay6       46175 non-null  object 
 13  essay7       47495 non-null  object 
 14  essay8       40721 non-null  object 
 15  essay9       47343 non-null  object 
 16  ethnicity    54266 non-null  object 
 17  height       59943 non-null  float64
 18  income       59946 non-null  int64  
 19  job 

## Data Cleansing 

In [5]:
# generating unique values of each columns
for col in profile_df.columns:
    print('Column '+ col)
    print(profile_df[col].unique())
    print()

Column age
[ 22  35  38  23  29  32  31  24  37  28  30  39  33  26  27  20  25  40
  36  21  34  43  46  41  42  45  18  55  50  59  44  48  54  51  62  52
  19  58  66  53  63  47  49  61  60  57  56  65  64  68 110  69  67 109]

Column body_type
['a little extra' 'average' 'thin' 'athletic' 'fit' nan 'skinny' 'curvy'
 'full figured' 'jacked' 'rather not say' 'used up' 'overweight']

Column diet
['strictly anything' 'mostly other' 'anything' 'vegetarian' nan
 'mostly anything' 'mostly vegetarian' 'strictly vegan'
 'strictly vegetarian' 'mostly vegan' 'strictly other' 'mostly halal'
 'other' 'vegan' 'mostly kosher' 'strictly halal' 'halal'
 'strictly kosher' 'kosher']

Column drinks
['socially' 'often' 'not at all' 'rarely' nan 'very often' 'desperately']

Column drugs
['never' 'sometimes' nan 'often']

Column education
['working on college/university' 'working on space camp'
 'graduated from masters program' 'graduated from college/university'
 'working on two-year college' nan 'grad

["books:<br />\nabsurdistan, the republic, of mice and men (only book that made me\nwant to cry), catcher in the rye, the prince.<br />\n<br />\nmovies:<br />\ngladiator, operation valkyrie, the producers, down periscope.<br />\n<br />\nshows:<br />\nthe borgia, arrested development, game of thrones, monty\npython<br />\n<br />\nmusic:<br />\naesop rock, hail mary mallon, george thorogood and the delaware\ndestroyers, felt<br />\n<br />\nfood:<br />\ni'm down for anything."
 "i am die hard christopher moore fan. i don't really watch a lot of\ntv unless there is humor involved. i am kind of stuck on 90's\nalternative music. i am pretty much a fan of everything though... i\ndo need to draw a line at most types of electronica."
 'okay this is where the cultural matrix gets so specific, it\'s like\nbeing in the crosshairs.<br />\n<br />\nfor what it\'s worth, i find myself reading more non-fiction than\nfiction. it\'s usually some kind of philosophy, art, or science text\nby silly authors 

In [6]:
#replacing NaN Values in the body_type and diet columns
profile_df = profile_df.fillna(value = {'body_type' : 'rather not say', 'diet' : 'other'})

In [7]:
#removing a record with index 2512 because it is contains unnecessary data, generated by the user
profile_df.drop(profile_df.index[2512], inplace = True)

In [8]:
#removing a record with index 2512 because it is contains unnecessary data, generated by the user
profile_df.drop(profile_df.index[25324], inplace = True)

In [9]:
#replacing NaN Values in the drinks columns
profile_df = profile_df.fillna(value = {'drinks': 'socially'})

In [10]:
#replacing NaN Values in the drugs columns
profile_df = profile_df.fillna(value = {'drugs': 'never'})

In [11]:
#replacing value in the column ethnicity - 'hispanic/latin' to just 'latin' to achieve simplicity
profile_df['ethnicity'].replace(to_replace = 'hispanic / latin', value = 'latin', inplace = True)

In [12]:
#replacing NaN Values in the ethnicity columns
profile_df = profile_df.fillna(value = {'ethnicity': 'prefer not to mention'})

In [13]:
#since in ethnicity columns has values with more than 2 ethnicities
#I made the assumptions that users who input more than or equal to 2 ethnicities are born with 'mixed' ethnicities
#Therefore, I replace all of these values with just 'mixed' as it is much simpler than having values like 'white, asian, latin'
for x in profile_df['ethnicity'].values:
    if (',' in x) == True:
        profile_df['ethnicity'].replace(to_replace = x, value = 'mixed', inplace = True)

In [14]:
#replacing NaN Values in the height columns with the average ages across the whole dataset
profile_df = profile_df.fillna(value = {'height': 68.})

In [15]:
# there are 6 records in the dataset with the height less than 26 inches
# I made the assumptions that these records are not valid and therefore, its best to remove them
profile_df.drop(profile_df[profile_df['height'] < 26.].index, inplace = True)

In [16]:
# just resetting the index number of this dataset
profile_df.reset_index(drop = True)

Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,ethnicity,height,income,job,last_online,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,22,a little extra,strictly anything,socially,never,working on college/university,about me:<br />\n<br />\ni would love to think...,currently working as an international agent fo...,making people laugh.<br />\nranting about a go...,"the way i look. i am a six foot half asian, ha...","books:<br />\nabsurdistan, the republic, of mi...",food.<br />\nwater.<br />\ncell phone.<br />\n...,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet!<br />\nyou...,mixed,75.0,-1,transportation,2012-06-28-20-30,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
1,35,average,mostly other,often,sometimes,working on space camp,i am a chef: this is what that means.<br />\n1...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories.<br /...,,,i am very open and will share just about anyth...,,white,70.0,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
2,38,thin,anything,socially,never,graduated from masters program,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement<br />\nconversation<br />\ncreation<b...,,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ...",prefer not to mention,68.0,-1,,2012-06-27-09-10,"san francisco, california",,straight,has cats,,m,pisces but it doesn&rsquo;t matter,no,"english, french, c++",available
3,23,thin,vegetarian,socially,never,working on college/university,i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,"bataille, celine, beckett. . .<br />\nlynch, j...",,cats and german philosophy,,,you feel so inclined.,white,71.0,20000,student,2012-06-28-14-22,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
4,29,athletic,other,socially,never,graduated from college/university,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at:<br />\nhttp://bag...,i smile a lot and my inquisitive nature,"music: bands, rappers, musicians<br />\nat the...",,,,,,mixed,66.0,-1,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",,straight,likes dogs and likes cats,,m,aquarius,no,english,single
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59933,59,rather not say,other,socially,never,graduated from college/university,"vibrant, expressive, caring optimist. i love b...",the happiest times have been when life came to...,i make an outstanding osso bucco. i am also ve...,"i am told that people notice my smile, eyes an...",i am an avid movie watcher and follow the broa...,"my family, my dog, italy, words and music!",writing my book.,"running with my dog, finishing up the work wee...",i have a dream to sing at the alconquin in nyc...,you are seeking a long term connection of shar...,prefer not to mention,62.0,-1,sales / marketing / biz dev,2012-06-12-21-47,"oakland, california",has kids,straight,has dogs,catholicism but not too serious about it,f,cancer and it&rsquo;s fun to think about,no,english,single
59934,24,fit,mostly anything,often,sometimes,working on college/university,i'm nick.<br />\ni never know what to write ab...,currently finishing school for film production...,"<a class=""ilink"" href=""/interests?i=filmmaking...","dude, i don't know.",<strong>movies</strong>:<br />\nhook (the grea...,iphone<br />\ncontact lenses<br />\nheadphones...,i do most of my thinking on the bus to/from wo...,"bringin' home bacon, or drinking and shakin'!",when i was 18 i got a tattoo of waldo somewher...,meh if you made it this far you might as well.,mixed,72.0,-1,entertainment / media,2012-06-29-11-01,"san francisco, california",doesn&rsquo;t have kids,straight,likes dogs and likes cats,agnosticism,m,leo but it doesn&rsquo;t matter,no,english (fluently),single
59935,42,average,mostly anything,not at all,never,graduated from masters program,"hello! i enjoy traveling, watching movies, and...","i'm a civil engineer, who enjoys helping the c...",- looking at things objectively<br />\n- getti...,i'm quiet until i get used to the environment ...,"last book: ""game change"".<br />\nmovies: bourn...",- iphone<br />\n- friends and family<br />\n- ...,"aside from work, how to improve my home.",out enjoying friendly conversation over dinner.,please let me think about this more.,we have similar interests.,asian,71.0,100000,construction / craftsmanship,2012-06-27-23-37,"south san francisco, california",doesn&rsquo;t have kids,straight,,christianity but not too serious about it,m,sagittarius but it doesn&rsquo;t matter,no,english (fluently),single
59936,27,athletic,mostly anything,socially,often,working on college/university,"""all i have in this world are my balls and my ...","following my dreams...<br />\n""you got a dream...",listening,it used to be the hair until i mowed it off bu...,where to begin musically: right now i listen t...,"music, family, friends, a basketball, hoop, so...",what can i do to make someone chuckle....,what i would do on any other day. everydays a ...,i like walking around in other people's house ...,you are interested and interesting...,mixed,73.0,-1,medicine / health,2012-06-23-13-01,"san francisco, california","doesn&rsquo;t have kids, but wants them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,leo and it&rsquo;s fun to think about,trying to quit,"english (fluently), spanish (poorly), chinese ...",single


In [17]:
#replacing non-valid values in the income column, from value '-1' to value '0'
#it does not make sense to have an income less than 0
profile_df['income'].replace(to_replace = -1, value = 0, inplace = True)

In [18]:
# replacing NaN values in the job column
profile_df = profile_df.fillna(value = {'job': 'rather not say'})

In [19]:
# tidying the values in the offspring column as some records contains the word 'doesn&rsquo;t', which does not make sense
for y in profile_df['offspring'].values:
    if y == 'doesn&rsquo;t have kids':
        profile_df['offspring'].replace(to_replace = y, value = 'does not have kids', inplace = True)
    elif y == 'doesn&rsquo;t have kids, but might want them':
        profile_df['offspring'].replace(to_replace = y, value = 'does not have kids, but might want them', inplace = True)
    elif y == 'doesn&rsquo;t have kids, but wants them':
        profile_df['offspring'].replace(to_replace = y, value = 'does not have kids, but wants them', inplace = True)
    elif y == 'doesn&rsquo;t want kids':
        profile_df['offspring'].replace(to_replace = y, value = 'does not want kids', inplace = True)
    elif y == 'doesn&rsquo;t have kids, and doesn&rsquo;t want any':
        profile_df['offspring'].replace(to_replace = y, value = 'does not have nor want any kids', inplace = True)
    elif y == 'has kids, but doesn&rsquo;t want more':
        profile_df['offspring'].replace(to_replace = y, value = 'has kids, but does not want more', inplace = True)
    elif y == 'has a kid, but doesn&rsquo;t want more':
        profile_df['offspring'].replace(to_replace = y, value = 'has a kid, but does not want more', inplace = True)

In [20]:
#replacing NaN values with 'other pets'. I made the assumption that these users with NaN values in the pets column, have other pets
profile_df = profile_df.fillna(value = {'pets' : 'other pets'})

In [21]:
# replacing NaN values in the religion column with the input 'prefer not to mention'
profile_df = profile_df.fillna(value = {'religion' : 'prefer not to mention'})

In [22]:
for z in profile_df['religion'].values:
    if ('agnosticism' in z) == True:
        profile_df['religion'].replace(to_replace = z, value = 'agnosticism', inplace = True)
    elif ('other' in z) == True: 
        profile_df['religion'].replace(to_replace = z, value = 'other', inplace = True)
    elif ('atheism' in z) == True: 
        profile_df['religion'].replace(to_replace = z, value = 'atheism', inplace = True)
    elif ('christianity' in z) == True: 
        profile_df['religion'].replace(to_replace = z, value = 'christianity', inplace = True)
    elif ('judaism' in z) == True: 
        profile_df['religion'].replace(to_replace = z, value = 'judaism', inplace = True)
    elif ('catholicism' in z) == True: 
        profile_df['religion'].replace(to_replace = z, value = 'catholicism', inplace = True)
    elif ('buddhism' in z) == True: 
        profile_df['religion'].replace(to_replace = z, value = 'buddhism', inplace = True)
    elif ('hinduism' in z) == True: 
        profile_df['religion'].replace(to_replace = z, value = 'hinduism', inplace = True)
    elif ('islam' in z) == True: 
        profile_df['religion'].replace(to_replace = z, value = 'islam', inplace = True)
        

In [23]:
# filling NaN values in the sign column with the value of 'prefer not to mention'
# Assuming that the users wanted to keep this personal
profile_df = profile_df.fillna(value = {'sign': 'prefer not to mention'})

In [24]:
# aggregate the values in the sign column, regardless what the users think or feel about their own sign
for a in profile_df['sign'].values:
    if ('gemini' in a) == True:
        profile_df['sign'].replace(to_replace = a, value = 'gemini', inplace = True)
    elif ('scorpio' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'scorpio', inplace = True)
    elif ('leo' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'leo', inplace = True)
    elif ('libra' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'libra', inplace = True)
    elif ('taurus' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'taurus', inplace = True)
    elif ('cancer' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'cancer', inplace = True)
    elif ('pisces' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'pisces', inplace = True)
    elif ('sagittarius' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'sagittarius', inplace = True)
    elif ('virgo' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'virgo', inplace = True)
    elif ('aries' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'aries', inplace = True)
    elif ('aquarius' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'aquarius', inplace = True)
    elif ('capricorn' in a) == True: 
        profile_df['sign'].replace(to_replace = a, value = 'capricorn', inplace = True)

In [25]:
# filling NaN values in the smokes column with the value of 'prefer not to mention'
profile_df = profile_df.fillna(value = {'smokes': 'prefer not to mention'})

In [26]:
# Mapping smokes categorical responses to numeric values
# 'sometimes' 'no' nan 'when drinking' 'yes' 'trying to quit'
smoke_mapping = {"prefer not to mention": 0, "no": 1, "sometimes": 2, "when drinking": 3, 
                 "trying to quit": 4, "yes": 4}

profile_df["smokes_code"] = profile_df['smokes'].map(smoke_mapping)

In [27]:
# Mapping drink categorical responses to numeric values
drink_mapping = {"not at all": 0, "rarely": 1, "socially": 2, "often": 3,
                 "very often": 4, "desperately": 5}

profile_df["drinks_code"] = profile_df['drinks'].map(drink_mapping)

In [28]:
#filling NaN values in the education column with the value of 'no education'
profile_df = profile_df.fillna(value = {'education' : 'no education'})

In [29]:
# mapping the values in the education column 
education_map = {'working on college/university' : 'college/university', 'working on space camp' : 'space camp',
                 'graduated from masters program' : 'Masters Graduate', 'graduated from college/university' : 'College/University Graduate',
                 'working on two-year college' : 'two-year college', 
                 'graduated from high school' : 'High School Graduate',
                 'working on masters program' : 'masters program', 
                 'graduated from space camp' : 'Space Camp Graduate', 
                 'dropped out of space camp': 'drop-out',
                 'graduated from ph.d program': 'PhD Graduate', 
                 'graduated from law school': 'Law School Graduate',
                 'working on ph.d program': 'ph.d program',
                 'graduated from two-year college': 'Two-Year College Graduate',
                 'working on med school': 'med school',
                 'dropped out of college/university': 'drop-out',
                 'graduated from med school': 'Med School Graduate',
                 'dropped out of high school': 'drop-out',
                 'working on high school': 'high school',
                 'dropped out of ph.d program': 'drop-out',
                 'dropped out of two-year college': 'drop-out',
                 'dropped out of med school': 'drop-out',
                 'working on law school': 'law school',
                 'dropped out of masters program': 'drop-out',
                 'dropped out of law school': 'drop-out'}

profile_df['education'] = profile_df['education'].map(education_map)

In [30]:
# Cleaning the NaN values in the essay response columns
essay_cols = ["essay0","essay1","essay2","essay3","essay4","essay5","essay6","essay7","essay8","essay9"]
for essay in essay_cols:
    profile_df[essay].replace(np.nan, " ", inplace = True)

In [31]:
# creating a new column called 'all essay', which contains the combination of all responses 
profile_df['all_essay'] = profile_df[essay_cols].apply(lambda x: ' '.join(x), axis=1)

In [32]:
# creating another new column called 'essay total length', which contains the legnth of the total responses. 
profile_df['essay_total_length'] = profile_df['all_essay'].apply(lambda x: len(x))

In [33]:
profile_df.head(10)

Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,ethnicity,height,income,job,last_online,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status,smokes_code,drinks_code,all_essay,essay_total_length
0,22,a little extra,strictly anything,socially,never,college/university,about me:<br />\n<br />\ni would love to think...,currently working as an international agent fo...,making people laugh.<br />\nranting about a go...,"the way i look. i am a six foot half asian, ha...","books:<br />\nabsurdistan, the republic, of mi...",food.<br />\nwater.<br />\ncell phone.<br />\n...,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet!<br />\nyou...,mixed,75.0,0,transportation,2012-06-28-20-30,"south san francisco, california","does not have kids, but might want them",straight,likes dogs and likes cats,agnosticism,m,gemini,sometimes,english,single,2,2,about me:<br />\n<br />\ni would love to think...,2644
1,35,average,mostly other,often,sometimes,space camp,i am a chef: this is what that means.<br />\n1...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories.<br /...,,,i am very open and will share just about anyth...,,white,70.0,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","does not have kids, but might want them",straight,likes dogs and likes cats,agnosticism,m,cancer,no,"english (fluently), spanish (poorly), french (...",single,1,3,i am a chef: this is what that means.<br />\n1...,1457
2,38,thin,anything,socially,never,Masters Graduate,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement<br />\nconversation<br />\ncreation<b...,,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ...",prefer not to mention,68.0,0,rather not say,2012-06-27-09-10,"san francisco, california",,straight,has cats,prefer not to mention,m,pisces,no,"english, french, c++",available,1,2,"i'm not ashamed of much, but writing public te...",5518
3,23,thin,vegetarian,socially,never,college/university,i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,"bataille, celine, beckett. . .<br />\nlynch, j...",,cats and german philosophy,,,you feel so inclined.,white,71.0,20000,student,2012-06-28-14-22,"berkeley, california",does not want kids,straight,likes cats,prefer not to mention,m,pisces,no,"english, german (poorly)",single,1,2,i work in a library and go to school. . . read...,480
4,29,athletic,other,socially,never,College/University Graduate,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at:<br />\nhttp://bag...,i smile a lot and my inquisitive nature,"music: bands, rappers, musicians<br />\nat the...",,,,,,mixed,66.0,0,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",,straight,likes dogs and likes cats,prefer not to mention,m,aquarius,no,english,single,1,2,hey how's it going? currently vague on the pro...,730
5,29,average,mostly anything,socially,never,College/University Graduate,"i'm an australian living in san francisco, but...",building awesome stuff. figuring out what's im...,imagining random shit. laughing at aforementio...,i have a big smile. i also get asked if i'm we...,"books: to kill a mockingbird, lord of the ring...","like everyone else, i love my friends and fami...",what my contribution to the world is going to ...,out with my friends!,i cried on my first day at school because a bi...,you're awesome.,white,67.0,0,computer / hardware / software,2012-06-29-19-18,"san francisco, california","does not have kids, but might want them",straight,likes cats,atheism,m,taurus,no,"english (fluently), chinese (okay)",single,1,2,"i'm an australian living in san francisco, but...",2469
6,32,fit,strictly anything,socially,never,College/University Graduate,life is about the little things. i love to lau...,digging up buried treasure,frolicking<br />\nwitty banter<br />\nusing my...,i am the last unicorn,i like books. ones with pictures. reading them...,laughter<br />\namazing people in my life<br /...,synchronicity<br />\n<br />\nthere is this who...,plotting to take over the world with my army o...,my typical friday night,,mixed,65.0,0,rather not say,2012-06-25-20-45,"san francisco, california",,straight,likes dogs and likes cats,prefer not to mention,f,virgo,prefer not to mention,english,single,0,2,life is about the little things. i love to lau...,1918
7,31,average,mostly anything,socially,never,College/University Graduate,,"writing. meeting new people, spending time wit...","remembering people's birthdays, sending cards,...",i'm rather approachable (a byproduct of being ...,"i like: alphabetized lists, aquariums, autobio...","friends, family, notebook/pen, books, music, t...",things that amuse and inspire me,out and about or relaxing at home with a good ...,,,white,65.0,0,artistic / musical / writer,2012-06-29-12-30,"san francisco, california","does not have kids, but wants them",straight,likes dogs and likes cats,christianity,f,sagittarius,no,"english, spanish (okay)",single,1,2,"writing. meeting new people, spending time w...",1243
8,24,rather not say,strictly anything,socially,never,College/University Graduate,,"oh goodness. at the moment i have 4 jobs, so i...",,i'm freakishly blonde and have the same name a...,i am always willing to try new foods and am no...,sports/my softball glove<br />\ncoffee. becaus...,,"in or out... drinking with friends, maybe a ba...",potential friends/lovers/people who come in co...,http://www.youtube.com/watch?v=4dxbwzuwsxk let...,white,67.0,0,rather not say,2012-06-29-23-39,"belvedere tiburon, california",does not have kids,straight,likes dogs and likes cats,christianity,f,gemini,when drinking,english,single,3,2,"oh goodness. at the moment i have 4 jobs, so...",2250
9,37,athletic,mostly anything,not at all,never,two-year college,my names jake.<br />\ni'm a creative guy and i...,i have an apartment. i like to explore and che...,i'm good at finding creative solutions to prob...,i'm short,i like some tv. i love summer heights high and...,"music, my guitar<br />\ncontrast<br />\ngood f...",<strong><em>you should</em></strong>,<strong><em>send a message</em></strong>,<em><strong>and say hi.</strong></em>,you can rock the bells,white,65.0,0,student,2012-06-28-21-08,"san mateo, california",,straight,likes dogs and likes cats,atheism,m,cancer,no,english (fluently),single,1,0,my names jake.<br />\ni'm a creative guy and i...,2420


In [34]:
# creating another new column called ' average_word_length', which contains the average length of responses that each user gives
def average_word(words):
    count = 0
    word_length = len(words)
    word_list = words.split()
    for word in word_list:
        count += len(word)
    average = count/word_length
    return average

profile_df['average_word_length'] = profile_df['all_essay'].apply(lambda x: average_word(x))

profile_df.head()


Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,ethnicity,height,income,job,last_online,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status,smokes_code,drinks_code,all_essay,essay_total_length,average_word_length
0,22,a little extra,strictly anything,socially,never,college/university,about me:<br />\n<br />\ni would love to think...,currently working as an international agent fo...,making people laugh.<br />\nranting about a go...,"the way i look. i am a six foot half asian, ha...","books:<br />\nabsurdistan, the republic, of mi...",food.<br />\nwater.<br />\ncell phone.<br />\n...,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet!<br />\nyou...,mixed,75.0,0,transportation,2012-06-28-20-30,"south san francisco, california","does not have kids, but might want them",straight,likes dogs and likes cats,agnosticism,m,gemini,sometimes,english,single,2,2,about me:<br />\n<br />\ni would love to think...,2644,0.816944
1,35,average,mostly other,often,sometimes,space camp,i am a chef: this is what that means.<br />\n1...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories.<br /...,,,i am very open and will share just about anyth...,,white,70.0,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","does not have kids, but might want them",straight,likes dogs and likes cats,agnosticism,m,cancer,no,"english (fluently), spanish (poorly), french (...",single,1,3,i am a chef: this is what that means.<br />\n1...,1457,0.804393
2,38,thin,anything,socially,never,Masters Graduate,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement<br />\nconversation<br />\ncreation<b...,,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ...",prefer not to mention,68.0,0,rather not say,2012-06-27-09-10,"san francisco, california",,straight,has cats,prefer not to mention,m,pisces,no,"english, french, c++",available,1,2,"i'm not ashamed of much, but writing public te...",5518,0.838891
3,23,thin,vegetarian,socially,never,college/university,i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,"bataille, celine, beckett. . .<br />\nlynch, j...",,cats and german philosophy,,,you feel so inclined.,white,71.0,20000,student,2012-06-28-14-22,"berkeley, california",does not want kids,straight,likes cats,prefer not to mention,m,pisces,no,"english, german (poorly)",single,1,2,i work in a library and go to school. . . read...,480,0.825
4,29,athletic,other,socially,never,College/University Graduate,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at:<br />\nhttp://bag...,i smile a lot and my inquisitive nature,"music: bands, rappers, musicians<br />\nat the...",,,,,,mixed,66.0,0,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",,straight,likes dogs and likes cats,prefer not to mention,m,aquarius,no,english,single,1,2,hey how's it going? currently vague on the pro...,730,0.830137


In [35]:
# I also created an extra column called 'count word i, I and me'
# This column contains the number of times the user used the word i, I and me

def count_word(words):
    count = 0
    alphabet = ['i', 'I', 'me']
    for element in alphabet:
        count += words.count(element)
    return count

profile_df['count_word_i_and_me'] = profile_df['all_essay'].apply(lambda x: count_word(x))

profile_df.head()

Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,ethnicity,height,income,job,last_online,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status,smokes_code,drinks_code,all_essay,essay_total_length,average_word_length,count_word_i_and_me
0,22,a little extra,strictly anything,socially,never,college/university,about me:<br />\n<br />\ni would love to think...,currently working as an international agent fo...,making people laugh.<br />\nranting about a go...,"the way i look. i am a six foot half asian, ha...","books:<br />\nabsurdistan, the republic, of mi...",food.<br />\nwater.<br />\ncell phone.<br />\n...,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet!<br />\nyou...,mixed,75.0,0,transportation,2012-06-28-20-30,"south san francisco, california","does not have kids, but might want them",straight,likes dogs and likes cats,agnosticism,m,gemini,sometimes,english,single,2,2,about me:<br />\n<br />\ni would love to think...,2644,0.816944,159
1,35,average,mostly other,often,sometimes,space camp,i am a chef: this is what that means.<br />\n1...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories.<br /...,,,i am very open and will share just about anyth...,,white,70.0,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","does not have kids, but might want them",straight,likes dogs and likes cats,agnosticism,m,cancer,no,"english (fluently), spanish (poorly), french (...",single,1,3,i am a chef: this is what that means.<br />\n1...,1457,0.804393,103
2,38,thin,anything,socially,never,Masters Graduate,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,okay this is where the cultural matrix gets so...,movement<br />\nconversation<br />\ncreation<b...,,viewing. listening. dancing. talking. drinking...,"when i was five years old, i was known as ""the...","you are bright, open, intense, silly, ironic, ...",prefer not to mention,68.0,0,rather not say,2012-06-27-09-10,"san francisco, california",,straight,has cats,prefer not to mention,m,pisces,no,"english, french, c++",available,1,2,"i'm not ashamed of much, but writing public te...",5518,0.838891,372
3,23,thin,vegetarian,socially,never,college/university,i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,"bataille, celine, beckett. . .<br />\nlynch, j...",,cats and german philosophy,,,you feel so inclined.,white,71.0,20000,student,2012-06-28-14-22,"berkeley, california",does not want kids,straight,likes cats,prefer not to mention,m,pisces,no,"english, german (poorly)",single,1,2,i work in a library and go to school. . . read...,480,0.825,32
4,29,athletic,other,socially,never,College/University Graduate,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at:<br />\nhttp://bag...,i smile a lot and my inquisitive nature,"music: bands, rappers, musicians<br />\nat the...",,,,,,mixed,66.0,0,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",,straight,likes dogs and likes cats,prefer not to mention,m,aquarius,no,english,single,1,2,hey how's it going? currently vague on the pro...,730,0.830137,39


In [36]:
# replacing NaN values in the speaks column with empty space
profile_df['speaks'] = profile_df['speaks'].replace(np.nan, " ")

In [37]:
# filling NaN values in the offspring column with the assumption that the affected user has not thought about kids at all!
profile_df = profile_df.fillna(value = {'offspring' : 'has not thought about it yet'})

In [38]:
# filling NaN values in the education column with the assumption that the affected user prefer to keep his education information confidential
profile_df = profile_df.fillna(value = {'education' : 'prefer not to mention'})

In [39]:
profile_df['drugs'].value_counts()

never        51797
sometimes     7732
often          409
Name: drugs, dtype: int64

In [40]:
profile_df['drugs'].isnull().value_counts()

False    59938
Name: drugs, dtype: int64

In [41]:
drug_map = {'never' : 0, 'sometimes' : 1, 'often' : 2}
profile_df['drugs_code'] = profile_df['drugs'].map(drug_map)

In [42]:
profile_df.to_csv('cleansed_profile.csv', index = False, header = True)