In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_columns = 50
pd.options.display.width = 1000

In [2]:
raw = open('/Users/Dongzhe/Desktop/cs249/Final_Part01/Personality/PersonalityRawData.csv', 'r')
PersonalityRawData = raw.readlines()
raw.close()

len(PersonalityRawData)

49160

In [3]:
import re
bad_data_patterns = [ r',,',   # missing values
                     r'0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,',
                     r'1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,',
                     r'2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,',  # too many instances of a score
                     r'3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,',
                     r'4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,',
                     r'5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5',
                     r'^[^1]*[A-Z]',
                     r'^[^2]*[A-Z]',
                     r'^[^3]*[A-Z]',  # no instances of a score
                     r'^[^4]*[A-Z]',
                     r'^[^5]*[A-Z]',
                     r',A1,',
                     r',A2,',   # bad country codes
                     r',O1,', ]

compiled_bad_data_patterns = [re.compile(pattern) for pattern in bad_data_patterns]

def not_bad(line):
    for pattern in compiled_bad_data_patterns:
        if pattern.search(line):
            return False
    return True

In [4]:
clean_lines = [PersonalityRawData[0]]   #  Header line in the .csv file
for line in PersonalityRawData[1:]:
    if not_bad(line):
        cleaned_line = line.replace(',NA,',',NM,')  # replace the country code of Namibia
        clean_lines += [cleaned_line]

In [5]:
clean = open('PersonalityCleanedData.csv', 'w')
clean.write( ''.join(clean_lines) )
clean.close()

Personality = pd.DataFrame.from_csv('PersonalityCleanedData.csv').reset_index()

Personality.shape

(46828, 169)

In [6]:
Personality = Personality[ Personality.age <= 90 ]       # omit very high age values
Personality = Personality[ Personality.gender >= 1 ]
Personality = Personality[ Personality.gender <= 2 ]     # omit gender codes 0 and 3 (see below)
Personality = Personality[ Personality.elapsed < 5000 ]  # omit questionnaire results taking long than 5000 seconds
Personality = Personality[ Personality.accuracy > 80 ]   # omit accuracy values below 80
Personality = Personality[ Personality.accuracy <= 100 ] # omit accuracy values above 100

Personality.shape

(34798, 169)

In [7]:
print( Personality[Personality.age > 80].shape )


(6, 169)


In [8]:
Personality.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B1,B2,B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,B13,C1,C2,...,O2,O3,O4,O5,O6,O7,O8,O9,O10,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,age,gender,accuracy,country,source,elapsed
0,1,4,2,3,3,2,3,4,4,3,4,4,5,4,5,4,5,4,1,2,1,1,1,4,5,...,2,3,4,3,2,2,4,4,4,5,5,5,4,4,5,1,2,5,2,17,1,92,US,6,914
1,4,3,4,3,4,4,4,4,2,2,4,4,4,4,5,4,3,2,3,2,4,1,1,1,2,...,2,4,2,3,2,2,3,4,4,4,4,4,2,3,2,3,2,2,2,37,1,100,US,1,891
3,4,5,4,4,4,3,3,2,2,2,4,2,4,5,4,5,4,4,3,3,3,2,4,3,2,...,2,2,2,3,4,5,4,5,4,4,2,2,2,3,4,2,3,4,4,32,1,93,US,1,806
4,4,0,4,4,4,3,5,1,2,4,2,4,4,5,5,4,4,5,4,1,5,1,2,2,4,...,3,5,3,4,2,1,2,1,4,4,3,4,3,4,3,4,2,3,2,46,2,87,NZ,1,1826
7,4,5,4,4,4,4,5,2,2,2,3,3,4,5,4,4,4,5,4,2,3,1,1,3,4,...,3,4,4,4,4,2,4,4,3,2,3,2,2,3,2,1,4,4,3,61,2,100,US,1,567


In [8]:
P = Personality

Warmth = P.A1 + P.A2 + P.A3 + P.A4 + P.A5 + P.A6 + P.A7 - P.A8 - P.A9 - P.A10
Intellect = P.B1 + P.B2 + P.B3 + P.B4 + P.B5 + P.B6 + P.B7 + P.B8 - P.B9 - P.B10 - P.B11 - P.B12 - P.B13
Stability = P.C1 + P.C2 + P.C3 + P.C4 + P.C5 - P.C6 - P.C7 - P.C8 - P.C9 - P.C10
Assertiveness = P.D1 + P.D2 + P.D3 + P.D4 + P.D5 + P.D6 - P.D7 - P.D8 - P.D9 - P.D10
Gregariousness = P.E1 + P.E2 + P.E3 + P.E4 + P.E5 + P.E6 - P.E7 - P.E8 - P.E9 - P.E10
Dutifulness = P.F1 + P.F2 + P.F3 + P.F4 + P.F5 - P.F6 - P.F7 - P.F8 - P.F9 - P.F10
Friendliness = P.G1 + P.G2 + P.G3 + P.G4 + P.G5 - P.G6 - P.G7 - P.G8 - P.G9 - P.G10
Sensitivity = P.H1 + P.H2 + P.H3 + P.H4 + P.H5 + P.H6 - P.H7 - P.H8 - P.H9 - P.H10
Distrust = P.I1 + P.I2 + P.I3 + P.I4 + P.I5 - P.I6 - P.I7 - P.I8 - P.I9 - P.I10
Imagination = P.J1 + P.J2 + P.J3 + P.J4 + P.J5 + P.J6 + P.J7 - P.J8 - P.J9 - P.J10
Reserve = P.K1 + P.K2 + P.K3 + P.K4 + P.K5 - P.K6 - P.K7 - P.K8 - P.K9 - P.K10
Anxiety = P.L1 + P.L2 + P.L3 + P.L4 + P.L5 + P.L6 + P.L7 - P.L8 - P.L9 - P.L10
Complexity = P.M1 + P.M2 + P.M3 + P.M4 + P.M5 - P.M6 - P.M7 - P.M8 - P.M9 - P.M10
Introversion = P.N1 + P.N2 + P.N3 + P.N4 + P.N5 + P.N6 + P.N7 - P.N8 - P.N9 - P.N10
Orderliness = P.O1 + P.O2 + P.O3 + P.O4 + P.O5 - P.O6 - P.O7 - P.O8 - P.O9 - P.O10
Emotionality = P.P1 + P.P2 + P.P3 + P.P4 + P.P5 + P.P6 + P.P7 - P.P8 - P.P9 - P.P10

Personality['Warmth'] = Warmth
Personality['Intellect'] = Intellect
Personality['Stability'] = Stability
Personality['Assertiveness'] = Assertiveness
Personality['Gregariousness'] = Gregariousness
Personality['Dutifulness'] = Dutifulness
Personality['Friendliness'] = Friendliness
Personality['Sensitivity'] = Sensitivity
Personality['Distrust'] = Distrust
Personality['Imagination'] = Imagination
Personality['Reserve'] = Reserve
Personality['Anxiety'] = Anxiety
Personality['Complexity'] = Complexity
Personality['Introversion'] = Introversion
Personality['Orderliness'] = Orderliness
Personality['Emotionality'] = Emotionality

In [10]:
Personality.shape


(34798, 185)

In [11]:
Personality

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B1,B2,B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,B13,C1,C2,...,P8,P9,P10,age,gender,accuracy,country,source,elapsed,Warmth,Intellect,Stability,Assertiveness,Gregariousness,Dutifulness,Friendliness,Sensitivity,Distrust,Imagination,Reserve,Anxiety,Complexity,Introversion,Orderliness,Emotionality
0,1,4,2,3,3,2,3,4,4,3,4,4,5,4,5,4,5,4,1,2,1,1,1,4,5,...,2,5,2,17,1,92,US,6,914,7,29,3,13,7,-9,-1,14,2,31,-4,15,12,19,0,20
1,4,3,4,3,4,4,4,4,2,2,4,4,4,4,5,4,3,2,3,2,4,1,1,1,2,...,2,2,2,37,1,100,US,1,891,18,19,-8,13,5,2,4,14,7,21,3,15,9,23,-1,16
3,4,5,4,4,4,3,3,2,2,2,4,2,4,5,4,5,4,4,3,3,3,2,4,3,2,...,3,4,4,32,1,93,US,1,806,21,17,1,4,10,-4,3,9,2,30,2,15,7,27,-11,8
4,4,0,4,4,4,3,5,1,2,4,2,4,4,5,5,4,4,5,4,1,5,1,2,2,4,...,2,3,2,46,2,87,NZ,1,1826,17,20,-1,19,-2,6,7,10,3,9,4,19,11,18,8,18
7,4,5,4,4,4,4,5,2,2,2,3,3,4,5,4,4,4,5,4,2,3,1,1,3,4,...,4,4,3,61,2,100,US,1,567,24,21,5,12,1,6,0,17,-1,4,0,19,12,20,2,4
9,5,5,3,3,5,4,4,3,1,3,4,4,5,5,5,3,4,5,2,1,2,1,2,4,4,...,4,4,3,19,1,95,US,6,1835,22,27,12,11,15,12,10,6,-1,14,0,6,7,14,9,3
10,5,5,4,5,5,5,4,5,2,3,4,3,5,5,4,4,5,4,5,2,2,4,2,2,5,...,3,4,5,21,1,95,US,6,770,23,19,11,8,13,10,3,4,2,12,3,18,2,19,10,13
11,5,4,4,4,5,5,5,3,2,1,4,3,5,5,4,3,4,4,5,2,3,3,4,3,5,...,4,4,4,23,2,82,US,6,922,26,15,13,18,17,6,13,6,-6,12,-10,7,6,10,-1,2
12,5,4,5,4,4,4,3,3,2,4,3,3,4,3,4,4,2,4,1,2,1,1,1,3,5,...,2,2,2,17,2,90,US,1,431,20,21,5,20,6,-2,12,18,5,21,-16,13,3,10,8,19
13,5,4,5,4,4,4,3,1,2,4,4,5,5,5,5,4,5,4,3,5,2,3,2,3,4,...,3,5,5,24,2,90,TR,1,932,22,22,-1,11,9,-8,5,10,-3,10,-13,26,13,22,0,17


In [12]:
P

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,B1,B2,B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,B13,C1,C2,...,P8,P9,P10,age,gender,accuracy,country,source,elapsed,Warmth,Intellect,Stability,Assertiveness,Gregariousness,Dutifulness,Friendliness,Sensitivity,Distrust,Imagination,Reserve,Anxiety,Complexity,Introversion,Orderliness,Emotionality
0,1,4,2,3,3,2,3,4,4,3,4,4,5,4,5,4,5,4,1,2,1,1,1,4,5,...,2,5,2,17,1,92,US,6,914,7,29,3,13,7,-9,-1,14,2,31,-4,15,12,19,0,20
1,4,3,4,3,4,4,4,4,2,2,4,4,4,4,5,4,3,2,3,2,4,1,1,1,2,...,2,2,2,37,1,100,US,1,891,18,19,-8,13,5,2,4,14,7,21,3,15,9,23,-1,16
3,4,5,4,4,4,3,3,2,2,2,4,2,4,5,4,5,4,4,3,3,3,2,4,3,2,...,3,4,4,32,1,93,US,1,806,21,17,1,4,10,-4,3,9,2,30,2,15,7,27,-11,8
4,4,0,4,4,4,3,5,1,2,4,2,4,4,5,5,4,4,5,4,1,5,1,2,2,4,...,2,3,2,46,2,87,NZ,1,1826,17,20,-1,19,-2,6,7,10,3,9,4,19,11,18,8,18
7,4,5,4,4,4,4,5,2,2,2,3,3,4,5,4,4,4,5,4,2,3,1,1,3,4,...,4,4,3,61,2,100,US,1,567,24,21,5,12,1,6,0,17,-1,4,0,19,12,20,2,4
9,5,5,3,3,5,4,4,3,1,3,4,4,5,5,5,3,4,5,2,1,2,1,2,4,4,...,4,4,3,19,1,95,US,6,1835,22,27,12,11,15,12,10,6,-1,14,0,6,7,14,9,3
10,5,5,4,5,5,5,4,5,2,3,4,3,5,5,4,4,5,4,5,2,2,4,2,2,5,...,3,4,5,21,1,95,US,6,770,23,19,11,8,13,10,3,4,2,12,3,18,2,19,10,13
11,5,4,4,4,5,5,5,3,2,1,4,3,5,5,4,3,4,4,5,2,3,3,4,3,5,...,4,4,4,23,2,82,US,6,922,26,15,13,18,17,6,13,6,-6,12,-10,7,6,10,-1,2
12,5,4,5,4,4,4,3,3,2,4,3,3,4,3,4,4,2,4,1,2,1,1,1,3,5,...,2,2,2,17,2,90,US,1,431,20,21,5,20,6,-2,12,18,5,21,-16,13,3,10,8,19
13,5,4,5,4,4,4,3,1,2,4,4,5,5,5,5,4,5,4,3,5,2,3,2,3,4,...,3,5,5,24,2,90,TR,1,932,22,22,-1,11,9,-8,5,10,-3,10,-13,26,13,22,0,17


In [9]:
p = Personality.shape[1]  # number of columns

PersonalityScores = Personality.iloc[:, (p-16):]  # the last 16 columns of the Personality data frame

k=3   #  as an example to get you started, assume there are 3 different personality types (classes)

import sklearn
from sklearn import cluster

kMeans = cluster.KMeans( n_clusters = k , n_init = 50, max_iter = 300 )  # 50 random restarts

kMeans.fit( PersonalityScores )

print( kMeans.inertia_ )

print( kMeans.cluster_centers_ )

19774658.8827
[[ 19.91725601  15.26536225   1.31926764  10.05657709  10.5948452
    2.05987742   2.95426686  10.05029074   0.35643564  16.33765519
   -2.19668395  16.52176646   6.00589345  13.32123212   0.99300644
   11.96188905]
 [ 14.17129962  18.93445482  -1.4842548    8.3888604    3.93322392
   -1.14678429  -7.64119397  11.47430506   4.86901221  19.66242692
    7.77105344  16.81064725   8.23233152  22.2388963    0.56723767
   13.48158786]
 [ 22.5285239   22.20335957  10.06556845  15.14355271  12.21025724
    3.25115637   8.87503043  12.19516352  -2.82423111  15.61015986
   -5.70396819   7.56812464  10.56674511  12.62582163   3.41913495
    5.56877384]]


In [11]:
from sklearn import cluster
P=PersonalityScores.as_matrix()

In [12]:
P.shape

(34798, 16)

In [35]:

def gap_statistic(P,k):
    centroids, labels, inertia = cluster.k_means(P, n_clusters=k)
    cluster_sizes = [ labels.tolist().count(j) for j in range(k) ]
    n, p = P.shape
    gap_value = sum( [ sum((P[i,:] - centroids[labels[i]])**2)/cluster_sizes[labels[i]] for i in range(n)] ) / 2.0
    return(gap_value)

best_gap = 1
best_k = 1
for k in range(2, 20):
    gap_value = gap_statistic(P,k)
    print( 'gap value for %d clusters is %6.3f' % (k, gap_value) )
    if best_gap > gap_value:
       best_gap = gap_value
       best_k = k

print( 'best value is %6.3f, for %d clusters' % (best_gap, best_k) )
#from the result we can see that we cannot find the label using k-means method 

iris dataset gap value for 2 clusters is 621.220
iris dataset gap value for 3 clusters is 864.944
iris dataset gap value for 4 clusters is 1077.417
iris dataset gap value for 5 clusters is 1290.048
iris dataset gap value for 6 clusters is 1488.533
iris dataset gap value for 7 clusters is 1674.626
iris dataset gap value for 8 clusters is 1866.128
iris dataset gap value for 9 clusters is 2048.169
iris dataset gap value for 10 clusters is 2232.048
iris dataset gap value for 11 clusters is 2412.293
iris dataset gap value for 12 clusters is 2590.715
iris dataset gap value for 13 clusters is 2766.663
iris dataset gap value for 14 clusters is 2930.078
iris dataset gap value for 15 clusters is 3110.147
iris dataset gap value for 16 clusters is 3283.711
iris dataset gap value for 17 clusters is 3460.104
iris dataset gap value for 18 clusters is 3625.836
iris dataset gap value for 19 clusters is 3788.286
best value is  1.000, for 1 clusters


In [40]:
cluster.k_means(P, n_clusters=2)

(array([[ 21.9303317 ,  19.99890818,   7.60257877,  13.81501508,
          12.14557554,   2.89211812,   7.73863991,  11.3881148 ,
          -1.97769575,  15.83534366,  -5.15904128,  10.2964542 ,
           9.13866071,  12.38203182,   2.64755121,   7.53311844],
        [ 15.89784117,  17.20714469,  -1.27672835,   8.39494988,
           5.7847597 ,  -0.03398869,  -4.90722179,  10.98721408,
           3.54921614,  18.46498329,   4.9308661 ,  17.30679774,
           7.14013107,  19.51715497,   0.60254433,  13.32504498]]),
 array([1, 1, 1, ..., 1, 0, 0], dtype=int32),
 21414717.793149646)

In [41]:
cluster.k_means(P, n_clusters=10)

(array([[ 17.42873887,  11.60806376,  -0.02508204,   6.58110642,
           6.82794187,   4.5302391 ,  -2.33052039,   8.33309892,
           0.40600094,  13.61837787,   1.45850914,  17.64861697,
           2.32231599,  14.70253165,   1.73511486,  12.42897328],
        [ 25.07796102,  23.56131934,  12.62068966,  16.58590705,
          13.92173913,   7.33733133,  12.44947526,  13.37631184,
          -5.50494753,  12.96551724,  -9.27286357,   5.15562219,
          11.59670165,   9.90224888,   6.03808096,   2.48635682],
        [ 23.4260184 ,  18.93219448,   0.90407359,  12.71931669,
          13.57950066,   3.00814717,   8.11563732,  13.73272011,
          -0.58002628,  17.72641261,  -7.98896189,  18.70906702,
           9.26544021,  11.75321945,   2.58107753,  12.28725361],
        [ 19.90630048,  20.73532579,   5.08373721,   9.27517501,
           5.32148627,   2.9084545 ,  -3.37102854,  13.89310716,
          -0.19709208,  19.07673667,   4.11308562,  13.49084545,
          10.4189553 ,

In [14]:
centroids, labels, inertia=cluster.k_means(P, n_clusters=20)

In [47]:
best_inertia = 21414717.793149646
best_k = 1
for k in range(2, 21):
    centroids, labels, inertia = cluster.k_means(P, n_clusters=k)
    print( 'inertia value for %d clusters is %6.3f' % (k, inertia) )
    if best_gap > inertia:
       best_gap = inertia
       best_k = k

inertia value for 2 clusters is 21414705.487
inertia value for 3 clusters is 19774727.499
inertia value for 4 clusters is 18507813.348
inertia value for 5 clusters is 17499229.870
inertia value for 6 clusters is 16851574.094
inertia value for 7 clusters is 16284168.011
inertia value for 8 clusters is 15860198.203
inertia value for 9 clusters is 15494154.103
inertia value for 10 clusters is 15171656.535
inertia value for 11 clusters is 14893247.374
inertia value for 12 clusters is 14648397.690
inertia value for 13 clusters is 14431203.524
inertia value for 14 clusters is 14216467.087
inertia value for 15 clusters is 14041939.050
inertia value for 16 clusters is 13885510.774
inertia value for 17 clusters is 13739796.471
inertia value for 18 clusters is 13601563.082
inertia value for 19 clusters is 13473230.311
inertia value for 20 clusters is 13349812.308


In [16]:
'''
inertia value for 2 clusters is 21414705.487
inertia value for 3 clusters is 19774727.499
inertia value for 4 clusters is 18507813.348
inertia value for 5 clusters is 17499229.870
inertia value for 6 clusters is 16851574.094
inertia value for 7 clusters is 16284168.011
inertia value for 8 clusters is 15860198.203
inertia value for 9 clusters is 15494154.103
inertia value for 10 clusters is 15171656.535
inertia value for 11 clusters is 14893247.374
inertia value for 12 clusters is 14648397.690
inertia value for 13 clusters is 14431203.524
inertia value for 14 clusters is 14216467.087
inertia value for 15 clusters is 14041939.050
inertia value for 16 clusters is 13885510.774
inertia value for 17 clusters is 13739796.471
inertia value for 18 clusters is 13601563.082
inertia value for 19 clusters is 13473230.311
inertia value for 20 clusters is 13349812.308

the best labels are when k=20
'''

PersonalityClassLabels=labels
PersonalityCluster=20

In [24]:
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier(random_state=0)
scores = cross_validation.cross_val_score(DTC, P, PersonalityClassLabels, cv=10)
print(scores.min(), scores.mean(), scores.max())

(0.55259386643737463, 0.56954717162195068, 0.58985382631126393)


In [21]:
from sklearn.qda import QDA
from sklearn import cross_validation

clf_qda.fit(P, PersonalityClassLabels)
#scores = cross_val_score(clf_qda, P, PersonalityClassLabels, cv=10, n_jobs=4,
 #                        scoring='accuracy', )
#print(scores.min(), scores.mean(), scores.max())
#cvKF = cross_validation.KFold(len(communications.target), n_folds=3, shuffle=True)
clf_qda = QDA()
scores = cross_validation.cross_val_score(clf_qda, P, PersonalityClassLabels, cv=10)
print(scores.min(), scores.mean(), scores.max())

(0.83548294640298082, 0.84623316879787835, 0.85664939550949915)


In [23]:
from sklearn.lda import LDA
clf_lda = LDA()
clf_lda.fit(P, PersonalityClassLabels)


scores = cross_validation.cross_val_score(clf_lda, P, PersonalityClassLabels, cv=10)
print(scores.min(), scores.mean(), scores.max())

(0.91430209229005444, 0.92083162674335828, 0.92807825086306095)


In [26]:
from sklearn.neighbors import NearestNeighbors

example_number_of_neighbors = 4

kNN = NearestNeighbors( n_neighbors = example_number_of_neighbors, algorithm='ball_tree')

#kNN.fit( PersonalityScores )  # unsupervised version!!
kNN.fit( PersonalityScores, PersonalityClassLabels )  # supervised version

distances, indices = kNN.kneighbors( PersonalityScores )
scores = cross_validation.cross_val_score(kNN, P, PersonalityClassLabels, cv=10)
print(scores.min(), scores.mean(), scores.max())


In [None]:
from sklearn import svm
clf_svm = svm.SVC()
scores = cross_validation.cross_val_score(clf_svm, P, PersonalityClassLabels, cv=10,scoring='accuracy')
print(scores.min(), scores.mean(), scores.max())
