In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df1 = pd.read_pickle('namus_html1.pkl')
df2 = pd.read_pickle('namus_html2.pkl')

In [3]:
dfs = [df1, df2]
namus = pd.concat(dfs)
print(namus.shape)

(10529, 72)


In [4]:
namus['rating'] = namus.case_rating.map({'Extremely Low':0, 'Low':1, 'Low - Medium': 2,'Medium':3, 'Medium - High':4,
                                        'High':5, })

In [5]:
namus.amputations_description.unique()

array(['NA',
       'remote amputations of left left middle and ring finger at middle joint.',
       'Remote amputation of the left little (pinkie) finger',
       'The left index finger had been remotely amputated distal to the metacarpal, phalangeal joint. Well healed',
       'The left hand second finger is amputaed at the PIP joint, the amputation appeard very old.',
       'Amputation of phalanges of second digit of right hand',
       'The 2nd through 5th toes of her left foot appear to have been missing long before death.  It is unknown if this was a congenital condition, or if she suffered some kind of injury requiring surgery.  But, the toes were missing before she was murdered and w',
       'Both feet had been surgically amputated.  He wore specially designed shoes to accomodate this disability.',
       'Healed remote amputation of tip of left 5th finger', 'Left leg',
       'Remote amputation of 3rd and 4th fingers on left hand', 'finger',
       'Toe, unspecific',
      

In [6]:
namus[namus.other_items_with_body!= 'NA'].rating.value_counts()

3    3868
4    3396
2    1312
1     962
0     526
5     463
Name: rating, dtype: int64

In [7]:
namus.hair_color.unique()

array(['', 'Brown', 'Unknown or Completely Bald', 'Gray or Partially Gray',
       'Black', 'Blond/Strawberry', 'Red/Auburn', 'White', 'Sandy',
       'Purple'], dtype=object)

In [8]:
namus.head_hair.unique()

array(['', 'brown-black, appears to be styled in "corn rolls"',
       'Light brownish-blonde, 11"', ..., 'Light brown.',
       '1-2" red hair located with the remains',
       'Short hair, possible a "buzz cut"'], dtype=object)

In [9]:
namus.body_hair.unique()

array(['', 'Light brownish-blonde pubic hair', 'gray', 'n/a',
       'straight, reddish brown', 'black', 'brown', 'reddish-brown', 'N/A',
       'Curly gray', 'Pubic hair- light brown and curly.', 'Brown',
       'Black', 'None - Skeletal Remains', 'Unknown',
       'Black-grey pubic hair.', 'unknown', 'BLACK', 'Lost to decomposed',
       'not listed', 'blonde appearing', 'black pubic hair',
       'Dark brown pubic hair.', 'He had very little chest hair.',
       'Skin is hairy with very light hair.', 'Black.',
       'Body hair appeared to be brown or somewhat lighter.',
       'Black, abundant', 'unremarkable', 'hairy chest and arms',
       'pubic hair dark with sonme gray', 'dark brown hair in underarms',
       'black, tightly curled, gray highlights, 0.5 inches in length',
       'Indiscernible', 'dark black heavy curled',
       'Tatoo posterior left shoulder"ZDM" in diamond shape, anterior left shoulder Tatoo of snake with womans head and words "ASI Erestu" above womans head.

In [10]:
namus.facial_hair.unique()

array(['', 'N/A', 'gray, shaggy beard and moustache', ...,
       'Unshaven stubble', 'unknown/skeletal',
       'Unknown/information not available'], dtype=object)

In [11]:
namus[namus.sex != 'NA'].rating.value_counts()

3    3870
4    3396
2    1312
1     962
0     526
5     463
Name: rating, dtype: int64

## Let's try to define a basic model

In [12]:
nlp_features = ['scars_and_marks_description',
             'skeletal_findings_description',
             'clothing_on_body',
             'clothing_with_body',
             'footwear',
             'jewelry',
             'eyewear',
             'other_items_with_body',
             'hair_color',
             'head_hair',
             'body_hair',
             'facial_hair',
             'eye_description',
             'race',
             'sex',
             'circumstances']

In [13]:
# define X and y using the original DataFrame
X = namus.scars_and_marks_description
y = namus.rating

In [14]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [15]:
# import and instantiate the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [16]:
# fit and transform X_train, but only transform X_test
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [17]:
print(X_test_dtm.shape)
print(X_train_dtm.shape)

(2633, 1236)
(7896, 1236)


In [18]:
# import/instantiate/fit
from sklearn.naive_bayes import MultinomialNB, GaussianNB
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
# fit a Naive Bayes model
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# make class predictions
y_pred_class = nb.predict(X_test_dtm)

In [21]:
from sklearn import metrics

# calculate the testing accuary
print(metrics.accuracy_score(y_test, y_pred_class))

0.376756551462


In [114]:
def nlp_accuracy(feature_col):
    X = namus[feature_col]
    y = namus.rating
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    return metrics.confusion_matrix(y_test, y_pred_class), metrics.accuracy_score(y_test, y_pred_class)

In [115]:
nlp_accuracy('eyewear')

(array([[  0,   0,   0, 131,   1,   0],
        [  0,   0,   0, 254,   0,   0],
        [  0,   0,   0, 313,   4,   0],
        [  0,   0,   0, 962,   8,   0],
        [  0,   0,   0, 837,   6,   0],
        [  0,   0,   0, 115,   2,   0]]), 0.3676414736042537)

In [132]:
nlp_accuracy('sex') # -> BETTER AS BINARY (M/F = 1; UNSURE/'' = 0)

(array([[  0,   0,   0, 132,   0,   0],
        [  0,   0,   0, 254,   0,   0],
        [  0,   0,   0, 317,   0,   0],
        [  0,   0,   0, 970,   0,   0],
        [  0,   0,   0, 843,   0,   0],
        [  0,   0,   0, 117,   0,   0]]), 0.36840106342575007)

In [117]:
nlp_accuracy('other_items_with_body')

(array([[  0,   0,   0, 129,   3,   0],
        [  0,   1,   0, 247,   6,   0],
        [  0,   1,   0, 297,  19,   0],
        [  0,   0,   5, 909,  56,   0],
        [  0,   0,   0, 781,  62,   0],
        [  0,   0,   0,  92,  25,   0]]), 0.36916065324724651)

In [118]:
nlp_accuracy('jewelry')

(array([[  0,   0,   0, 121,  11,   0],
        [  0,   0,   0, 252,   2,   0],
        [  0,   0,   0, 296,  21,   0],
        [  0,   0,   1, 916,  53,   0],
        [  0,   0,   0, 783,  60,   0],
        [  0,   0,   1, 103,  13,   0]]), 0.37067983289023926)

In [119]:
nlp_accuracy('clothing_on_body')

(array([[  0,   0,   1, 119,  12,   0],
        [  0,   3,   0, 245,   6,   0],
        [  0,   0,   5, 265,  47,   0],
        [  1,   4,   4, 823, 138,   0],
        [  0,   0,   3, 690, 143,   7],
        [  0,   0,   0,  77,  35,   5]]), 0.37181921762248388)

In [120]:
nlp_accuracy('footwear')

(array([[  0,   0,   0, 128,   4,   0],
        [  0,   0,   0, 251,   3,   0],
        [  0,   1,   0, 288,  28,   0],
        [  0,   0,   4, 867,  99,   0],
        [  0,   0,   1, 726, 116,   0],
        [  0,   1,   0,  90,  24,   2]]), 0.37409798708697306)

In [121]:
nlp_accuracy('scars_and_marks_description')

(array([[  0,   0,   4, 126,   1,   1],
        [  0,   0,   4, 250,   0,   0],
        [  0,   0,  68, 243,   4,   2],
        [  2,   0,  39, 898,  15,  16],
        [  0,   0,  36, 784,  14,   9],
        [  1,   0,   8,  95,   1,  12]]), 0.37675655146221043)

In [122]:
nlp_accuracy('skeletal_findings_description')

(array([[  0,   0,   0, 128,   4,   0],
        [  0,   0,   0, 248,   6,   0],
        [  0,   1,   1, 303,  12,   0],
        [  0,   0,   1, 853, 115,   1],
        [  0,   1,   0, 690, 151,   1],
        [  0,   0,   0, 100,  17,   0]]), 0.38169388530193693)

In [123]:
nlp_accuracy('clothing_with_body')

(array([[  0,   0,   0, 125,   7,   0],
        [  0,   0,   0, 252,   2,   0],
        [  0,   0,   0, 290,  27,   0],
        [  1,   0,   2, 898,  69,   0],
        [  0,   0,   1, 728, 114,   0],
        [  0,   0,   0, 104,  13,   0]]), 0.3843524496771743)

In [124]:
nlp_accuracy('hair_color') # -> this should be binarized!

(array([[  0,   0,   0, 111,  21,   0],
        [  0,   0,   0, 168,  86,   0],
        [  0,   0,   0, 264,  53,   0],
        [  0,   0,   0, 732, 238,   0],
        [  0,   0,   0, 561, 282,   0],
        [  0,   0,   0, 112,   5,   0]]), 0.38511203949867073)

In [125]:
nlp_accuracy('eye_description')

(array([[  0,   0,   0,  91,  41,   0],
        [  0,   2,   3, 212,  37,   0],
        [  0,   1,   7, 281,  28,   0],
        [  0,   7,   9, 806, 148,   0],
        [  0,   4,   7, 623, 209,   0],
        [  0,   0,   1,  96,  20,   0]]), 0.38890998860615267)

In [126]:
nlp_accuracy('head_hair')

(array([[ 14,   0,   1, 107,  10,   0],
        [ 12,   0,   5, 198,  39,   0],
        [  5,   1,  31, 221,  58,   1],
        [ 34,   0,  23, 771, 141,   1],
        [ 26,   1,  12, 581, 222,   1],
        [  0,   0,   2,  82,  27,   6]]), 0.39650588682111659)

In [127]:
nlp_accuracy('body_hair')

(array([[  0,   0,   1, 127,   4,   0],
        [  0,   1,   0, 248,   5,   0],
        [  0,   1,   0, 305,  11,   0],
        [  0,   2,   1, 912,  54,   1],
        [  0,   0,   0, 720, 122,   1],
        [  0,   0,   0, 100,  16,   1]]), 0.39346752753513103)

In [128]:
nlp_accuracy('facial_hair')

(array([[  0,   0,   1, 129,   2,   0],
        [  0,   3,   1, 244,   6,   0],
        [  0,   2,   5, 295,  15,   0],
        [  2,   0,   6, 900,  62,   0],
        [  0,   1,   2, 710, 130,   0],
        [  0,   0,   1, 101,  15,   0]]), 0.39422711735662741)

In [129]:
nlp_accuracy('race')

(array([[  0,   0,   0, 128,   4,   0],
        [  0,   0,   0, 244,  10,   0],
        [  0,   0,   0, 295,  22,   0],
        [  0,   0,   0, 887,  83,   0],
        [  0,   0,   0, 683, 160,   0],
        [  0,   0,   0,  82,  35,   0]]), 0.39764527155336121)

In [130]:
nlp_accuracy('circumstances')

(array([[ 21,   1,   0, 104,   6,   0],
        [  3,   6,  35, 152,  58,   0],
        [  4,   0, 131, 139,  43,   0],
        [  9,   8,  67, 605, 277,   4],
        [  5,   3,  39, 344, 452,   0],
        [  1,   1,   5,  56,  51,   3]]), 0.46259020129130268)

**Need a null model - need to know whether to potentially ensemble some of these features...Is this right?!**

In [53]:
import numpy as np
# define X and y using the original DataFrame
X = namus.circumstances
y = namus.rating

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# calculate null accuracy
y_test_binary = np.where(y_test==5, 1, 0)
max(y_test_binary.mean(), 1 - y_test_binary.mean())

0.95556399544246107

**Lets make a joint feature document term matrix...**

In [165]:
# define X and y
features = ['circumstances', 'race', 'facial_hair', 'body_hair', 'head_hair', 'scars_and_marks_description']
X = namus[features]
y = namus.rating

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [166]:
print(X_train.shape)
print(X_test.shape)

(7896, 6)
(2633, 6)


In [167]:
vect = CountVectorizer()

In [139]:
headhair_train_dtm = vect.fit_transform(X_train.head_hair)
headhair_test_dtm = vect.transform(X_test.head_hair)

In [143]:
headhair_train_dtm.shape

(7896, 945)

In [145]:
circum_train_dtm = vect.fit_transform(X_train.circumstances)
circum_test_dtm = vect.transform(X_test.circumstances)

In [146]:
circum_train_dtm.shape

(7896, 11921)

In [149]:
import scipy as sp

# combine sparse matrices
X_train_dtm = sp.sparse.hstack((circum_train_dtm, headhair_train_dtm))
X_test_dtm = sp.sparse.hstack((circum_test_dtm, headhair_test_dtm))
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 12866)
(2633, 12866)


In [218]:
def extra_features(X_train_dtm, X_test_dtm, y_train):    
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    return metrics.confusion_matrix(y_test, y_pred_class), metrics.accuracy_score(y_test, y_pred_class)

In [153]:
extra_features(X_train_dtm, X_test_dtm, y_train)

(array([[ 24,   1,   0,  99,   8,   0],
        [  3,   8,  34, 149,  60,   0],
        [  4,   0, 132, 134,  46,   1],
        [ 10,   5,  55, 634, 262,   4],
        [  5,   2,  37, 345, 452,   2],
        [  1,   0,   6,  58,  46,   6]]), 0.47702240789973416)

In [168]:
bodyhair_train_dtm = vect.fit_transform(X_train.body_hair)
bodyhair_test_dtm = vect.transform(X_test.body_hair)

facialhair_train_dtm = vect.fit_transform(X_train.facial_hair)
facialhair_test_dtm = vect.transform(X_test.facial_hair)

race_train_dtm = vect.fit_transform(X_train.race)
race_test_dtm = vect.transform(X_test.race)

smarks_train_dtm = vect.fit_transform(X_train.scars_and_marks_description)
smarks_test_dtm = vect.transform(X_test.scars_and_marks_description)

In [155]:
X_train_dtm = sp.sparse.hstack((circum_train_dtm, headhair_train_dtm, race_train_dtm, bodyhair_train_dtm, facialhair_train_dtm))
X_test_dtm = sp.sparse.hstack((circum_test_dtm, headhair_test_dtm, race_test_dtm, bodyhair_test_dtm, facialhair_test_dtm))
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 13629)
(2633, 13629)


In [156]:
extra_features(X_train_dtm, X_test_dtm, y_train)

(array([[ 22,   1,   0, 101,   8,   0],
        [  2,   9,  33, 162,  48,   0],
        [  4,   1, 133, 127,  51,   1],
        [ 11,   6,  49, 648, 252,   4],
        [  4,   2,  38, 360, 435,   4],
        [  1,   0,   4,  60,  47,   5]]), 0.47550322825674135)

In [163]:
X_train_dtm = sp.sparse.hstack((circum_train_dtm, headhair_train_dtm))
X_test_dtm = sp.sparse.hstack((circum_test_dtm, headhair_test_dtm))
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 12866)
(2633, 12866)


In [164]:
extra_features(X_train_dtm, X_test_dtm, y_train)

(array([[ 24,   1,   0,  99,   8,   0],
        [  3,   8,  34, 149,  60,   0],
        [  4,   0, 132, 134,  46,   1],
        [ 10,   5,  55, 634, 262,   4],
        [  5,   2,  37, 345, 452,   2],
        [  1,   0,   6,  58,  46,   6]]), 0.47702240789973416)

In [169]:
X_train_dtm = sp.sparse.hstack((circum_train_dtm, headhair_train_dtm, smarks_train_dtm))
X_test_dtm = sp.sparse.hstack((circum_test_dtm, headhair_test_dtm, smarks_test_dtm))
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 14102)
(2633, 14102)


In [170]:
extra_features(X_train_dtm, X_test_dtm, y_train)

(array([[ 20,   1,   4,  98,   8,   1],
        [  2,   9,  32, 141,  70,   0],
        [  4,   0, 129, 129,  52,   3],
        [ 10,   7,  58, 602, 281,  12],
        [  6,   2,  47, 326, 459,   3],
        [  1,   0,  12,  47,  47,  10]]), 0.46676794530953286)

In [171]:
X_train_dtm

<7896x14102 sparse matrix of type '<class 'numpy.int64'>'
	with 189464 stored elements in COOrdinate format>

**Try with vect(stop words and max things)**

In [172]:
vect = CountVectorizer(stop_words='english')
headhair_train_dtm = vect.fit_transform(X_train.head_hair)
headhair_test_dtm = vect.transform(X_test.head_hair)
circum_train_dtm = vect.fit_transform(X_train.circumstances)
circum_test_dtm = vect.transform(X_test.circumstances)
X_train_dtm = sp.sparse.hstack((circum_train_dtm, headhair_train_dtm))
X_test_dtm = sp.sparse.hstack((circum_test_dtm, headhair_test_dtm))
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 12497)
(2633, 12497)


In [173]:
extra_features(X_train_dtm, X_test_dtm, y_train)

(array([[ 23,   1,   0,  99,   9,   0],
        [  2,   8,  34, 149,  61,   0],
        [  4,   1, 134, 131,  47,   0],
        [ 10,   9,  53, 633, 262,   3],
        [  4,   3,  40, 344, 452,   0],
        [  1,   0,   6,  58,  45,   7]]), 0.47740220281048235)

**BETTER!!**

In [178]:
vect = CountVectorizer(stop_words='english', max_features=7000)
headhair_train_dtm = vect.fit_transform(X_train.head_hair)
headhair_test_dtm = vect.transform(X_test.head_hair)
circum_train_dtm = vect.fit_transform(X_train.circumstances)
circum_test_dtm = vect.transform(X_test.circumstances)
X_train_dtm = sp.sparse.hstack((circum_train_dtm, headhair_train_dtm))
X_test_dtm = sp.sparse.hstack((circum_test_dtm, headhair_test_dtm))
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 7831)
(2633, 7831)


In [179]:
extra_features(X_train_dtm, X_test_dtm, y_train)

(array([[ 26,   2,   0,  95,   9,   0],
        [  3,  22,  38, 131,  60,   0],
        [  9,   1, 150, 117,  40,   0],
        [ 15,  22,  74, 592, 259,   8],
        [  6,   6,  50, 332, 446,   3],
        [  2,   0,   9,  50,  39,  17]]), 0.47588302316748954)

In [201]:
vect = CountVectorizer(max_df=0.7)
headhair_train_dtm = vect.fit_transform(X_train.head_hair)
headhair_test_dtm = vect.transform(X_test.head_hair)
circum_train_dtm = vect.fit_transform(X_train.circumstances)
circum_test_dtm = vect.transform(X_test.circumstances)
X_train_dtm = sp.sparse.hstack((circum_train_dtm, headhair_train_dtm))
X_test_dtm = sp.sparse.hstack((circum_test_dtm, headhair_test_dtm))
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 12866)
(2633, 12866)


In [219]:
extra_features(X_train_dtm, X_test_dtm, y_train)

(array([[ 24,   1,   0,  99,   8,   0],
        [  3,   8,  34, 149,  60,   0],
        [  4,   0, 132, 134,  46,   1],
        [ 10,   5,  55, 634, 262,   4],
        [  5,   2,  37, 345, 452,   2],
        [  1,   0,   6,  58,  46,   6]]), 0.47702240789973416)

In [205]:
namus.state.unique()

array(['Georgia', 'Nevada', 'Florida', 'Arizona', 'Kentucky', 'New York',
       'Ohio', 'Tennessee', 'Minnesota', 'California', 'Washington',
       'Illinois', 'Texas', 'Pennsylvania', 'Colorado', 'Michigan',
       'Alabama', 'New Jersey', 'Louisiana', 'Indiana', 'Virginia',
       'Mississippi', 'District of Columbia', 'Maryland', 'New Mexico',
       'North Dakota', 'Oregon', 'North Carolina', 'Connecticut', 'Hawaii',
       'West Virginia', 'Delaware', 'New Hampshire', 'Kansas', 'Iowa',
       'Puerto Rico', 'South Carolina', 'Arkansas', 'Oklahoma', 'Idaho',
       'Wisconsin', 'Missouri', 'Utah', 'Massachusetts', 'Alaska',
       'Montana', 'Maine', 'Wyoming', 'Nebraska', 'Guam', 'South Dakota',
       'Vermont', 'Rhode Island', ''], dtype=object)

In [558]:
namus.hair_color.unique()

array(['', 'Brown', 'Unknown or Completely Bald', 'Gray or Partially Gray',
       'Black', 'Blond/Strawberry', 'Red/Auburn', 'White', 'Sandy',
       'Purple'], dtype=object)

## new dtm using items NamUs says they use for 2-star ratings

In [None]:
namus.tattoos_description

In [222]:
# define X and y
features2 = ['clothing_on_body', 'clothing_with_body', 'footwear', 'eyewear', 'jewelry', 'tattoos_description', 'scars_and_marks_description']
X = namus[features2]
y = namus.rating

# split into training and testing sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=1)

In [223]:
vect = CountVectorizer(stop_words='english')

clothing1_train_dtm = vect.fit_transform(X_train2.clothing_on_body)
clothing1_test_dtm = vect.transform(X_test2.clothing_on_body)

clothing2_train_dtm = vect.fit_transform(X_train2.clothing_with_body)
clothing2_test_drm = vect.transform(X_test2.clothing_with_body)

tattoos_train_dtm = vect.fit_transform(X_train2.tattoos_description)
tattoos_test_dtm = vect.transform(X_test2.tattoos_description)

footwear_train_dtm = vect.fit_transform(X_train2.footwear)
footwear_test_dtm = vect.transform(X_test2.footwear)

eyewear_train_dtm = vect.fit_transform(X_train2.eyewear)
eyewear_test_dtm = vect.transform(X_test2.eyewear)

jewelry_train_dtm = vect.fit_transform(X_train2.jewelry)
jewelry_test_dtm = vect.transform(X_test2.jewelry)

smarks_train_dtm = vect.fit_transform(X_train2.scars_and_marks_description)
smarks_test_dtm = vect.transform(X_test2.scars_and_marks_description)


X_train_dtm = sp.sparse.hstack((clothing1_train_dtm, clothing2_train_dtm, tattoos_train_dtm, footwear_train_dtm, eyewear_train_dtm, jewelry_train_dtm, smarks_train_dtm))
X_test_dtm = sp.sparse.hstack((clothing1_test_dtm, clothing2_test_drm, tattoos_test_dtm, footwear_test_dtm, eyewear_test_dtm, jewelry_test_dtm, smarks_test_dtm))

print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 14316)
(2633, 14316)


In [224]:
extra_features(X_train_dtm, X_test_dtm, y_train)

(array([[  0,   1,   4, 111,  16,   0],
        [  0,   0,   1, 245,   8,   0],
        [  0,   0,  48, 213,  56,   0],
        [  0,   3,  29, 744, 192,   2],
        [  0,   0,  19, 585, 238,   1],
        [  0,   0,   9,  78,  27,   3]]), 0.39232814280288641)

##Using OneVsRest on Multinomial Naive Bayes

In [225]:
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

In [226]:
def extra_features2(X_train_dtm, X_test_dtm, y_train):    
    nb = OneVsRestClassifier(MultinomialNB())
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    return metrics.confusion_matrix(y_test, y_pred_class), metrics.accuracy_score(y_test, y_pred_class)

In [227]:
extra_features2(X_train_dtm, X_test_dtm, y_train)

(array([[  0,   1,   4, 110,  17,   0],
        [  0,   0,   1, 245,   8,   0],
        [  0,   0,  54, 205,  58,   0],
        [  0,   2,  27, 745, 192,   4],
        [  0,   0,  21, 578, 240,   4],
        [  0,   0,  10,  77,  26,   4]]), 0.39612609191036841)

In [228]:
# define X and y
features = ['circumstances', 'race', 'facial_hair', 'body_hair', 'head_hair', 'scars_and_marks_description']
X = namus[features]
y = namus.rating

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [229]:
vect = CountVectorizer(stop_words='english')
headhair_train_dtm = vect.fit_transform(X_train.head_hair)
headhair_test_dtm = vect.transform(X_test.head_hair)
circum_train_dtm = vect.fit_transform(X_train.circumstances)
circum_test_dtm = vect.transform(X_test.circumstances)
X_train_dtm = sp.sparse.hstack((circum_train_dtm, headhair_train_dtm))
X_test_dtm = sp.sparse.hstack((circum_test_dtm, headhair_test_dtm))
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 12497)
(2633, 12497)


In [230]:
extra_features2(X_train_dtm, X_test_dtm, y_train)

(array([[ 23,   2,   0,  98,   9,   0],
        [  3,  12,  37, 135,  67,   0],
        [  7,   1, 143, 120,  46,   0],
        [ 12,  10,  64, 607, 273,   4],
        [  5,   5,  47, 326, 460,   0],
        [  2,   0,   5,  58,  42,  10]]), 0.47664261298898597)

In [231]:
extra_features(X_train_dtm, X_test_dtm, y_train)

(array([[ 23,   1,   0,  99,   9,   0],
        [  2,   8,  34, 149,  61,   0],
        [  4,   1, 134, 131,  47,   0],
        [ 10,   9,  53, 633, 262,   3],
        [  4,   3,  40, 344, 452,   0],
        [  1,   0,   6,  58,  45,   7]]), 0.47740220281048235)

In [232]:
X_train_dtm

<7896x12497 sparse matrix of type '<class 'numpy.int64'>'
	with 112688 stored elements in COOrdinate format>

###Try again but removing proper nouns and numericals:

In [416]:
from nltk.tag import pos_tag

def remove_ppr_nouns(text):
    tagged = pos_tag(text.split())
    print(tagged)

    return ' '.join([word for word, pos in tagged if pos != 'NNP' and pos != 'CD'])

In [None]:
namus['circumstances_ppn'] = namus.circumstances.apply(remove_ppr_nouns)

In [None]:
namus['head_hair_ppn'] = namus.head_hair.apply(remove_ppr_nouns)

In [None]:
namus['facial_hair_ppn'] = namus.facial_hair.apply(remove_ppr_nouns)

In [None]:
namus['body_hair_ppn'] = namus.body_hair.apply(remove_ppr_nouns)

In [None]:
namus['scars_and_marks_ppn'] = namus.scars_and_marks_description.apply(remove_ppr_nouns)

In [461]:
# define X and y
features = ['circumstances_ppn', 'race', 'facial_hair_ppn', 'body_hair_ppn', 'head_hair_ppn', 'scars_and_marks_ppn']
X = namus[features]
y = namus.rating

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [462]:
vect = CountVectorizer(stop_words='english')
headhair_train_dtm = vect.fit_transform(X_train.head_hair_ppn)
headhair_test_dtm = vect.transform(X_test.head_hair_ppn)
circum_train_dtm = vect.fit_transform(X_train.circumstances_ppn)
circum_test_dtm = vect.transform(X_test.circumstances_ppn)
X_train_dtm = sp.sparse.hstack([circum_train_dtm, headhair_train_dtm])
X_test_dtm = sp.sparse.hstack([circum_test_dtm, headhair_test_dtm])
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(7896, 7124)
(2633, 7124)


In [466]:
extra_features2(X_train_dtm, X_test_dtm, y_train)

(array([[ 20,   1,   2, 101,   8,   0],
        [  0,  20,  32, 132,  70,   0],
        [  5,   1, 125, 133,  53,   0],
        [  4,  20,  62, 623, 258,   3],
        [  3,   5,  37, 361, 437,   0],
        [  1,   0,   5,  61,  46,   4]]), 0.46676794530953286)

##Try to figure out how to add dtm to namus df

In [443]:
# define X and y
features = ['circumstances_ppn', 'head_hair_ppn']
X = namus[features]
y = namus.rating

In [555]:
from textblob import TextBlob, Word

# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    text = text.lower()
    text = TextBlob(text)
    text = text.correct()
    words = text.words
    return [word.lemmatize('v') for word in words]

def split_into_stems(text):
    words = TextBlob(text).words
    return [stemmer.stem(word) for word in words] 

In [533]:
text = (' this is so the climnb climber climbing thing everrrr things thing is was an apple')

In [509]:
vect = CountVectorizer(analyzer=split_into_lemmas)
text_dtm = vect.fit_transform(text)

In [556]:
t = split_into_lemmas(text)

In [557]:
print(t)

['this', 'be', 'so', 'the', 'climb', 'climber', 'climb', 'thing', 'everrrr', 'things', 'thing', 'be', 'be', 'an', 'apple']


In [546]:
t = TextBlob(text)

In [547]:
t.correct()

TextBlob(" this is so the climb climber climbing thing everrrr things thing is was an apple")

In [476]:
vect = CountVectorizer(stop_words='english', analyzer=split_into_lemmas)
headhair_dtm = vect.fit_transform(X.head_hair_ppn)
circum_dtm = vect.fit_transform(X.circumstances_ppn)
#X_dtm = sp.sparse.hstack([circum_dtm, headhair_dtm])
#print(X_dtm.shape)

In [477]:
print(vect.get_feature_names())

["'5", "'92", "'bb", "'beth", "'d", "'down", "'its", "'jane", "'jorge", "'norma", "'pops", "'roland", "'s", "'this", '0.06', '0.28', '0.43', '011-mdr', '01152', '02172002', '0300', '06-8703', '07-0243', '0ccurred', '0cean', '0fficer', '0nc0ming', '0nly', '0riginally', '0rth', '1', '10', '100', '10th', '11/11/06', '11/2001', '116th', '11th', '12', '12-inch', '12/19/1957', '125th', '12886', '129th', '12th', '13-10', '1300', '135', '135th', '136th', '139th', '140', '145th', '14th', '1513', '1515', '153rd', '155th', '158th', '15th', '160th', '163rd', '165', '168th', '1745', '179th', '17th', '183rd', '188th', '18th', '19-21', '1900', '190th', '1968', "1970's/early", '1975', '1998-2005/estimated', '2', '2-2', '2-7', '2-track', '2.05', '20-30-year-old', '200ft', '2012-early', '207', '209', '20cm', '20s-30s', '20th', '21', '210', '23rd', '24th', '251', '260110', '27', '27th', '28', '3', '3-car', '3-story', '3.55', '30', '30-inch', '30th', '33', '338', '339th', '33rd', '34-84/contact', '34th', 

In [449]:
X_dtm

<10529x8504 sparse matrix of type '<class 'numpy.int64'>'
	with 162137 stored elements in COOrdinate format>

In [450]:
Xa_dtm = X_dtm.toarray()

**MAKE INTO DF**

**`pd.DataFrame(array.dtm, colmns = [vect.get_feature_names])`??**

In [454]:
Xa_dtm.shape

(10529, 8504)