In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('essays.csv', encoding="latin-1")

In [3]:
df.head(5)

Unnamed: 0,AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y


In [4]:
df=df.iloc[:,1:]

In [5]:
df.isnull().sum()
#no null values

TEXT    0
cEXT    0
cNEU    0
cAGR    0
cCON    0
cOPN    0
dtype: int64

In [6]:
df.duplicated().sum()
#no duplicate values

0

In [7]:
#rename to convinient names
df.rename(columns={'TEXT':'text','cEXT':'ext','cNEU':'neu','cAGR':'agr','cCON':'con','cOPN':'opn'},inplace=True)
df.sample(5)

Unnamed: 0,text,ext,neu,agr,con,opn
1603,I just came back from the Texas Crew Meet...,n,n,y,y,y
1019,I am a bit in wonderous as to what is supposed...,n,n,n,n,y
1035,I am really tired and bored right now. I am a ...,y,y,n,n,n
1433,The first sensation I feel while beginning wri...,n,n,n,n,y
1029,I can't ever seem to remember my passwords. An...,n,y,n,n,y


In [8]:
def change(x):
    if x=='y':
        return 1
    else:
        return 0

In [9]:
#change to numerical values
df['ext'] = df['ext'].apply(change)
df['neu'] = df['neu'].apply(change)
df['agr'] = df['agr'].apply(change)
df['con'] = df['con'].apply(change)
df['opn'] = df['opn'].apply(change)

In [10]:
df.head()

Unnamed: 0,text,ext,neu,agr,con,opn
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,I can't believe it! It's really happening! M...,1,0,1,1,0
4,"Well, here I go with the good old stream of co...",1,0,1,0,1


In [11]:
df.sample(20)

Unnamed: 0,text,ext,neu,agr,con,opn
767,There are mostly worries and fear traveling th...,0,0,0,0,0
122,O. K. Here I am in the computer lab in my dorm...,1,1,1,1,0
155,Trying to track my thoughts. That's weird beca...,0,1,0,1,0
1495,"This assignment seemed so easy at first, but n...",0,1,1,1,0
2280,Yeah so I am so stressed right now and I am wi...,1,1,0,0,0
1422,I guess that I should start by telling you whe...,1,0,0,1,1
1209,I am so overwhelmed that I don't even know wha...,1,1,0,0,0
1197,"Today has been a pretty good day, I guess. It'...",0,1,1,1,0
597,"I think about my mother, my ex-boyfriend in Ho...",0,1,0,0,0
1220,I wondering about what I should write here but...,0,1,0,0,0


In [12]:
#text preprocessing
#small letters
#remove special characters
#remove stopwords
#convert to vector

In [13]:
df['text'] = df['text'].apply( lambda x : x.lower())
df.sample(5)

Unnamed: 0,text,ext,neu,agr,con,opn
2326,okay dokey here we go. just thought i would ge...,0,1,0,1,0
928,ok so now i'm writing i'm doing this so i can ...,0,0,1,0,1
736,twenty minutes seems like a very long time to ...,1,1,0,0,0
989,"""antisocial""--that is what it said in large le...",0,1,1,0,1
1295,stream of conscience. just typing whatever com...,1,1,1,1,1


In [14]:
def remove_special_chars(text):
    result = ""
    for i in text:
        if i.isalnum() or i ==' ':
            result = result + i
        else:
            result = result + ""
    return result.strip()

In [15]:
df['text'] = df['text'].apply(remove_special_chars)
df['text']

0       well right now i just woke up from a midday na...
1       well here we go with the stream of consciousne...
2       an open keyboard and buttons to push the thing...
3       i cant believe it  its really happening  my pu...
4       well here i go with the good old stream of con...
                              ...                        
2462    im home wanted to go to bed but remembered tha...
2463    stream of consiousnesssskdj how do you spell t...
2464    it is wednesday december 8th and a lot has bee...
2465    man this week has been hellish anyways now its...
2466    i have just gotten off the phone with brady im...
Name: text, Length: 2467, dtype: object

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english',max_features = 10000)
X = cv.fit_transform(df['text']).toarray()
X.shape

(2467, 10000)

In [17]:
ext = df['ext'].values.astype('int')
neu = df['neu'].values.astype('int')
agr = df['agr'].values.astype('int')
con = df['con'].values.astype('int')
opn = df['opn'].values.astype('int')

In [18]:
ext = df['ext']
neu=df['neu']
agr = df['agr']
con=df['con']
opn=df['opn']

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,ext_train,ext_test = train_test_split(X,ext,test_size=0.3)
X1_train,X1_test,neu_train,neu_test = train_test_split(X,neu,test_size=0.3)
X2_train,X2_test,agr_train,agr_test = train_test_split(X,agr,test_size=0.3)
X3_train,X3_test,con_train,con_test = train_test_split(X,con,test_size=0.3)
X4_train,X4_test,opn_train,opn_test = train_test_split(X,opn,test_size=0.3)

In [20]:
ext_svm = svm.LinearSVC()
ext_svm.fit(X_train,ext_train)
ext_pred = ext_svm.predict(X_test)
accuracy_score(ext_test,ext_pred)

0.5492577597840755

In [21]:
neu_svm = svm.LinearSVC()
neu_svm.fit(X1_train,neu_train)
neu_pred = neu_svm.predict(X1_test)
accuracy_score(neu_test,neu_pred)

0.5182186234817814

In [22]:
agr_svm = svm.LinearSVC()
agr_svm.fit(X2_train,agr_train)
agr_pred = agr_svm.predict(X2_test)
accuracy_score(agr_test,agr_pred)

0.5398110661268556

In [23]:
con_svm = svm.LinearSVC()
con_svm.fit(X3_train,con_train)
con_pred = con_svm.predict(X3_test)
accuracy_score(con_test,con_pred)

0.5479082321187584

In [24]:
opn_svm = svm.LinearSVC()
opn_svm.fit(X4_train,opn_train)
opn_pred = opn_svm.predict(X4_test)
accuracy_score(opn_test,opn_pred)

0.5951417004048583

In [25]:
#4
text = "Many wishes of my life haven't been fulfilled. This makes me irritated. I feel like my life has been filled with diasspointments."
text = text.lower()
text = remove_special_chars(text)
val = cv.transform([text])
a = ext_svm.predict(val)
b = neu_svm.predict(val)
c = agr_svm.predict(val)
d = con_svm.predict(val)
e = opn_svm.predict(val)

In [26]:
qualities = ['Extroversion','Neuroticism','Agreeableness','Conscientiousness','Openness']
values = [a,b,c,d,e]
for i in range(5):
  if(values[i][0]==0):
    print(qualities[i],': No')
  else:
    print(qualities[i],': Yes')

Extroversion : Yes
Neuroticism : No
Agreeableness : Yes
Conscientiousness : No
Openness : Yes
