# Spooky Author Text Classification

# Importing Dependencies and Loading Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
train.drop('id',inplace=True,axis=1)

Creating Dummy class Variables

In [4]:
dummies={'EAP':0,'HPL':1,'MWS':2}
train=train.replace({'author':dummies})

In [5]:
train.head()
X=train['text']
Y=train['author']
X.shape

(19579,)

In [6]:
x_test=test['text']
x_test.shape

(8392,)

Training CountVectorizer on Data

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cvec=CountVectorizer(stop_words='english')

ctrain=cvec.fit_transform(X)

In [8]:
ctest=cvec.transform(x_test)

# Naive Bayes with CountVectorizor

log-loss= 0.46

In [9]:
from sklearn.naive_bayes import MultinomialNB
naive=MultinomialNB(fit_prior=False)
naive.fit(ctrain,Y)
predictions=naive.predict_proba(ctest)

In [10]:
predictions.shape


(8392, 3)

In [11]:
preds=pd.DataFrame()

In [12]:
preds['id']=test['id']
preds["EAP"]=predictions[:,0]
preds["HPL"]=predictions[:,1]
preds["MWS"]=predictions[:,2]

In [13]:
preds.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.003767,0.000688,0.995545
1,id24541,0.999975,2.3e-05,2e-06
2,id00134,0.134762,0.865174,6.4e-05
3,id27757,0.36818,0.631819,1e-06
4,id04081,0.959781,0.030765,0.009454


In [19]:
preds.to_csv('NaiveFITPRIOR.csv')

# Trying with TFID vectorizor 

log-loss= 0.48

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
tfid= TfidfTransformer()
tftrain = tfid.fit_transform(ctrain)
tftest=tfid.transform(ctest)


In [16]:
from sklearn.naive_bayes import MultinomialNB
bayes=MultinomialNB(fit_prior=False)
bayes.fit(tftrain,Y)
predictions=naive.predict_proba(tftest)

In [50]:
preds_1=pd.DataFrame()

In [51]:
preds_1['id']=test['id']
preds_1["EAP"]=predictions[:,0]
preds_1["HPL"]=predictions[:,1]
preds_1["MWS"]=predictions[:,2]

In [53]:
preds_1.to_csv('NaiveTFID.csv')

# Trying SVM

log-loss= 0.48

In [68]:
from sklearn import svm

In [73]:
sv=svm.SVC(kernel='linear',C=1.0,probability=True)
sv.fit(ctrain,Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [75]:
predict=sv.predict_proba(ctest)

In [76]:
preds_2=pd.DataFrame()

In [80]:
preds_2['id']=test['id']
preds_2["EAP"]=predictions[:,0]
preds_2["HPL"]=predictions[:,1]
preds_2["MWS"]=predictions[:,2]

In [82]:
preds_2.to_csv('svmPY.csv')

# Ensemble Methods

Random Forest : Log-Loss = 0.75

In [1]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cvec=CountVectorizer(stop_words='english')
ctrain=cvec.fit_transform(X)
ctest=cvec.transform(x_test)

In [10]:
RF=RandomForestClassifier(n_estimators=50)
RF.fit(ctrain,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [11]:
RFpred=RF.predict_proba(ctest)

In [12]:
preds_3=pd.DataFrame()

In [15]:
preds_3['id']=test['id']
preds_3["EAP"]=RFpred[:,0]
preds_3["HPL"]=RFpred[:,1]
preds_3["MWS"]=RFpred[:,2]

In [17]:
preds_3.to_csv('rfPY.csv')

# Voting Classifier

Log-Loss = 0.407

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.calibration import CalibratedClassifierCV

lg=CalibratedClassifierCV(LogisticRegression())
nb=CalibratedClassifierCV(GaussianNB())
mnb=CalibratedClassifierCV(MultinomialNB())

Models=[lg,nb,mnb]

from sklearn.ensemble import VotingClassifier
vc=VotingClassifier(estimators=models,voting='soft')
vc.fit(ctrain,Y)




# models = [('MultiNB', MultinomialNB(alpha=0.03)),
#           ('Calibrated MultiNB', CalibratedClassifierCV(
#               MultinomialNB(alpha=0.03), method='isotonic')),
#           ('Calibrated BernoulliNB', CalibratedClassifierCV(
#               BernoulliNB(alpha=0.03), method='isotonic')),
#           ('Calibrated Huber', CalibratedClassifierCV(
#               SGDClassifier(loss='modified_huber', alpha=1e-4,
#                             ), method='sigmoid')),
#           ('Logit', LogisticRegression(C=30))]



In [51]:
from sklearn.ensemble import VotingClassifier
vc=VotingClassifier(estimators=models,voting='soft')
vc.fit(ctrain,Y)

VotingClassifier(estimators=[('MultiNB', MultinomialNB(alpha=0.03, class_prior=None, fit_prior=True)), ('Calibrated MultiNB', CalibratedClassifierCV(base_estimator=MultinomialNB(alpha=0.03, class_prior=None, fit_prior=True),
            cv=3, method='isotonic')), ('Calibrated BernoulliNB', CalibratedClassifierCV(bas...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))],
         n_jobs=1, voting='soft', weights=None)

In [84]:
my_pred=vc.predict_proba(ctest.toarray())
my_pred

array([[  2.06118803e-02,   2.99679618e-03,   9.76391324e-01],
       [  9.10845620e-01,   8.89726253e-02,   1.81755137e-04],
       [  2.96155046e-01,   7.03544708e-01,   3.00245201e-04],
       ..., 
       [  9.75288767e-01,   6.36152615e-03,   1.83497071e-02],
       [  4.31620958e-02,   6.60474099e-04,   9.56177430e-01],
       [  5.76718254e-02,   9.42328172e-01,   2.72850141e-09]])

In [53]:
preds_4=pd.DataFrame()

In [54]:
preds_4['id']=test['id']
preds_4["EAP"]=my_pred[:,0]
preds_4["HPL"]=my_pred[:,1]
preds_4["MWS"]=my_pred[:,2]

In [55]:
preds_4.to_csv('Voting.csv')

# Voting Classifier with TFIDF

In [57]:
from sklearn.ensemble import VotingClassifier
vc=VotingClassifier(estimators=models,voting='soft')
vc.fit(tftrain,Y)

VotingClassifier(estimators=[('MultiNB', MultinomialNB(alpha=0.03, class_prior=None, fit_prior=True)), ('Calibrated MultiNB', CalibratedClassifierCV(base_estimator=MultinomialNB(alpha=0.03, class_prior=None, fit_prior=True),
            cv=3, method='isotonic')), ('Calibrated BernoulliNB', CalibratedClassifierCV(bas...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))],
         n_jobs=1, voting='soft', weights=None)

In [58]:
my_preds=vc.predict_proba(tftest)

In [59]:
preds_5=pd.DataFrame()

In [61]:
preds_5['id']=test['id']
preds_5["EAP"]=my_preds[:,0]
preds_5["HPL"]=my_preds[:,1]
preds_5["MWS"]=my_preds[:,2]

In [63]:
preds_5.to_csv('VotingTFID.csv')

In [69]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

models = [('MultiNB', MultinomialNB(alpha=0.03)),
          ('Calibrated MultiNB', CalibratedClassifierCV(
              MultinomialNB(alpha=0.03), method='isotonic')),
          ('Calibrated BernoulliNB', CalibratedClassifierCV(
              BernoulliNB(alpha=0.03), method='isotonic')),
          ('Calibrated Huber', CalibratedClassifierCV(
              SGDClassifier(loss='modified_huber', alpha=1e-4,
                            ), method='sigmoid')),
          ('Logit', LogisticRegression(C=30))]


vectorizer=TfidfVectorizer(token_pattern=r'\w{1,}', sublinear_tf=True, ngram_range=(1,2))
clf = VotingClassifier(models, voting='soft', weights=[3,3,3,1,1])


In [75]:
ctrain.shape

(19579, 24764)

In [77]:
ctest.shape

(8392, 24764)

In [87]:
clf.fit(ctrain, Y)
results = clf.predict_proba(ctest)


In [88]:
final_pred=pd.DataFrame()
results

array([[  6.11584949e-02,   1.93863192e-02,   9.19455186e-01],
       [  9.79983286e-01,   1.90396907e-02,   9.77023020e-04],
       [  4.96806891e-01,   4.99739476e-01,   3.45363305e-03],
       ..., 
       [  9.29560867e-01,   1.77245374e-02,   5.27145955e-02],
       [  9.53358508e-02,   3.51908370e-03,   9.01145066e-01],
       [  2.46861738e-02,   9.74928957e-01,   3.84869471e-04]])

In [90]:
#pd.DataFrame(results, index=test.index, columns=authors).to_csv('results.csv')
p=pd.DataFrame()

In [91]:

p['id']=test['id']
p["EAP"]=results[:,0]
p["HPL"]=results[:,1]
p["MWS"]=results[:,2]

In [92]:
p.to_csv('FINAL.csv')