In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
df=pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [6]:
replace_map = {"sentiment":{'negative':0,'positive':1}}
df.replace(replace_map,inplace=True)

In [7]:
np.bincount(df['sentiment'])

array([25000, 25000], dtype=int64)

In [8]:
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


# Data Cleaning

In [9]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
df['review'][0][0:200]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me abo"

In [11]:
def remove_pun(rev):
    sent=[c for c in rev if c not in string.punctuation]
    sent_join=''.join(sent)
    return sent_join

In [12]:
remove_pun(df['review'][0][0:200])

'One of the other reviewers has mentioned that after watching just 1 Oz episode youll be hooked They are right as this is exactly what happened with mebr br The first thing that struck me abo'

In [13]:
df['no_pun']=df['review'].apply(remove_pun)

In [14]:
df.head()

Unnamed: 0,review,sentiment,no_pun
0,One of the other reviewers has mentioned that ...,1,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,A wonderful little production br br The filmin...
2,I thought this was a wonderful way to spend ti...,1,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,Basically theres a family where a little boy J...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Matteis Love in the Time of Money is a ...


In [15]:
dum=df.copy()

In [16]:
#from nltk.corpus import stopwords
def wtokenizer(msg):
    wtoken=nltk.word_tokenize(msg.lower())
    #stop_words = stopwords.words('english')
    #wtoken=[word for word in wtoken if word not in stop_words]
    wtoken_join=' '.join(wtoken)
    return wtoken_join

In [17]:
dum['tokens']=dum['no_pun'].apply(wtokenizer)#nltk.word_tokenize(df['no_pun'])

In [18]:
dum['no_pun'][0][0:200]

'One of the other reviewers has mentioned that after watching just 1 Oz episode youll be hooked They are right as this is exactly what happened with mebr br The first thing that struck me about Oz was '

In [24]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
def POS_tagging(msg):
    wt=nltk.word_tokenize(msg)
    ps = PorterStemmer()
    k=nltk.pos_tag(wt)
    stop_words = stopwords.words('english')
    sent=[ps.stem(i[0])+'_'+i[1] for i in k if i[0] not in stop_words]
    sent_join=' '.join(sent)
    return sent_join

In [25]:
#POS_tagging(dum['tokens'][0])
dum['tokens']=dum['tokens'].apply(POS_tagging)

In [26]:
dum['tokens'][0][0:200]

'one_CD review_NNS mention_VBN watch_VBG 1_CD oz_JJ episod_NN youll_NN hook_VBN right_JJ exactli_RB happen_VBD mebr_NN br_IN first_JJ thing_NN struck_VBD oz_NN brutal_NN unflinch_JJ scene_NNS violenc_N'

In [27]:
dum['tokens'].head()

0    one_CD review_NNS mention_VBN watch_VBG 1_CD o...
1    wonder_JJ littl_JJ product_NN br_NN br_IN film...
2    thought_VBD wonder_JJ way_NN spend_VB time_NN ...
3    basic_RB there_VBZ famili_NN littl_JJ boy_JJ j...
4    petter_NN mattei_NN love_NN time_NN money_NN v...
Name: tokens, dtype: object

In [28]:
dum.columns

Index(['review', 'sentiment', 'no_pun', 'tokens'], dtype='object')

In [29]:
x=dum['tokens']
y=dum['sentiment']

In [30]:
from sklearn.model_selection import train_test_split  
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.30, random_state=0)

In [31]:
x_train.head()

17967    prereleas_JJ version_NN 1933_CD babi_NN face_N...
32391    shine_NN wit_VBN visual_JJ flair_NN icon_JJ pe...
9341     subject_NN world_NN war_NN ii_NN robert_NN rya...
7929     saw_VBD movi_NN first_JJ second_VBZ voic_NN tr...
46544    kooki_NN funni_JJ bit_NN divers_NN kind_NN see...
Name: tokens, dtype: object

In [32]:
y_train.head()

17967    1
32391    0
9341     1
7929     1
46544    1
Name: sentiment, dtype: int64

In [33]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(35000,) (15000,) (35000,) (15000,)


In [34]:
print('In train data:')
np.bincount(y_train)

In train data:


array([17460, 17540], dtype=int64)

In [35]:
print('In test data:')
np.bincount(y_test)

In test data:


array([7540, 7460], dtype=int64)

# Using Bag of Words

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
xb_train=vectorizer.fit_transform(x_train)
xb_test=vectorizer.transform(x_test)

In [93]:
feature_names=vectorizer.get_feature_names()
print("No. of features: {}".format(len(feature_names)))
print("features 5000-5050: {}".format(feature_names[5000:5050]))

No. of features: 211021
features 5000-5050: ['address_jj', 'address_nn', 'address_nns', 'address_vb', 'address_vbd', 'address_vbg', 'address_vbn', 'address_vbp', 'address_vbz', 'addressbook_nn', 'addressbr_nn', 'addressedbr_jj', 'addsbr_vbz', 'addsubtractmultiplydivid_rb', 'addtion_nn', 'addtion_rb', 'addytown_in', 'ade_jj', 'ade_nn', 'ade_nns', 'ade_vbp', 'adebesi_nn', 'adebisi_jj', 'adebisi_nn', 'adebisi_vbz', 'adel_jj', 'adel_nn', 'adel_nns', 'adel_rb', 'adel_vb', 'adel_vbp', 'adel_vbz', 'adela_jj', 'adela_nn', 'adela_nns', 'adelad_nn', 'adelaid_in', 'adelaid_jj', 'adelaid_nn', 'adelaid_nns', 'adelaid_rb', 'adelaid_vbp', 'adelaidebr_jj', 'adelebr_jj', 'adelebr_nn', 'adelehi_jj', 'adelight_jj', 'aden_jj', 'adenoid_jj', 'adenoid_nn']


In [38]:
vectorizer.vocabulary_

{'prereleas_jj': 145652,
 'version_nn': 198745,
 '1933_cd': 865,
 'babi_nn': 15927,
 'face_nn': 64455,
 'would_md': 207432,
 'make_vb': 112655,
 'ideal_jj': 90733,
 'introduct_nn': 94650,
 'corpor_jj': 41484,
 'seminar_nn': 164667,
 'sexual_jj': 165853,
 'harass_nn': 82876,
 'mentor_vbn': 117628,
 'nietszchean_jj': 127796,
 'professor_nn': 146748,
 'lili_jj': 107690,
 'power_nns': 144801,
 'rise_vbz': 156501,
 'life_nn': 107230,
 'easi_jj': 56976,
 'virtu_nn': 199809,
 'father_nns': 65942,
 'speakeasi_vbp': 174347,
 'rapid_jj': 150771,
 'climb_nn': 36743,
 'ladder_nn': 103581,
 'larg_jj': 104396,
 'bank_nn': 17219,
 'rung_nn': 158878,
 'execut_nn': 63065,
 'brain_nn': 25572,
 'belt_nn': 19887,
 'ethic_nns': 61827,
 'lock_vbn': 108947,
 'vault_nn': 198232,
 'film_nn': 67648,
 'victim_nns': 199040,
 'except_in': 62828,
 'lili_nn': 107691,
 'childhood_nn': 34524,
 'destroy_vbn': 49389,
 'abus_jj': 3750,
 'exploit_nn': 63669,
 'father_nn': 65941,
 'destruct_jj': 49395,
 'relationship_nn': 

In [39]:
words=vectorizer.get_feature_names()[10000:10020]
pd.DataFrame(xb_train[10:15,10000:10020].todense(),columns=words)

Unnamed: 0,angelscourag_nn,angelsmost_nn,angelsnot_jj,angelu_nn,angelwa_rb,angelyn_nn,anger_jj,anger_jjr,anger_nn,anger_nns,anger_rb,anger_rbr,anger_rp,anger_vb,anger_vbd,anger_vbg,anger_vbn,anger_vbp,anger_vbz,angerbr_in
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [41]:
cross_val_score(LogisticRegression(),xb_train,y_train,cv=5).mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8839714285714285

In [42]:
logreg=LogisticRegression()
logreg.fit(xb_train,y_train)
print("Train score: {}".format(logreg.score(xb_train,y_train)))
print("Test score: {}".format(logreg.score(xb_test,y_test)))

Train score: 0.9992285714285715
Test score: 0.8823333333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
pred=logreg.predict(xb_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

array([[6621,  919],
       [ 846, 6614]], dtype=int64)

In [44]:
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score

In [45]:
accuracy_score(y_test,pred)

0.8823333333333333

In [46]:
precision_score(y_test,pred)

0.878003451480154

In [47]:
recall_score(y_test,pred)

0.8865951742627346

In [48]:
f1_score(y_test,pred)

0.8822783965850731

# Using TF-IDF

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
tfidf=TfidfVectorizer()

In [51]:
xi_train=tfidf.fit_transform(x_train)
xi_test=tfidf.transform(x_test)

In [74]:
feature_names=tfidf.get_feature_names()
print("No. of features: {}".format(len(feature_names)))
print("features 5000-5050: {}".format(feature_names[5000:5050]))

No. of features: 211021
features 5000-5050: ['address_jj', 'address_nn', 'address_nns', 'address_vb', 'address_vbd', 'address_vbg', 'address_vbn', 'address_vbp', 'address_vbz', 'addressbook_nn', 'addressbr_nn', 'addressedbr_jj', 'addsbr_vbz', 'addsubtractmultiplydivid_rb', 'addtion_nn', 'addtion_rb', 'addytown_in', 'ade_jj', 'ade_nn', 'ade_nns', 'ade_vbp', 'adebesi_nn', 'adebisi_jj', 'adebisi_nn', 'adebisi_vbz', 'adel_jj', 'adel_nn', 'adel_nns', 'adel_rb', 'adel_vb', 'adel_vbp', 'adel_vbz', 'adela_jj', 'adela_nn', 'adela_nns', 'adelad_nn', 'adelaid_in', 'adelaid_jj', 'adelaid_nn', 'adelaid_nns', 'adelaid_rb', 'adelaid_vbp', 'adelaidebr_jj', 'adelebr_jj', 'adelebr_nn', 'adelehi_jj', 'adelight_jj', 'aden_jj', 'adenoid_jj', 'adenoid_nn']


In [53]:
tfidf.vocabulary_

{'prereleas_jj': 145652,
 'version_nn': 198745,
 '1933_cd': 865,
 'babi_nn': 15927,
 'face_nn': 64455,
 'would_md': 207432,
 'make_vb': 112655,
 'ideal_jj': 90733,
 'introduct_nn': 94650,
 'corpor_jj': 41484,
 'seminar_nn': 164667,
 'sexual_jj': 165853,
 'harass_nn': 82876,
 'mentor_vbn': 117628,
 'nietszchean_jj': 127796,
 'professor_nn': 146748,
 'lili_jj': 107690,
 'power_nns': 144801,
 'rise_vbz': 156501,
 'life_nn': 107230,
 'easi_jj': 56976,
 'virtu_nn': 199809,
 'father_nns': 65942,
 'speakeasi_vbp': 174347,
 'rapid_jj': 150771,
 'climb_nn': 36743,
 'ladder_nn': 103581,
 'larg_jj': 104396,
 'bank_nn': 17219,
 'rung_nn': 158878,
 'execut_nn': 63065,
 'brain_nn': 25572,
 'belt_nn': 19887,
 'ethic_nns': 61827,
 'lock_vbn': 108947,
 'vault_nn': 198232,
 'film_nn': 67648,
 'victim_nns': 199040,
 'except_in': 62828,
 'lili_nn': 107691,
 'childhood_nn': 34524,
 'destroy_vbn': 49389,
 'abus_jj': 3750,
 'exploit_nn': 63669,
 'father_nn': 65941,
 'destruct_jj': 49395,
 'relationship_nn': 

In [75]:
iwords=vectorizer.get_feature_names()[10000:10020]
pd.DataFrame(xi_train[10:15,10000:10020].todense(),columns=iwords)

Unnamed: 0,angelscourag_nn,angelsmost_nn,angelsnot_jj,angelu_nn,angelwa_rb,angelyn_nn,anger_jj,anger_jjr,anger_nn,anger_nns,anger_rb,anger_rbr,anger_rp,anger_vb,anger_vbd,anger_vbg,anger_vbn,anger_vbp,anger_vbz,angerbr_in
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
cross_val_score(LogisticRegression(),xi_train,y_train,cv=5).mean()

0.8878

In [56]:
ilogreg=LogisticRegression()
ilogreg.fit(xi_train,y_train)
print("Train score: {}".format(ilogreg.score(xi_train,y_train)))
print("Test score: {}".format(ilogreg.score(xi_test,y_test)))

Train score: 0.9398571428571428
Test score: 0.8890666666666667


In [57]:
ipred=ilogreg.predict(xi_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,ipred)

array([[6610,  930],
       [ 734, 6726]], dtype=int64)

In [58]:
accuracy_score(y_test,ipred)

0.8890666666666667

In [59]:
precision_score(y_test,ipred)

0.8785266457680251

In [60]:
recall_score(y_test,ipred)

0.9016085790884718

In [61]:
f1_score(y_test,ipred)

0.8899179677163269

# Using n-gram

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
nvect = CountVectorizer(ngram_range=(2,2))
xn_train=nvect.fit_transform(x_train)
xn_test=nvect.transform(x_test)

In [73]:
fnames=nvect.get_feature_names()
print("No. of features: {}".format(len(fnames)))
print("features 5000-5050: {}".format(fnames[5000:5050]))

No. of features: 2668269
features 5000-5050: ['1927_cd capt_nn', '1927_cd cinemat_jj', '1927_cd didnt_vbp', '1927_cd ever_rb', '1927_cd film_nn', '1927_cd final_jj', '1927_cd first_jj', '1927_cd hollywood_nn', '1927_cd inevit_jj', '1927_cd inspir_jj', '1927_cd jimmi_jj', '1927_cd look_vbg', '1927_cd luck_nn', '1927_cd money_nn', '1927_cd noel_nn', '1927_cd perhap_rb', '1927_cd play_nn', '1927_cd return_vbz', '1927_cd silent_jj', '1927_cd simultan_rb', '1927_cd solo_nn', '1927_cd spinoff_nn', '1927_cd stand_vbz', '1927_cd star_vbz', '1927_cd stewart_nn', '1927_cd total_rb', '1927_cd want_vbd', '1927_cd young_jj', '1928_cd banner_nn', '1928_cd cameo_nn', '1928_cd camera_nns', '1928_cd davi_nns', '1928_cd enter_vbz', '1928_cd except_nn', '1928_cd famous_rb', '1928_cd fifti_jj', '1928_cd geograp_rb', '1928_cd hous_vbn', '1928_cd isnt_nn', '1928_cd josef_nn', '1928_cd last_jj', '1928_cd mani_jj', '1928_cd movi_nns', '1928_cd often_rb', '1928_cd one_cd', '1928_cd orchestr_jj', '1928_cd origi

In [64]:
nvect.vocabulary_

{'prereleas_jj version_nn': 1818850,
 'version_nn 1933_cd': 2506212,
 '1933_cd babi_nn': 5418,
 'babi_nn face_nn': 193887,
 'face_nn would_md': 820217,
 'would_md make_vb': 2635278,
 'make_vb ideal_jj': 1436871,
 'ideal_jj introduct_nn': 1163420,
 'introduct_nn corpor_jj': 1220168,
 'corpor_jj seminar_nn': 517322,
 'seminar_nn sexual_jj': 2071857,
 'sexual_jj harass_nn': 2093769,
 'harass_nn mentor_vbn': 1074630,
 'mentor_vbn nietszchean_jj': 1495888,
 'nietszchean_jj professor_nn': 1617383,
 'professor_nn lili_jj': 1839564,
 'lili_jj power_nns': 1367759,
 'power_nns rise_vbz': 1811707,
 'rise_vbz life_nn': 1967537,
 'life_nn easi_jj': 1348621,
 'easi_jj virtu_nn': 705060,
 'virtu_nn father_nns': 2521880,
 'father_nns speakeasi_vbp': 847540,
 'speakeasi_vbp rapid_jj': 2196039,
 'rapid_jj climb_nn': 1881296,
 'climb_nn corpor_jj': 448246,
 'corpor_jj ladder_nn': 517293,
 'ladder_nn larg_jj': 1303066,
 'larg_jj bank_nn': 1309358,
 'bank_nn rung_nn': 208467,
 'rung_nn ladder_nn': 1992451,

In [65]:
nwords=nvect.get_feature_names()[10000:10020]
pd.DataFrame(xn_train[10:15,10000:10020].todense(),columns=nwords)

Unnamed: 0,1986_cd film_nn,1986_cd fun_nn,1986_cd golden_jj,1986_cd good_jj,1986_cd got_vbd,1986_cd guitar_nn,1986_cd henc_nn,1986_cd hous_nn,1986_cd hype_nn,1986_cd im_jj,1986_cd initi_rb,1986_cd iron_rb,1986_cd knockoff_nn,1986_cd later_rb,1986_cd less_nn,1986_cd like_vbd,1986_cd made_vbd,1986_cd money_nn,1986_cd movi_nn,1986_cd must_md
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [66]:
cross_val_score(LogisticRegression(),xn_train,y_train,cv=5).mean()

0.8462857142857143

In [67]:
nlogreg=LogisticRegression()
nlogreg.fit(xn_train,y_train)
print("Train score: {}".format(nlogreg.score(xn_train,y_train)))
print("Test score: {}".format(nlogreg.score(xn_test,y_test)))

Train score: 1.0
Test score: 0.8532


In [68]:
npred=nlogreg.predict(xn_test)
confusion_matrix(y_test,npred)

array([[6252, 1288],
       [ 914, 6546]], dtype=int64)

In [69]:
accuracy_score(y_test,npred)

0.8532

In [70]:
precision_score(y_test,npred)

0.8355884605565483

In [71]:
recall_score(y_test,npred)

0.8774798927613942

In [72]:
f1_score(y_test,npred)

0.8560219693997646

# n-gram with TF-IDF

In [79]:
model=TfidfVectorizer(ngram_range=(2,2))

In [81]:
xg_train=model.fit_transform(x_train)
xg_test=model.transform(x_test)

In [82]:
f_names=model.get_feature_names()
print("No. of features: {}".format(len(f_names)))
print("features 5000-5050: {}".format(f_names[5000:5050]))

No. of features: 2668269
features 5000-5050: ['1927_cd capt_nn', '1927_cd cinemat_jj', '1927_cd didnt_vbp', '1927_cd ever_rb', '1927_cd film_nn', '1927_cd final_jj', '1927_cd first_jj', '1927_cd hollywood_nn', '1927_cd inevit_jj', '1927_cd inspir_jj', '1927_cd jimmi_jj', '1927_cd look_vbg', '1927_cd luck_nn', '1927_cd money_nn', '1927_cd noel_nn', '1927_cd perhap_rb', '1927_cd play_nn', '1927_cd return_vbz', '1927_cd silent_jj', '1927_cd simultan_rb', '1927_cd solo_nn', '1927_cd spinoff_nn', '1927_cd stand_vbz', '1927_cd star_vbz', '1927_cd stewart_nn', '1927_cd total_rb', '1927_cd want_vbd', '1927_cd young_jj', '1928_cd banner_nn', '1928_cd cameo_nn', '1928_cd camera_nns', '1928_cd davi_nns', '1928_cd enter_vbz', '1928_cd except_nn', '1928_cd famous_rb', '1928_cd fifti_jj', '1928_cd geograp_rb', '1928_cd hous_vbn', '1928_cd isnt_nn', '1928_cd josef_nn', '1928_cd last_jj', '1928_cd mani_jj', '1928_cd movi_nns', '1928_cd often_rb', '1928_cd one_cd', '1928_cd orchestr_jj', '1928_cd origi

In [83]:
model.vocabulary_

{'prereleas_jj version_nn': 1818850,
 'version_nn 1933_cd': 2506212,
 '1933_cd babi_nn': 5418,
 'babi_nn face_nn': 193887,
 'face_nn would_md': 820217,
 'would_md make_vb': 2635278,
 'make_vb ideal_jj': 1436871,
 'ideal_jj introduct_nn': 1163420,
 'introduct_nn corpor_jj': 1220168,
 'corpor_jj seminar_nn': 517322,
 'seminar_nn sexual_jj': 2071857,
 'sexual_jj harass_nn': 2093769,
 'harass_nn mentor_vbn': 1074630,
 'mentor_vbn nietszchean_jj': 1495888,
 'nietszchean_jj professor_nn': 1617383,
 'professor_nn lili_jj': 1839564,
 'lili_jj power_nns': 1367759,
 'power_nns rise_vbz': 1811707,
 'rise_vbz life_nn': 1967537,
 'life_nn easi_jj': 1348621,
 'easi_jj virtu_nn': 705060,
 'virtu_nn father_nns': 2521880,
 'father_nns speakeasi_vbp': 847540,
 'speakeasi_vbp rapid_jj': 2196039,
 'rapid_jj climb_nn': 1881296,
 'climb_nn corpor_jj': 448246,
 'corpor_jj ladder_nn': 517293,
 'ladder_nn larg_jj': 1303066,
 'larg_jj bank_nn': 1309358,
 'bank_nn rung_nn': 208467,
 'rung_nn ladder_nn': 1992451,

In [84]:
gwords=model.get_feature_names()[10000:10020]
pd.DataFrame(xg_train[10:15,10000:10020].todense(),columns=gwords)

Unnamed: 0,1986_cd film_nn,1986_cd fun_nn,1986_cd golden_jj,1986_cd good_jj,1986_cd got_vbd,1986_cd guitar_nn,1986_cd henc_nn,1986_cd hous_nn,1986_cd hype_nn,1986_cd im_jj,1986_cd initi_rb,1986_cd iron_rb,1986_cd knockoff_nn,1986_cd later_rb,1986_cd less_nn,1986_cd like_vbd,1986_cd made_vbd,1986_cd money_nn,1986_cd movi_nn,1986_cd must_md
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
cross_val_score(LogisticRegression(),xg_train,y_train,cv=5).mean()

0.8390285714285713

In [86]:
glogreg=LogisticRegression()
glogreg.fit(xg_train,y_train)
print("Train score: {}".format(glogreg.score(xg_train,y_train)))
print("Test score: {}".format(glogreg.score(xg_test,y_test)))

Train score: 0.9946571428571429
Test score: 0.8496


In [88]:
gpred=glogreg.predict(xg_test)
confusion_matrix(y_test,gpred)

array([[6279, 1261],
       [ 995, 6465]], dtype=int64)

In [89]:
accuracy_score(y_test,gpred)

0.8496

In [90]:
precision_score(y_test,gpred)

0.8367848822158944

In [91]:
recall_score(y_test,gpred)

0.8666219839142091

In [92]:
f1_score(y_test,gpred)

0.8514421177400238