## Logistic Regression

In [1]:
import nltk
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
with open('20ng-train-all-terms.txt') as f:
    train = f.read()
with open('20ng-test-all-terms.txt') as f:
    test = f.read()

train = train[:-1]
test = test[:-1]
topic_train = []
topic_test = []
news_train = []
news_test = []
for new in train.split('\n'):
    topic_train.append(new.split('\t')[0])
    news_train.append(new.split('\t')[1])
print('number of training news: ' + str(len(news_train)))
for new in test.split('\n'):
    topic_test.append(new.split('\t')[0])
    news_test.append(new.split('\t')[1])
print('number of test news: ' + str(len(news_test)))

number of training news: 11293
number of test news: 7528


In [3]:
topic_map = {'alt.atheism' : 0,
    'comp.graphics': 1,
    'comp.os.ms-windows.misc': 2,
    'comp.sys.ibm.pc.hardware': 3,
    'comp.sys.mac.hardware': 4,
    'comp.windows.x': 5,
    'misc.forsale': 6,
    'rec.autos': 7,
    'rec.motorcycles': 8,
    'rec.sport.baseball': 9,
    'rec.sport.hockey': 10,
    'sci.crypt': 11,
    'sci.electronics': 12,
    'sci.med': 13,
    'sci.space': 14,
    'soc.religion.christian': 15,
    'talk.politics.guns': 16,
    'talk.politics.mideast': 17,
    'talk.politics.misc': 18,
    'talk.religion.misc': 19}

In [4]:
df_train = pd.DataFrame(np.array(news_train), columns = ['news'])
df_train['target'] = pd.DataFrame(np.array(topic_train), columns = ['topic'])
df_train = df_train.reindex(np.random.permutation(df_train.index)).reset_index(drop = True)
df_train['target'] = df_train['target'].replace(topic_map)
df_train.head()

Unnamed: 0,news,target
0,re top ten ways slick willie could improve his...,18
1,re q at mhz it s fine in article lee husc harv...,4
2,re hell mets in article mavenry altcit eskimo ...,8
3,re batf fbi revenge in article rwing uucp pat ...,16
4,re was go hezbollah in article apr src honeywe...,17


In [5]:
df_test = pd.DataFrame(np.array(news_test), columns = ['news'])
df_test['target'] = pd.DataFrame(np.array(topic_test), columns = ['topic'])
df_test = df_test.reindex(np.random.permutation(df_test.index)).reset_index(drop = True)
df_test['target'] = df_test['target'].replace(topic_map)
df_test.head()

Unnamed: 0,news,target
0,re shaft drives and wheelies in article wrs ws...,8
1,re abc coverage in article bu edu icop csa bu ...,10
2,re xv has escaped wasn t the shareware fee a s...,5
3,re waterbed for sale in article apr magnus acs...,6
4,hawaii tickets forsale hallo all my girlfriend...,6
5,need help info on hard drive terms hi all i ve...,3
6,pwter forsberg can some on e give me some stat...,10
7,re space manuevering tug was hst servicing mis...,14
8,none i am thinking of going on a motorcycle to...,8
9,re israeli media was re israeli terrorism in a...,17


In [6]:
X_train, y_train = df_train['news'], df_train['target']
X_test, y_test = df_test['news'], df_test['target']

In [7]:
%%time
# use CountVectorizer to find three letter tokens, remove stop_words,
# remove tokens that don't appear in at least 20 documents,
# remove tokens that appear in more than 20% of the documents
vect = CountVectorizer(min_df = 20, max_df = 0.2, stop_words = 'english', token_pattern = '(?u)\\b\\w\\w\\w+\\b')
vect.fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)
clflr = LogisticRegression(penalty = 'l2').fit(X_train_vectorized, y_train)


Wall time: 20.1 s


In [10]:
y_predict = clflr.predict(X_test_vectorized)
confusion_matrix(y_predict, y_test)

array([[214,   0,   0,   0,   0,   0,   0,   0,   0,   3,   1,   0,   0,
          4,   1,  11,   1,  19,   5,  27],
       [  1, 275,  26,  13,   4,  32,   3,   5,   2,   1,   0,   3,  17,
          9,   9,   2,   2,   1,   1,   5],
       [  0,  14, 272,  26,   3,  36,   1,   0,   0,   1,   0,   1,   7,
          1,   1,   4,   3,   0,   1,   1],
       [  0,  13,  45, 269,  42,  13,  11,   3,   2,   2,   0,   0,  35,
         10,   1,   2,   1,   0,   2,   2],
       [  1,   9,  14,  29, 296,   6,  12,   5,   1,   2,   2,   6,  17,
          4,   5,   0,   0,   1,   0,   1],
       [  0,  26,  13,   2,   5, 272,   1,   2,   0,   1,   1,   5,   4,
          5,   4,   2,   1,   4,   1,   0],
       [  3,  10,   1,  13,  10,   7, 344,  14,   4,   4,   1,   4,  13,
          3,   5,   2,   1,   3,   2,   0],
       [  0,   6,   2,   1,   3,   4,   7, 329,  18,   1,   2,   1,   7,
          7,   1,   2,   4,   7,   2,   1],
       [  2,   0,   0,   0,   0,   0,   1,   8, 358,   1,   2,  

In [13]:
print('accuracy is ' + str(accuracy_score(y_predict, y_test)))

accuracy is 0.7784272051009564


In [14]:
print(classification_report(y_predict, y_test))

             precision    recall  f1-score   support

          0       0.67      0.75      0.71       286
          1       0.71      0.67      0.69       411
          2       0.69      0.73      0.71       372
          3       0.69      0.59      0.64       453
          4       0.77      0.72      0.74       411
          5       0.69      0.78      0.73       349
          6       0.88      0.77      0.82       444
          7       0.83      0.81      0.82       405
          8       0.90      0.93      0.91       386
          9       0.88      0.88      0.88       400
         10       0.95      0.93      0.94       408
         11       0.88      0.92      0.90       379
         12       0.68      0.64      0.66       414
         13       0.81      0.83      0.82       389
         14       0.86      0.90      0.88       378
         15       0.84      0.75      0.79       443
         16       0.85      0.71      0.77       436
         17       0.77      0.97      0.86   