In [1]:
%matplotlib inline
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import string


In [2]:
# Load the 20 Newsgroups dataset using the loader's default arguments.
# The returned object's `.data` and `.target` attributes are read in the next cell.
ng = fetch_20newsgroups()

In [3]:
# Documents and their class labels, as provided by the dataset object.
x, y = ng.data, ng.target

# Pin the random seed so the train/test partition -- and therefore every
# number computed below -- is reproducible under Restart & Run All.
Xtrain, Xtest, Ctrain, Ctest = train_test_split(x, y, random_state=42)

In [4]:
# Keep the stop words in a hashed container: `tf` tests `word not in stop`
# once per token, which is a linear scan on a list but constant time on a set.
stop = frozenset(ENGLISH_STOP_WORDS)

In [5]:
# Generates dictionary for a file, with words as keys, and tf value as values

def tf(file):
    """Compute normalized term frequencies for a single document.

    The text is split on whitespace; each token is lower-cased and stripped
    of every character in ``string.punctuation``. Tokens that are stop words
    (members of the module-level ``stop`` collection) are skipped, as are
    tokens that become empty after punctuation removal.

    Parameters
    ----------
    file : str
        Raw text of the document.

    Returns
    -------
    dict
        Maps each retained word to its occurrence count divided by the total
        number of whitespace-separated tokens in the document (including
        skipped tokens). Empty when the document has no retained words.
    """
    tokens = file.split()
    strip_punctuation = str.maketrans("", "", string.punctuation)

    doc_dict = {}
    for token in tokens:
        word = token.lower().translate(strip_punctuation)

        # A token made only of punctuation (e.g. "--" or ">>") collapses to
        # the empty string; it is not a term and must not be counted.
        if not word or word in stop:
            continue

        doc_dict[word] = doc_dict.get(word, 0) + 1

    # Normalize by the raw token count. When there are no tokens the
    # dictionary is empty, so no division takes place.
    length = float(len(tokens))
    for word in doc_dict:
        doc_dict[word] = doc_dict[word] / length

    return doc_dict
    

In [6]:
# creates a dictionary mapping each word to the number of training documents that contain it

# Document frequency table: word -> number of training documents containing it.
# `tf` returns one key per distinct word, so each document adds at most 1 per word.
word_count = {}

# Use a dedicated loop variable; reusing `x` here would overwrite the corpus
# bound to `x` in the data-splitting cell.
for train_doc in Xtrain:
    for word in tf(train_doc):
        word_count[word] = word_count.get(word, 0) + 1


In [7]:

# returns the number of training documents that contain the word (1 if the word was never seen)
def count(term):
    """Look up ``term`` in the module-level ``word_count`` table.

    Returns the stored value, or 1 when the term has no entry, so callers
    dividing by the result never divide by zero.
    """
    return word_count.get(term, 1)

# takes document dictionary and creates tfidf dictionary from it
def idf(doc_dic):
    """Weight a term-frequency dictionary by inverse document frequency.

    Each value is multiplied by ``log(len(Xtrain) / count(word))``.

    Parameters
    ----------
    doc_dic : dict
        Word -> term-frequency mapping (as produced by ``tf``).

    Returns
    -------
    dict
        A new dictionary with the same keys and the weighted values.
    """
    total_docs = len(Xtrain)
    return {
        term: freq * math.log(total_docs / count(term))
        for term, freq in doc_dic.items()
    }

In [8]:
# creates an all-encompassing dictionary covering every news type.
# each key is a class label 0-19 (the story type); each value maps words to their combined tf-idf values

def classify():
    """Build the per-class term table from the training split.

    Reads the module-level ``Xtrain`` (documents) and ``Ctrain`` (labels).
    For every training document its tf-idf dictionary is computed with
    ``idf(tf(document))`` and folded into the dictionary of the document's
    class: a word seen for the first time in that class stores its tf-idf
    value; a word already present has its stored value multiplied by the
    new one.

    Returns
    -------
    dict
        Maps each class label to a ``{word: combined value}`` dictionary.
        Labels 0-19 are always present (possibly empty); any other label
        found in ``Ctrain`` gets its own entry instead of raising KeyError.
    """
    final = {label: {} for label in range(20)}

    for document, label in zip(Xtrain, Ctrain):
        doc_dict = idf(tf(document))
        class_terms = final.setdefault(label, {})

        # Walk the document's own terms rather than rescanning the whole
        # class vocabulary for each document: same result, without the
        # per-document pass over every word the class has accumulated.
        for key, weight in doc_dict.items():
            if key in class_terms:
                class_terms[key] *= weight
            else:
                class_terms[key] = weight

    return final
        
# Build the per-class term table once; the prediction cell below reads it as `final`.
final = classify()

In [10]:
# go through test data and make guesses for its news type

# Predicted class label for each test document, in the same order as Xtest.
guesses = []

for doc in Xtest:
    doc_dic = idf(tf(doc))

    # A class scores one point for every distinct word of the document that
    # also appears in that class's term table; only membership matters here.
    score = {}
    for classes, class_terms in final.items():
        score[classes] = sum(1 for word in doc_dic if word in class_terms)

    # Highest score wins; on a tie the first class in iteration order is kept.
    guesses.append(max(score, key=score.get))



In [11]:
# compares calculated guesses to true values
# calculates accuracy of correctness

# Tally how many predictions agree with the true labels, position by position.
correct = sum(1 for i in range(len(Ctest)) if Ctest[i] == guesses[i])
wrong = len(Ctest) - correct

# Report accuracy as a percentage of the test set.
accuracy = correct / (correct + wrong)
print(accuracy * 100, "% of test data was identified correctly")

86.42629904559915 % of test data was identified correctly
