In [1]:
from __future__ import print_function

import csv
import codecs
import sys
import string
import time
import numpy as np
import matplotlib.pyplot as plt

from collections import defaultdict
from random import shuffle
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.classify import SklearnClassifier
from nltk.util import ngrams
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import ensemble
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.mixture import GaussianMixture



[nltk_data] Downloading package punkt to /home/junyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/junyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/junyi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def parseReview(line):
    """Pick the fields of interest out of one parsed row of the review file.

    Parameters
    ----------
    line : sequence
        One row as produced by the csv reader; indices 0, 1, 2, 3, 4 and 8
        are read, so the row must have at least nine fields.

    Returns
    -------
    tuple
        ``(line[0], line[2], line[3], line[4], line[8], label)`` where
        ``label`` is ``"fake"`` when ``line[1]`` equals ``"__label1__"`` and
        ``"real"`` for any other value. ``loadData`` unpacks this tuple as
        (Id, rating, verified_purchase, product_category, text, Label).
    """
    label = "fake" if line[1] == "__label1__" else "real"
    review_id, rating, verified, category, body = (
        line[0], line[2], line[3], line[4], line[8]
    )
    return (review_id, rating, verified, category, body, label)

def preProcess(text):
    """Turn a review into a list of lemmatized bigram and unigram tokens.

    Steps: delete ASCII punctuation, split on whitespace, lowercase, drop
    English stop words, lemmatize with WordNet, then build bigrams from the
    surviving tokens.

    Parameters
    ----------
    text : unicode (Python 2) or str (Python 3)
        The review text. Must be a text string, not bytes, because the
        punctuation table below is keyed by code point.

    Returns
    -------
    list
        All bigrams (each joined with a single space) followed by all
        unigrams, in document order. Empty input yields an empty list.
    """
    # Text-string translate() looks characters up by code point, so the keys
    # must be ord() values; a table keyed by the characters themselves matches
    # nothing and leaves the punctuation in place. None means "delete".
    punctuation_table = {ord(ch): None for ch in string.punctuation}
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    lemmatized_tokens = []
    # split() without an argument also discards the empty tokens that
    # split(" ") produces for repeated or leading/trailing whitespace.
    for w in text.translate(punctuation_table).split():
        # Lowercase before the stop-word test: the stop-word list is
        # lowercase, so "The" would otherwise slip through.
        w = w.lower()
        if w not in stop_words:
            lemmatized_tokens.append(lemmatizer.lemmatize(w))

    # Build the bigrams once, after all unigrams are known, instead of
    # rebuilding the whole list on every loop iteration.
    bigram_tokens = [' '.join(pair) for pair in nltk.bigrams(lemmatized_tokens)]
    return bigram_tokens + lemmatized_tokens

def loadData(path, rawData):
    """Read a tab-separated review file and append each processed review to rawData.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, the review text (field 4 of ``parseReview``'s result) is
    run through ``preProcess`` and the resulting tokens are joined with
    single spaces into one string.

    Parameters
    ----------
    path : str
        Location of the tab-delimited file.
    rawData : list
        Extended in place with one string per review.

    Returns
    -------
    None
        Results are delivered through ``rawData`` only.
    """
    # Python 2's csv module consumes byte strings, Python 3's consumes text;
    # open the file the way the running interpreter's csv module expects.
    if str is bytes:
        handle = open(path)
    else:
        handle = open(path, encoding='utf-8', newline='')

    with handle as f:
        reader = csv.reader(f, delimiter='\t')
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        for line in reader:
            # csv yields [] for blank lines, which parseReview cannot index.
            if not line:
                continue
            text = parseReview(line)[4]
            # Only Python 2 hands us bytes here; preProcess needs text.
            if isinstance(text, bytes):
                text = text.decode('utf-8')
            rawData.append(' '.join(preProcess(text)))

def processData(rawData):
    """Shuffle the documents and return their dense TF-IDF matrix.

    Parameters
    ----------
    rawData : list
        Documents as strings. The list is shuffled IN PLACE, so the caller's
        list is reordered as a side effect of this call.

    Returns
    -------
    array
        The result of ``fit_transform`` converted with ``toarray()``; rows
        follow the shuffled order of ``rawData``.
    """
    shuffle(rawData)
    tfidf = TfidfVectorizer(stop_words='english')
    sparse_features = tfidf.fit_transform(rawData)
    return sparse_features.toarray()

In [3]:
# featureDict hands out 0 for keys it has not seen yet; rawData is filled in
# place by loadData with one processed string per review.
featureDict = defaultdict(int)
rawData = list()
loadData('reviews_4000.txt', rawData)

In [4]:
# Shuffle the loaded reviews (in place) and build their dense TF-IDF matrix.
X = processData(rawData)

In [None]:
# Convergence study: for every model factory, fit on growing prefixes of X
# with an increasing EM iteration budget and plot the achieved fit quality.
clusters = [
    # (estimator class, init_params value, extra constructor kwargs)
    # NOTE(review): 'full' stores an n_features x n_features covariance per
    # component; with a TF-IDF vocabulary this can be very slow and memory
    # hungry. Consider 'diag' if the cell does not finish.
    (GaussianMixture, 'kmeans', {'covariance_type': 'full'}),
]
sample_size = [100, 300, 900, 1800, 4000]
max_iters = np.array([1, 125, 250, 500, 750, 875, 1000])
clf_error = {}
starttime = time.time()
for factory, init, params in clusters:
    print('Evaluate of {} with {} init'.format(factory.__name__, init))
    # One figure and one legend list per model, so curves of different
    # models never pile up on the same axes.
    fig, ax = plt.subplots()
    legends = []
    for size in sample_size:
        # processData shuffled the rows, so a prefix is a random subsample.
        # The original fitted on all of X regardless of `size`.
        X_subset = X[:size]
        lower_bounds = np.empty(len(max_iters))
        for i, max_iter in enumerate(max_iters):
            model = factory(n_components=2, init_params=init, max_iter=max_iter,
                            n_init=2, **params).fit(X_subset)
            # GaussianMixture has no `inertia_` (that is a KMeans attribute);
            # `lower_bound_` is the log-likelihood lower bound of the best fit.
            lower_bounds[i] = model.lower_bound_
        ax.plot(max_iters, lower_bounds)
        legends.append("{} with size = {}".format(factory.__name__, size))
    ax.set_title(factory.__name__)
    ax.set_xlabel('iteration')
    ax.set_ylabel('log-likelihood lower bound')
    ax.legend(legends)
    fig.savefig(factory.__name__+'2')
print('running time is {} s'.format(time.time()-starttime))

Evaluate of GaussianMixture with kmeans init
