In [2]:
from scrape import take_the_content, URL_NEG, URL_POS
from pre_process import clean_the_content, make_word_token, make_more_clean, make_n_grams
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import random
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
import store_params


In [3]:
# STEP 1 : Scrape the Content from the URL
# Positive source is fetched first, then the negative one (same order as the labels used later).
pos, neg = (take_the_content(URL = source_url) for source_url in (URL_POS, URL_NEG))

In [4]:
# STEP 2 : PreProcess (Lowering + Sent_tokenize + N-Grams + Lemmatize)
dataText = []      # one [tokens, label] pair per sentence
all_dataText = []  # flat list of every cleaned token, across both labels

for label, scraped in (("pos", pos), ("neg", neg)):
    for sentence in clean_the_content(scraped):
        # Pipeline order: n-grams -> word tokens -> final cleaning pass.
        # (Per the original note, the last step drops stopwords and lemmatizes;
        # the helpers live in pre_process and are not visible here.)
        tokens = make_more_clean(make_word_token(make_n_grams(sentence)))
        dataText.append([tokens, label])
        all_dataText.extend(tokens)
print("STEP 2 Already Done")

STEP 2 Already Done


In [None]:
# Keep only the 3000 most frequent tokens as the feature vocabulary.
# NOTE: this rebinds all_dataText from a flat token list to a list of
# (token, count) pairs, so re-running this cell without re-running STEP 2
# would count the pairs instead of the tokens.
token_frequencies = nltk.FreqDist(all_dataText)
all_dataText = token_frequencies.most_common(3000)

In [9]:
import numpy as np

In [16]:
# Dump the vocabulary (one token per line) to data.txt.
# Opened with "w" rather than "a": the file is a snapshot of the current
# vocabulary, and append mode duplicated every token each time the cell was re-run.
with open("data.txt","w") as f :
    # Each entry is a (token, count) pair; only the token is written.
    f.writelines(entry[0]+"\n" for entry in all_dataText)

In [None]:
# STEP 3 : Process the dataText to become the data that can be trained 
for text_idx in range(len(dataText)) :
    result = dict()
    data_before = set(dataText[text_idx][0])
    for all_key in all_dataText : 
        result[all_key[0]] = all_key[0] in data_before
    dataText[text_idx][0] = result
for _ in range(5) : 
    random.shuffle(dataText)

print("STEP 3 Already Done")

STEP 3 Already Done


In [5]:
# STEP 4 : Split Data Train and Test
# First 80% of the entries go to training, the remainder to testing.
split_at = int(len(dataText)*0.8)
data_train, data_test = dataText[:split_at], dataText[split_at:]

In [6]:
# STEP 5 : Build a Model [FIT DATA INTO MODEL]
print("TRAINING MODE STARTED")
# Baseline: NLTK's own Naive Bayes, trained directly on the [featureset, label] pairs.
naive_bayes = nltk.NaiveBayesClassifier
model_original = naive_bayes.train(data_train)

TRAINING MODE STARTED


In [7]:
# Linear SVM from scikit-learn, wrapped so it trains on the NLTK-style pairs.
linear_svc_wrapper = SklearnClassifier(LinearSVC())
model_linear_svc = linear_svc_wrapper.train(data_train)

In [8]:
# Multinomial Naive Bayes from scikit-learn, wrapped so it trains on the NLTK-style pairs.
mnb_wrapper = SklearnClassifier(MultinomialNB())
model_mnb = mnb_wrapper.train(data_train)

In [9]:
# Bernoulli Naive Bayes from scikit-learn, wrapped so it trains on the NLTK-style pairs.
bnb_wrapper = SklearnClassifier(BernoulliNB())
model_bnb = bnb_wrapper.train(data_train)

In [10]:
# Logistic regression from scikit-learn, wrapped so it trains on the NLTK-style pairs.
lr_wrapper = SklearnClassifier(LogisticRegression())
model_lr = lr_wrapper.train(data_train)

In [11]:
# Linear model fitted with stochastic gradient descent, wrapped for NLTK-style pairs.
# random_state is fixed because SGDClassifier shuffles the training data between
# epochs; without a seed every run fits a different model and reports a different accuracy.
model_sgd = SklearnClassifier(SGDClassifier(random_state=42)).train(data_train)

In [12]:
# STEP 6 : Check the Accuracy of Test Data :
# (report label, trained model) pairs, in the order they are reported.
trained_models = (
    ("Original", model_original),
    ("Linear SVC", model_linear_svc),
    ("MultinomialNB", model_mnb),
    ("BernoulliNB", model_bnb),
    ("Logist Regr", model_lr),
    ("Grad Descent", model_sgd),
)
for model_label, trained_model in trained_models :
    accuracy = nltk.classify.accuracy(trained_model, data_test)
    print(f"Accuracy {model_label} : {accuracy}")

# STEP 6 - Post : Check the Answer of First Data Test
first_featureset = data_test[0][0]  # entry layout is [featureset, label]
prediction = model_original.classify(first_featureset)
print(prediction)

Accuracy Original : 0.7524613220815752
Accuracy Linear SVC : 0.720112517580872
Accuracy MultinomialNB : 0.7487107360525082
Accuracy BernoulliNB : 0.757149554617909
Accuracy Logist Regr : 0.7374589779653071
Accuracy Grad Descent : 0.7257383966244726
neg


In [24]:
# Persist every trained model through the project's SavePickle helper.
# The keyword names are passed to SavePickle unchanged and in the same order.
models_to_persist = dict(
    original = model_original,
    lin_svc = model_linear_svc,
    mnb = model_mnb,
    bnb = model_bnb,
    log_r = model_lr,
    sgd = model_sgd,
)
method = store_params.SavePickle(**models_to_persist).save()

Saved Already


In [28]:
# Print the features the NLTK Naive Bayes model found most informative
# (10 rows, which is the method's default).
model_original.show_most_informative_features(n=10)

Most Informative Features
                  boring = True              neg : pos    =     27.8 : 1.0
                    warm = True              pos : neg    =     18.2 : 1.0
               wonderful = True              pos : neg    =     16.9 : 1.0
                 routine = True              neg : pos    =     16.4 : 1.0
                  stupid = True              neg : pos    =     15.8 : 1.0
                provides = True              pos : neg    =     15.6 : 1.0
              engrossing = True              pos : neg    =     14.2 : 1.0
                touching = True              pos : neg    =     14.1 : 1.0
                    flat = True              neg : pos    =     13.1 : 1.0
                   waste = True              neg : pos    =     13.1 : 1.0
