In [2]:
import numpy as np 
import re 
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files


# Import DataSet

In [6]:
reviews = load_files('txt_sentoken/')
X,y = reviews.data,reviews.target

# Storing as Pickle Files

In [8]:
with open('X.pickle','wb') as f:
    pickle.dump(X,f)

In [9]:
with open('y.pickle','wb') as f:
    pickle.dump(y,f)

# Unpickling The DataSet

In [10]:
with open('X.pickle','rb') as f:
    X = pickle.load(f)

In [11]:
with open('y.pickle','rb') as f:
    y = pickle.load(f)

# Processing The Data

In [13]:
corpus = []
for i in range(0,len(X)):
    review = re.sub(r"\W"," ",str(X[i]))
    review = review.lower()
    review = re.sub(r"\s+[a-zA-Z]\s+"," ",review)
    review = re.sub(r"^[a-z]\s+"," ",review)
    review = re.sub(r"\s+"," ",review)
    corpus.append(review)

# BOW Model

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
vetorizer = CountVectorizer(max_features=2000,min_df=3,max_df=0.6,stop_words=stopwords.words('english'))

In [16]:
X = vetorizer.fit_transform(corpus).toarray()

In [17]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [2, 0, 1, ..., 0, 0, 0]], dtype=int64)

# Transform BOW into TF-IDF

In [18]:
from sklearn.feature_extraction.text import  TfidfTransformer

In [19]:
transformer = TfidfTransformer()

In [22]:
X = transformer.fit_transform(X).toarray()

In [23]:
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04815829, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.14153766, 0.        , 0.07844951, ..., 0.        , 0.        ,
        0.        ]])

# TfidfVectorizer 

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [68]:
vectorizer = TfidfVectorizer(max_features=2000,min_df=3,max_df=0.6,stop_words=stopwords.words('english'))

In [69]:
X = vectorizer.fit_transform(corpus).toarray()

In [70]:
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06887219, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12007883, 0.        , 0.06321361, ..., 0.        , 0.        ,
        0.        ]])

# Training Test Split

In [45]:
from sklearn.model_selection import  train_test_split

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Traning Our Classifier

In [57]:
from sklearn.linear_model import LogisticRegression

In [58]:
clf = LogisticRegression()

In [59]:
clf.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Testing Model Performance

In [60]:
y_pred = clf.predict(X_test)

In [61]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [62]:
cm = confusion_matrix(y_test , y_pred)

In [63]:
print('Confusion Matrix')
print(cm)

Confusion Matrix
[[152  38]
 [ 32 178]]


In [64]:
accuracy = accuracy_score(y_test , y_pred)

In [65]:
print('Accuracy')
print(accuracy)

Accuracy
0.825


# Saving Our Model

In [66]:
with open("clf.pickle",'wb') as f:
    pickle.dump(clf,f)

In [71]:
with open("tfidfmodel.pickle",'wb') as f:
    pickle.dump(vectorizer,f)

 # Using Our Model

In [81]:
with open('clf.pickle','rb') as f:
    clf = pickle.load(f)

In [82]:
with open('tfidfmodel.pickle','rb') as f:
    tfidf = pickle.load(f)

In [83]:
ex = ["You are a nice person man, have a good life"]

In [84]:
ex = tfidf.transform(ex).toarray()

In [85]:
print(clf.predict(ex))

[1]


In [86]:
ex = ["You are a bad person man, have a bad life"]

In [87]:
ex = tfidf.transform(ex).toarray()

In [88]:
print(clf.predict(ex))

[0]
