In [1]:
import numpy as np 
import re 
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files

# Import DataSet

In [2]:
reviews = load_files('data/')
X,y = reviews.data,reviews.target

In [5]:
with open('X.pickle','wb') as f:
    pickle.dump(X,f)

In [6]:
with open('y.pickle','wb') as f:
    pickle.dump(y,f)

# Storing as Pickle Files

In [7]:
with open('X.pickle','rb') as f:
    X = pickle.load(f)

In [8]:
with open('y.pickle','rb') as f:
    y = pickle.load(f)

# Processing The Data

In [9]:
corpus = []
for i in range(0,len(X)):
    review = re.sub(r"\W"," ",str(X[i]))
    review = review.lower()
    review = re.sub(r"\s+[a-zA-Z]\s+"," ",review)
    review = re.sub(r"^[a-z]\s+"," ",review)
    review = re.sub(r"\s+"," ",review)
    corpus.append(review)

# BOW Model

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vetorizer = CountVectorizer(max_features=2000,min_df=3,max_df=0.6,stop_words=stopwords.words('english'))

In [11]:
X = vetorizer.fit_transform(corpus).toarray()

In [12]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [2, 4, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Transform BOW into TF-IDF

In [13]:
from sklearn.feature_extraction.text import  TfidfTransformer

In [14]:
transformer = TfidfTransformer()

In [15]:
X = transformer.fit_transform(X).toarray()

In [16]:
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.06891988, 0.13242906, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.22274622, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

# TfidfVectorizer 

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vectorizer = TfidfVectorizer(max_features=2000,min_df=3,max_df=0.6,stop_words=stopwords.words('english'))

In [19]:
X = vectorizer.fit_transform(corpus).toarray()

In [20]:
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.06891988, 0.13242906, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.22274622, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

# Training Test Split

In [21]:
from sklearn.model_selection import  train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Traning Our Classifier

In [23]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
clf = MultinomialNB()

In [25]:
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Testing Model Performance

In [26]:
y_pred = clf.predict(X_test)

In [27]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [28]:
cm = confusion_matrix(y_test , y_pred)

In [29]:
print('Confusion Matrix')
print(cm)

Confusion Matrix
[[682  42]
 [ 17 294]]


In [30]:
accuracy = accuracy_score(y_test , y_pred)

In [31]:
print('Accuracy')
print(accuracy)

Accuracy
0.9429951690821256


# Using Adaboost

In [34]:
from sklearn.ensemble import  AdaBoostClassifier

In [35]:
ada = AdaBoostClassifier()

In [36]:
ada.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

# Testing AdaBoost

In [38]:
y_pred = ada.predict(X_test)

In [39]:
cm = confusion_matrix(y_test , y_pred)

In [40]:
print('Confusion Matrix')
print(cm)

Confusion Matrix
[[692  32]
 [ 14 297]]


In [41]:
accuracy = accuracy_score(y_test , y_pred)

In [42]:
print('Accuracy')
print(accuracy)

Accuracy
0.9555555555555556


# Saving Our Model

In [44]:
with open("clf.pickle",'wb') as f:
    pickle.dump(clf,f)

In [45]:
with open("ada.pickle",'wb') as f:
    pickle.dump(clf,f)

In [46]:
with open("tfidfmodel.pickle",'wb') as f:
    pickle.dump(vectorizer,f)

 # Using Our Model

In [47]:
with open('clf.pickle','rb') as f:
    clf = pickle.load(f)

In [48]:
with open('ada.pickle','rb') as f:
    ada = pickle.load(f)

In [49]:
with open('tfidfmodel.pickle','rb') as f:
    tfidf = pickle.load(f)

In [50]:
ex = ["You will receive an inconspicuous looking text from 'your bank.'"]

In [51]:
ex = tfidf.transform(ex).toarray()

In [52]:
print(clf.predict(ex))

[1]


In [53]:
print(ada.predict(ex))

[1]
