In [None]:
pip install tensorflow



In [None]:
pip install keras



In [None]:
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

In [None]:
NB_WORDS = 10000  # Parameter indicating the number of words we'll put in the dictionary
VAL_SIZE = 1000  # Size of the validation set
NB_START_EPOCHS = 20  # Number of epochs we usually start to train with
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
a = pd.read_csv("finaldata.csv", engine='python')
a.head()

Unnamed: 0,Reviews,label
0,\t\t\t\t\t\t\tSamsung you love\t\t\t\t\t\t\t\t...,positive
1,\t\t\t\t\t\t\tMeanwhile apple introducing dark...,neautral
2,\t\t\t\t\t\t\tA fantastic pohne u\t\t\t\t\t\t\...,positive
3,\t\t\t\t\t\t\tHow much in Bangladesh?\t\t\t\t\...,positive
4,\t\t\t\t\t\t\tHow much GB storage does it have...,positive


In [None]:
import nltk
nltk.download('stopwords')
#from nltk.corpus import stopwords
#stopwords.words('english')
def remove_stopwords(input_text):
  stopwords_list = stopwords.words('english')
  whitelist = ["n't", "not", "no"]
  words = input_text.split() 
  clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
  return " ".join(clean_words) 
    
def remove_mentions(input_text):
  return re.sub(r'@\w+', '', input_text)
       
a.Reviews = a.Reviews.apply(remove_stopwords).apply(remove_mentions)
a.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Reviews,label
0,Samsung love,positive
1,Meanwhile apple introducing darkmode iPhones????,neautral
2,fantastic pohne,positive
3,How much Bangladesh?,positive
4,How much GB storage have?,positive


In [None]:
x_train, x_test, y_train, y_test = train_test_split(a.Reviews, a.label, test_size=0.1, random_state=37)
print('# Train data samples:', x_train.shape[0])
print('# Test data samples:', x_test.shape[0])
assert x_train.shape[0] == y_train.shape[0]
assert x_test.shape[0] == y_test.shape[0]

# Train data samples: 58651
# Test data samples: 6517


In [None]:
tk = Tokenizer(num_words=NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tk.fit_on_texts(x_train)

In [None]:
print('Fitted tokenizer on {} documents'.format(tk.document_count))
print('{} words in dictionary'.format(tk.num_words))
print('Top 5 most common words are:', collections.Counter(tk.word_counts).most_common(5))

Fitted tokenizer on 58651 documents
10000 words in dictionary
Top 5 most common words are: [('samsung', 12356), ('youtube', 10978), ('com', 9176), ('www', 7056), ('channel', 7052)]


In [None]:
x_train_seq = tk.texts_to_sequences(x_train)
x_test_seq = tk.texts_to_sequences(x_test) 
print('"{}" is converted into {}'.format(x_train[0], x_train_seq[0])) 

"Samsung love" is converted into [214, 118, 79, 1571, 140, 542, 2138]


In [None]:
def one_hot_seq(seqs, nb_features = NB_WORDS):
    ohs = np.zeros((len(seqs), nb_features))
    for i, s in enumerate(seqs):
        ohs[i, s] = 1.
    return ohs

x_train_oh = one_hot_seq(x_train_seq)
x_test_oh = one_hot_seq(x_test_seq)

print('"{}" is converted into {}'.format(x_train_seq[0], x_train_oh[0]))
print('For this example we have {} features with a value of 1.'.format(x_train_oh[0].sum()))

"[214, 118, 79, 1571, 140, 542, 2138]" is converted into [0. 0. 0. ... 0. 0. 0.]
For this example we have 7.0 features with a value of 1.


In [None]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

print('"{}" is converted into {}'.format(y_train[0], y_train_le[0]))
print('"{}" is converted into {}'.format(y_train_le[0], y_train_oh[0]))

"positive" is converted into 0
"0" is converted into [1. 0. 0.]


In [None]:
print(x_train_oh)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]


In [None]:
print(y_train_le)

[0 0 2 ... 1 0 1]


In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(x_train_oh, y_train_le)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [None]:
y_pred_dr = classifier.predict(x_test_oh)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test_le, y_pred_dr)
print(cm)
accuracy_score(y_test_le, y_pred_dr)

[[3621   27   69]
 [  45  790   92]
 [  93   87 1693]]


0.9366272824919442

In [None]:
from sklearn.svm import SVC
svc_ml = SVC(kernel = 'linear', random_state = 0)
svc_ml.fit(x_train_oh, y_train_le)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_ml = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn_ml.fit(x_train_oh, y_train_le)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
y_pred_knn = knn_ml.predict(x_test_oh)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test_le, y_pred_knn)
print(cm)
accuracy_score(y_test_le, y_pred_knn)

[[3702    3   12]
 [ 354  535   38]
 [ 723  100 1050]]


0.8112628510050637

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfcc = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
rfcc.fit(x_train_oh, y_train_le)
y_pred_rfcc = rfcc.predict(x_test_oh)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test_le, y_pred_rfcc)
print(cm)
accuracy_score(y_test_le, y_pred_rfcc)

[[3665   11   41]
 [ 108  753   66]
 [ 160  114 1599]]


0.9232775817093755