In [2]:
import pandas as pd
import numpy as np
import pickle
import nltk
#nltk.download()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [3]:
path = 'dataset/trainingdata.csv'
data = pd.read_csv(path)
data = data.dropna()

In [4]:
data.head()

Unnamed: 0,sentiments,sentences
0,-1.0,I can't understand the method of teaching
1,1.0,The instructor was interested in the students ...
2,-1.0,The instructor don't use any examples for expl...
3,0.0,Teaching is good but always late to class
4,-1.0,Explaination is poor


In [5]:
# nltk.download('stopwords')
# data['sentences'] = data['sentences'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# #Removing Punctuation, Symbols
# data['sentences'] = data['sentences'].str.replace('[^\w\s]',' ')

# #Removing Stop Words using NLTK
# from nltk.corpus import stopwords
# stop = stopwords.words('english')
# data['sentences'] = data['sentences'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [6]:
# # Lemmatisation
# !pip install textblob
# nltk.download('textblob')
# nltk.download('wordnet')
# from textblob import Word
# data['sentences'] = data['sentences'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# #Correcting Letter Repetitions
# import re
# def de_repeat(text):
#     pattern = re.compile(r"(.)\1{2,}")
#     return pattern.sub(r"\1\1", text)

# #%%
# data['sentences'] = data['sentences'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

In [7]:
# extract the labels from the train data
y = data.sentiments.values

# use 80% for the training and 20% for the test
x_train, x_test, y_train, y_test = train_test_split(data.sentences.values, y, 
                                                    stratify=y, 
                                                    random_state=1, 
                                                    test_size=0.2, shuffle=True)

In [8]:
# Extracting TF-IDF parameters
from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
# xtrain_tfidf = tfidf.fit_transform(x_train)
# xtest_tfidf = tfidf.fit_transform(x_test)

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [10]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word',
                             max_features=2000)
tfidf_vect.fit(data['sentences'])
xtrain_tfidf =  tfidf_vect.transform(x_train)
xtest_tfidf =  tfidf_vect.transform(x_test)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', 
                                   ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(data['sentences'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(x_train)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(x_test)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',
                                         ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(data['sentences'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(x_train) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(x_test) 

In [11]:
# Model 1: Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(xtrain_tfidf, y_train)
y_pred_nb = nb.predict(xtest_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score(y_pred_nb, y_test))

naive bayes tfidf accuracy 0.75564681724846


In [12]:
# Model 2: Linear SVM
from sklearn.linear_model import SGDClassifier
lsvm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42)
lsvm.fit(xtrain_tfidf, y_train)
y_pred_lsvm = lsvm.predict(xtest_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_pred_lsvm, y_test))

svm using tfidf accuracy 0.8501026694045175


In [13]:
# Model 3: logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(xtrain_tfidf_ngram_chars, y_train)
y_pred_logreg = logreg.predict(xtest_tfidf_ngram_chars)
print('log reg tfidf accuracy %s' % accuracy_score(y_pred_logreg, y_test))

log reg tfidf accuracy 0.8316221765913757


In [14]:
# Model 4: Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(xtrain_tfidf_ngram_chars, y_train)
y_pred_rf = rf.predict(xtest_tfidf_ngram_chars)
print('random forest tfidf accuracy %s' % accuracy_score(y_pred_rf, y_test))

random forest tfidf accuracy 0.8459958932238193


In [15]:
#Model 5: SVM classifier
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(xtrain_tfidf, y_train)
y_pred_svm = clf_svm.predict(xtest_tfidf)
print('svm tfidf accuracy %s' % accuracy_score(y_pred_svm, y_test))

svm tfidf accuracy 0.86652977412731


In [16]:
#Model 6: Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier
clf_dec = DecisionTreeClassifier()
clf_dec.fit(xtrain_tfidf, y_train)
y_pred_dec = clf_dec.predict(xtest_tfidf)
print('decision tree tfidf accuracy %s' % accuracy_score(y_pred_dec, y_test))

decision tree tfidf accuracy 0.813141683778234


In [17]:
# pickle.dump(nb,open('Multinomial_Naive_Bayes_Classifier.pkl',"wb"))
# pickle.dump(lsvm,open('SGD_Classifier.pkl',"wb"))
# pickle.dump(logreg,open('Logistic_regression_Classifier.pkl',"wb"))
# pickle.dump(rf,open('Random_forest_Classifier.pkl',"wb"))
# pickle.dump(clf_svm,open('SVM_Classifier.pkl',"wb"))
# pickle.dump(clf_dec,open('Decision_tree_Classifier.pkl',"wb"))


In [18]:
print('naive bayes tfidf accuracy %s' % accuracy_score(y_pred_nb, y_test))
print('svm using tfidf accuracy %s' % accuracy_score(y_pred_lsvm, y_test))
print('logistic reg tfidf accuracy %s' % accuracy_score(y_pred_logreg, y_test))
print('random forest tfidf accuracy %s' % accuracy_score(y_pred_rf, y_test))
print('svm tfidf accuracy %s' % accuracy_score(y_pred_svm, y_test))
print('decision tree tfidf accuracy %s' % accuracy_score(y_pred_dec, y_test))

naive bayes tfidf accuracy 0.75564681724846
svm using tfidf accuracy 0.8501026694045175
logistic reg tfidf accuracy 0.8316221765913757
random forest tfidf accuracy 0.8459958932238193
svm tfidf accuracy 0.86652977412731
decision tree tfidf accuracy 0.813141683778234


In [21]:
def testing(sample):
  sample = pd.DataFrame([sample])
  # sample[0] = sample[0].str.replace('[^\w\s]',' ')
  # stop = stopwords.words('english')
  # sample[0] = sample[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
  # sample[0] = sample[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
  # tfidf_vect.fit(data['sentences'])
  sample =  tfidf_vect.transform(sample[0])
  sample_pred = clf_svm.predict(sample)
  print(sample_pred)

testing("fine")

[1.]
