In [221]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
import nltk
import string
from nltk.tokenize import ToktokTokenizer
import re 
import spacy

In [16]:
print(os.listdir("./input"))

imdb = pd.read_csv('./input/imdb_labelled.txt', sep ='\t', header = None)

imdb.columns = ["sentence",  "sentiment"]

amazon = pd.read_csv('./input/amazon_cells_labelled.txt', sep ='\t', header = None)

amazon.columns = ["sentence",  "sentiment"]

yelp = pd.read_csv('./input/yelp_labelled.txt', sep ='\t', header = None)

yelp.columns = ["sentence",  "sentiment"]





['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']


In [117]:
#Strip functions

tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# custom: removing words from list
stopword_list.remove('not')

# call function

def remove_punctuation(text):
    text = ''.join([c for c in text if c not in string.punctuation])
    return text

# function to remove stopwords
def remove_stopwords(text):
    # convert sentence into token of words
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    # check in lowercase 
    t = [token for token in tokens if token.lower() not in stopword_list]
    text = ' '.join(t)    
    return text

# function to remove special characters
def remove_extra_whitespace_tabs(text):
    #pattern = r'^\s+$|\s+$'
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', text).strip()

# function to remove special characters
def to_lowercase(text):
    return text.lower()

# function to remove numbers
def remove_numbers(text):
    # define the pattern to keep
    pattern = r'[^a-zA-z.,!?/:;\"\'\s]' 
    return re.sub(pattern, '', text)

nlp = spacy.load("en_core_web_sm")

# function to remove special characters
def get_lem(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text


In [208]:

print(imdb[:10])
X = imdb['sentence'].map(remove_extra_whitespace_tabs)
X = X.map(remove_stopwords)
X = X.map(get_lem)
X = X.map(to_lowercase)
print(X[:10])

tfidf = TfidfVectorizer()
X = X
y = imdb['sentiment']
X = tfidf.fit_transform(X)




                                            sentence  sentiment
0  A very, very, very slow-moving, aimless movie ...          0
1  Not sure who was more lost - the flat characte...          0
2  Attempting artiness with black & white and cle...          0
3       Very little music or anything to speak of.            0
4  The best scene in the movie was when Gerardo i...          1
5  The rest of the movie lacks art, charm, meanin...          0
6                                Wasted two hours.            0
7  Saw the movie today and thought it was a good ...          1
8                               A bit predictable.            0
9  Loved the casting of Jimmy Buffet as the scien...          1
0    , , slow - move , aimless movie distress , dri...
1    not sure lose - flat character audience , near...
2    attempt artiness black & amp ; white clever ca...
3                        little music anything speak .
4    good scene movie gerardo try find song keep ru...
5    rest movie lack 

In [251]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

clf = LinearSVC()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.93      0.67      0.78        84
           1       0.69      0.94      0.79        66

    accuracy                           0.79       150
   macro avg       0.81      0.80      0.79       150
weighted avg       0.83      0.79      0.79       150



In [200]:

print(amazon[:10])
X = amazon['sentence'].map(remove_extra_whitespace_tabs)
X = X.map(remove_stopwords)
#X = X.map(remove_punctuation)
X = X.map(to_lowercase)
print(X[:10])

tfidf = TfidfVectorizer(ngram_range = (1,4))
X = X
y = amazon['sentiment']
X = tfidf.fit_transform(X)

                                            sentence  sentiment
0  So there is no way for me to plug it in here i...          0
1                        Good case, Excellent value.          1
2                             Great for the jawbone.          1
3  Tied to charger for conversations lasting more...          0
4                                  The mic is great.          1
5  I have to jiggle the plug to get it to line up...          0
6  If you have several dozen or several hundred c...          0
7        If you are Razr owner...you must have this!          1
8                Needless to say, I wasted my money.          0
9                   What a waste of money and time!.          0
0                    way plug us unless go converter .
1                        good case , excellent value .
2                                      great jawbone .
3    tied charger conversations lasting 45 minutes....
4                                          mic great .
5       jiggle plug g

In [217]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

clf = LinearSVC()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.93      0.67      0.78        84
           1       0.69      0.94      0.79        66

    accuracy                           0.79       150
   macro avg       0.81      0.80      0.79       150
weighted avg       0.83      0.79      0.79       150



In [194]:

print(yelp[:10])
X = yelp['sentence'].map(remove_extra_whitespace_tabs)
X = X.map(remove_punctuation)
#X = X.map(remove_stopwords)
X = X.map(get_lem)
X = X.map(to_lowercase)
print(X[:10])

tfidf = TfidfVectorizer()
X = X
y = yelp['sentiment']
X = tfidf.fit_transform(X)

                                            sentence  sentiment
0                           Wow... Loved this place.          1
1                                 Crust is not good.          0
2          Not tasty and the texture was just nasty.          0
3  Stopped by during the late May bank holiday of...          1
4  The selection on the menu was great and so wer...          1
5     Now I am getting angry and I want my damn pho.          0
6              Honeslty it didn't taste THAT fresh.)          0
7  The potatoes were like rubber and you could te...          0
8                          The fries were great too.          1
9                                     A great touch.          1
0                                  wow love this place
1                                    crust be not good
2              not tasty and the texture be just nasty
3    stop by during the late may bank holiday off r...
4    the selection on the menu be great and so be t...
5            now i be

In [220]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

clf = LinearSVC()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.67      0.78        84
           1       0.69      0.94      0.79        66

    accuracy                           0.79       150
   macro avg       0.81      0.80      0.79       150
weighted avg       0.83      0.79      0.79       150

