In [43]:
import os
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
import nltk
import string
from nltk.tokenize import ToktokTokenizer
import re 
import spacy
from sklearn.model_selection import GridSearchCV

In [2]:
print(os.listdir("./input"))

imdb = pd.read_csv('./input/imdb_labelled.txt', sep ='\t', header = None)

imdb.columns = ["sentence",  "sentiment"]

amazon = pd.read_csv('./input/amazon_cells_labelled.txt', sep ='\t', header = None)

amazon.columns = ["sentence",  "sentiment"]

yelp = pd.read_csv('./input/yelp_labelled.txt', sep ='\t', header = None)

yelp.columns = ["sentence",  "sentiment"]





['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']


In [3]:
#Strip functions

tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# custom: removing words from list
stopword_list.remove('not')

# call function

def remove_punctuation(text):
    text = ''.join([c for c in text if c not in string.punctuation])
    return text

# function to remove stopwords
def remove_stopwords(text):
    # convert sentence into token of words
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    # check in lowercase 
    t = [token for token in tokens if token.lower() not in stopword_list]
    text = ' '.join(t)    
    return text

# function to remove special characters
def remove_extra_whitespace_tabs(text):
    #pattern = r'^\s+$|\s+$'
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', text).strip()

# function to remove special characters
def to_lowercase(text):
    return text.lower()

# function to remove numbers
def remove_numbers(text):
    # define the pattern to keep
    pattern = r'[^a-zA-z.,!?/:;\"\'\s]' 
    return re.sub(pattern, '', text)

nlp = spacy.load("en_core_web_sm")

# function to remove special characters
def get_lem(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text


In [13]:


X = imdb['sentence'].map(remove_extra_whitespace_tabs)
X = X.map(remove_stopwords)
X = X.map(get_lem)
X = X.map(to_lowercase)


tfidf = TfidfVectorizer()
X = X
y = imdb['sentiment']
X = tfidf.fit_transform(X)




In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

clf = LinearSVC()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.93      0.67      0.78        84
           1       0.69      0.94      0.79        66

    accuracy                           0.79       150
   macro avg       0.81      0.80      0.79       150
weighted avg       0.83      0.79      0.79       150



In [11]:


X = amazon['sentence'].map(remove_extra_whitespace_tabs)
X = X.map(remove_stopwords)
#X = X.map(remove_punctuation)
X = X.map(to_lowercase)


tfidf = TfidfVectorizer(ngram_range = (1,4))
X = X
y = amazon['sentiment']
X = tfidf.fit_transform(X)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

clf = LinearSVC(C=10)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.83      0.88      0.85        97
           1       0.88      0.83      0.85       103

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200



In [17]:


X = yelp['sentence'].map(remove_extra_whitespace_tabs)
X = X.map(remove_punctuation)
#X = X.map(remove_stopwords)
X = X.map(get_lem)
X = X.map(to_lowercase)


tfidf = TfidfVectorizer()
X = X
y = yelp['sentiment']
X = tfidf.fit_transform(X)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

clf = LinearSVC()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85        97
           1       0.88      0.83      0.85       103

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200



In [48]:
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['poly'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},]

print(sklearn.metrics.SCORERS.keys())

scores = ['f1']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='%s_macro' % score
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei