In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import KFold,cross_val_score


In [3]:
from datetime import datetime
import nltk

In [4]:
import string

from nltk import word_tokenize
from nltk import pos_tag

from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')

# Translation table that deletes every character in string.punctuation.
punctuation_translator = str.maketrans('', '', string.punctuation)

# English stopwords as a plain list; preprocess_text() filters tokens against it.
nltk_stopwords = [word for word in stopwords.words('english')]


def remove_punctuation(s):
    """Return a copy of `s` with all punctuation characters deleted.

    The characters to drop are defined by the module-level
    `punctuation_translator` table (built from `string.punctuation`).
    """
    stripped = s.translate(punctuation_translator)
    return stripped



def preprocess_text(s, tokenizer=None, remove_stopwords=True, remove_punctuation=True,
                    stemmer=None, lemmatizer=None, lowercase=True, return_type='str'):
    """Tokenize and normalize a text.

    The steps always run in this order:
    tokenize -> lemmatize or stem -> lowercase -> drop stopwords -> strip punctuation.

    Note that stopwords are dropped *before* punctuation is stripped, so a token
    such as "'s" is not matched against the stopword "s" and survives as "s".

    Parameters
    ----------
    s : str
        The raw text.
    tokenizer : object or None
        Object exposing ``tokenize(text)``. When None, ``word_tokenize`` is used.
    remove_stopwords : bool
        Drop tokens found in the module-level ``nltk_stopwords`` list. Matching is
        case-insensitive, so stopwords are removed even when ``lowercase=False``.
    remove_punctuation : bool
        Delete every ``string.punctuation`` character from each token (e.g. "Mr."
        becomes "Mr") and discard tokens that end up empty. This parameter shadows
        the module-level ``remove_punctuation`` function inside this body.
    stemmer : object or None
        Object exposing ``stem(token)``. Mutually exclusive with ``lemmatizer``.
    lemmatizer : object or None
        Object exposing ``lemmatize(token, pos=...)``. Mutually exclusive with ``stemmer``.
    lowercase : bool
        Lowercase all tokens (applied after stemming/lemmatizing).
    return_type : str
        'list' returns the token list, 'set' returns a set of tokens, and any
        other value returns the tokens joined by single spaces.

    Raises
    ------
    ValueError
        If both ``stemmer`` and ``lemmatizer`` are given.
    """
    if stemmer is not None and lemmatizer is not None:
        raise ValueError("Stemmer and Lemmatizer cannot both be not None!")

    # Tokenization either with the default tokenizer or the user-specified one.
    # list(...) guarantees a fresh, mutable list: the stem/lemmatize helpers
    # overwrite items in place, which would fail on a tuple or generator.
    if tokenizer is None:
        token_list = list(word_tokenize(s))
    else:
        token_list = list(tokenizer.tokenize(s))

    # Stem or lemmatize if needed (done before lowercasing so the original
    # casing is still visible to the lemmatizer's POS tagging).
    if lemmatizer is not None:
        token_list = lemmatize_token_list(lemmatizer, token_list)
    elif stemmer is not None:
        token_list = stem_token_list(stemmer, token_list)

    # Convert all tokens to lowercase if needed
    if lowercase:
        token_list = [token.lower() for token in token_list]

    # Remove all stopwords if needed. The stopword list is lowercase, so compare
    # on the lowercased token; otherwise "The" would slip through whenever
    # lowercase=False. A set makes each membership test O(1).
    if remove_stopwords:
        stopword_set = set(nltk_stopwords)
        token_list = [token for token in token_list if token.lower() not in stopword_set]

    # Remove all punctuation marks if needed (note: also converts, e.g., "Mr." to "Mr")
    if remove_punctuation:
        punctuation_chars = set(string.punctuation)
        stripped_tokens = (
            ''.join(char for char in token if char not in punctuation_chars)
            for token in token_list
        )
        # Tokens that consisted only of punctuation are now empty; drop them.
        token_list = [token for token in stripped_tokens if len(token) > 0]

    if return_type == 'list':
        return token_list
    elif return_type == 'set':
        return set(token_list)
    else:
        return ' '.join(token_list)



def stem_token_list(stemmer, token_list):
    """Stem every token and return the list.

    `token_list` is overwritten in place: each item is replaced by
    `stemmer.stem(item)`, and the same list object is returned.
    """
    token_list[:] = [stemmer.stem(token) for token in token_list]
    return token_list


def lemmatize_token_list(lemmatizer, token_list):
    """Lemmatize every token using its part-of-speech tag and return the list.

    The tokens are tagged with `pos_tag`; the first letter of each tag is
    lowercased (e.g. "VBD" becomes "v") and mapped to the `pos` argument passed
    to `lemmatizer.lemmatize`: 'n' and 'v' are used as is, 'j' (adjective tags)
    becomes 'a', and every other tag falls back to 'n'.

    `token_list` is overwritten in place and the same list object is returned.
    """
    # First letter of the tag -> `pos` value handed to the lemmatizer.
    pos_by_tag_initial = {'n': 'n', 'v': 'v', 'j': 'a'}

    for position, (word, tag) in enumerate(pos_tag(token_list)):
        tag_initial = tag[0].lower()
        word_type = pos_by_tag_initial.get(tag_initial, 'n')  # default: treat as noun
        token_list[position] = lemmatizer.lemmatize(word, pos=word_type)
    return token_list



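#
# NOTE: the original comment here said that everything below is executed only
# when the file is run directly and not when it is imported (useful for testing
# the functions). No such code follows in this notebook cell, so this appears to
# be a leftover from a standalone module version of these helpers.
#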


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HARSHITA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Load the positive and negative review files. Each file is read as a
# tab-separated table without a header row; the text lands in column 0.
# NOTE(review): default CSV quoting is active here, so a line that starts with a
# double quote may be parsed as a quoted field — confirm against the raw files.
df_sent_pos, df_sent_neg = (
    pd.read_csv(review_path, sep='\t', header=None)
    for review_path in ('rt-polarity.pos', 'rt-polarity.neg')
)

In [6]:
# Preview the first rows of the positive reviews (single text column named 0).
df_sent_pos.head()

Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."


In [7]:
# Preview the first rows of the negative reviews (single text column named 0).
df_sent_neg.head()

Unnamed: 0,0
0,"simplistic , silly and tedious ."
1,"it's so laddish and juvenile , only teenage bo..."
2,exploitative and largely devoid of the depth o...
3,[garbus] discards the potential for pathologic...
4,a visually flashy but narratively opaque and e...


In [8]:
# Collect all raw sentences in one list: negative reviews first, then positive.
# The label list built later relies on this exact order.
sentences = df_sent_neg[0].tolist() + df_sent_pos[0].tolist()

In [9]:
# First five raw sentences; negatives were appended first, so these are negative reviews.
sentences[0:5]

['simplistic , silly and tedious . ',
 "it's so laddish and juvenile , only teenage boys could possibly find it funny . ",
 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . ',
 '[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . ',
 'a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . ']

In [10]:
# Run every raw sentence through preprocess_text() with its default settings,
# keeping the results in the same order as `sentences`.
sentences_preprocessed = [preprocess_text(raw_sentence) for raw_sentence in sentences]

In [11]:
# Labels aligned with `sentences`: 0 for each negative review, then 1 for each positive one.
polarities = [0] * len(df_sent_neg) + [1] * len(df_sent_pos)

In [12]:
# Convert texts and labels to NumPy arrays.
# Note: `sentences` is rebound here from the raw text list to the *preprocessed* text.
sentences = np.array(sentences_preprocessed)
polarities =np.array(polarities)

In [13]:
# Seed Python's RNG first so the shuffle below is reproducible.
random.seed(1)

# Pair each sentence with its label so that both are shuffled together.
combined = list(zip(sentences, polarities))
random.shuffle(combined)

In [14]:
# Inspect the first three shuffled (preprocessed sentence, label) pairs.
combined[0:3]

[('fine rousing grated family film aimed mainly little kids plenty entertainment value keep grownups squirming seats',
  1),
 ('one look girl tight pants big tits turn stupid um nt basis entire plot', 0),
 ('s scariest guy ll see summer', 1)]

In [15]:
# Split the shuffled pairs back into two sequences and write them into the
# existing arrays via slice assignment, so both arrays are overwritten in place
# with the shuffled order.
sentences[:],polarities[:]=zip(*combined)

In [16]:
# Check that the sentence array now follows the shuffled order.
sentences[0:3]

array(['fine rousing grated family film aimed mainly little kids plenty entertainment value keep grownups squirming seats',
       'one look girl tight pants big tits turn stupid um nt basis entire plot',
       's scariest guy ll see summer'], dtype='<U222')

In [17]:
# Check that the labels were reordered together with the sentences.
polarities[0:3]

array([1, 0, 1])

In [18]:
# Split the (already shuffled) data: the first `train_test_ratio` share is used
# for training and the remainder is held out for testing.
train_test_ratio = 0.8
# Use the ratio variable instead of repeating the literal, so changing
# `train_test_ratio` actually changes the split.
train_set_size = int(train_test_ratio * len(sentences))
X_train, X_test = sentences[:train_set_size], sentences[train_set_size:]
y_train, y_test = polarities[:train_set_size], polarities[train_set_size:]

In [19]:
# Report how many samples ended up in each split.
print(
    'Size of training set is {}'.format(len(X_train)),
    'Size of testing set is {}'.format(len(X_test)),
    sep='\n',
)

Size of training set is 8529
Size of testing set is 2133


In [20]:
# TF-IDF features over unigrams only (ngram_range=(1,1)) for the baseline model.
tfidf_vectorizer=TfidfVectorizer(ngram_range=(1,1))

In [21]:
# Learn the vocabulary and IDF weights from the training split only, and
# vectorize it in the same step.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Rows are training sentences, columns are vocabulary terms.
num_sentences = X_train_tfidf.shape[0]
num_vocab = X_train_tfidf.shape[1]

In [22]:
# Number of training sentences, then vocabulary size, one per line.
print(num_sentences, num_vocab, sep='\n')

8529
17732


In [23]:
# Baseline: logistic regression on the unigram TF-IDF features.
# The 'sag' solver shuffles the data internally, so random_state is fixed to
# make the fitted model (and the scores reported below) reproducible.
lr_classifier = LogisticRegression(solver="sag", random_state=1).fit(X_train_tfidf, y_train)


In [24]:
# Vectorize the test split with the vocabulary/IDF fitted on the training split
# (transform only, so nothing is learned from the test data).
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [25]:
# The column count should equal the training vocabulary size.
X_test_tfidf.shape

(2133, 17732)

In [26]:
# Evaluate the baseline classifier on the held-out test split.
y_pred = lr_classifier.predict(X_test_tfidf)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78      1067
           1       0.78      0.79      0.78      1066

    accuracy                           0.78      2133
   macro avg       0.78      0.78      0.78      2133
weighted avg       0.78      0.78      0.78      2133



In [27]:
# 10-fold cross-validated F1 of logistic regression on the training features.
# random_state pins the data shuffling done by the 'sag' solver so the
# per-fold scores are reproducible across runs.
f1_scores_list = cross_val_score(
    LogisticRegression(solver="sag", random_state=1),
    X_train_tfidf,
    y_train,
    cv=10,
    scoring='f1',
)

In [29]:
# One F1 score per cross-validation fold.
print(f1_scores_list)

[0.7587007  0.74829932 0.73309609 0.74970344 0.75413712 0.75910693
 0.75799087 0.77816492 0.7326969  0.74401914]


In [34]:
# Mean and standard deviation of the per-fold F1 scores, one per line.
print(f1_scores_list.mean(), f1_scores_list.std(), sep='\n')

0.7515915418363927
0.01277275069297799


In [37]:
# Scratch check: how to get an estimator's class name (used to label results in the grid search).
type(LogisticRegression()).__name__

'LogisticRegression'

In [39]:
# Grid search over classifier type and maximum n-gram length.
# Every combination is scored by its mean 10-fold cross-validated F1 on the
# training split; the best combination is remembered for the final model.
best_score = 0
best_classifier = None
best_ngram_size = 0

# random_state is fixed on both estimators: LinearSVC shuffles data in its dual
# solver and LogisticRegression does so with 'sag'. Without a seed, the scores
# and therefore the selected model can differ between runs.
classifiers = [LinearSVC(random_state=1), LogisticRegression(solver="sag", random_state=1)]
ngram_sizes = [1, 2, 3, 4]

for classifier in classifiers:
    for s in ngram_sizes:
        start_time = datetime.now()

        # Features: all n-grams from length 1 up to length s.
        # NOTE(review): the vectorizer is fit on all of X_train before
        # cross-validation, so each validation fold also contributes to the
        # vocabulary/IDF weights. Wrapping vectorizer + classifier in a
        # Pipeline would give a stricter estimate.
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, s))
        X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

        f1_scores_list = cross_val_score(classifier, X_train_tfidf, y_train, cv=10, scoring='f1')
        avg_f1score = f1_scores_list.mean()
        time_elapsed = datetime.now() - start_time
        print('Classifier:{},ngram size:{} ==> f1-score:{:.3f} [{}]'.format(type(classifier).__name__,s,avg_f1score,time_elapsed))

        # Strict '>' keeps the earlier (smaller n-gram) configuration on ties.
        if avg_f1score > best_score:
            best_score = avg_f1score
            best_classifier = classifier
            best_ngram_size = s
            



Classifier:LinearSVC,ngram size:1 ==> f1-score:0.749 [0:00:00.467680]
Classifier:LinearSVC,ngram size:2 ==> f1-score:0.759 [0:00:00.890178]
Classifier:LinearSVC,ngram size:3 ==> f1-score:0.759 [0:00:01.180116]
Classifier:LinearSVC,ngram size:4 ==> f1-score:0.756 [0:00:01.692555]
Classifier:LogisticRegression,ngram size:1 ==> f1-score:0.752 [0:00:00.607652]
Classifier:LogisticRegression,ngram size:2 ==> f1-score:0.750 [0:00:01.403319]
Classifier:LogisticRegression,ngram size:3 ==> f1-score:0.745 [0:00:02.010221]
Classifier:LogisticRegression,ngram size:4 ==> f1-score:0.745 [0:00:02.672036]


In [40]:
# Summary of the grid search: best mean CV F1, winning classifier, winning max n-gram length.
print(best_score)
print(type(best_classifier).__name__)
print(best_ngram_size)

0.7593098459406179
LinearSVC
2


In [41]:
# Rebuild the vectorizer with the best maximum n-gram length; the grid-search
# loop left `tfidf_vectorizer` set to the last size it tried.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,best_ngram_size))

In [43]:
# Fit on the training split, then vectorize the test split with the same
# vocabulary/IDF weights (order matters: transform must follow the fit).
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [45]:
# Train the selected classifier on the full training split and predict the held-out test split.
best_classifier = best_classifier.fit(X_train_tfidf,y_train)
y_pred = best_classifier.predict(X_test_tfidf)

In [48]:
# Final evaluation on the held-out test split: per-class report, then overall accuracy.
print(
    classification_report(y_test, y_pred),
    'Accuracy:{:.3f}'.format(accuracy_score(y_test, y_pred)),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.80      0.76      0.78      1067
           1       0.77      0.80      0.79      1066

    accuracy                           0.78      2133
   macro avg       0.78      0.78      0.78      2133
weighted avg       0.78      0.78      0.78      2133

Accuracy:0.784


In [49]:
# Spot-check: first three predicted labels.
print(y_pred[:3])

[1 1 0]


In [50]:
# Spot-check: first three true labels, for comparison with the predictions.
print(y_test[:3])

[0 0 0]
