In [1]:
# open csv file
import pandas as pd
train = pd.read_csv('../Data/IMDBsentiment/IMDBsentiment/Train.csv')
test = pd.read_csv('../Data/IMDBsentiment/IMDBsentiment/Test.csv')
valid = pd.read_csv('../Data/IMDBsentiment/IMDBsentiment/Valid.csv')

# show first 5 rows
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [2]:
# remove all characters except a-z, A-Z, 0-9, äöüÄÖÜß, ., !, ?
# remove double spaces
# convert to lower case
import re

def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9äöüÄÖÜß.,!?]', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    text = text.lower()
    return text

In [3]:
train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)
valid['text'] = valid['text'].apply(clean_text)

# show first 5 rows
train.head()

Unnamed: 0,text,label
0,i grew up b. 1965 watching and loving the thun...,0
1,"when i put this movie in my dvd player, and sa...",0
2,why do people who do not know what a particula...,0
3,even though i have great interest in biblical ...,0
4,im a die hard dads army fan and nothing will e...,1


In [4]:
# split the data into features and labels
X_train, y_train = train['text'], train['label']
X_test, y_test = test['text'], test['label']
X_valid, y_valid = valid['text'], valid['label']

In [None]:
# Implement a binary text classifier using Multinomial Naive Bayes. You are allowed to use the classes from the scikitlearn library.
import sklearn

# create a pipeline that transforms the text into a vector representation and trains a classifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('vect', CountVectorizer()),
                        ('clf', MultinomialNB()),
])

# train the classifier with grid search
from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__analyzer': ['word'],
    'vect__ngram_range':[(2, 2)],
    'vect__max_df': [0.5],
    'vect__max_features': [None],
    'clf__alpha': [1],
}

# best parameters: {'clf__alpha': 1, 'vect__max_df': 0.5, 'vect__max_features': None}


grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# print the best parameters
print(grid_search.best_params_)
print(grid_search.best_score_)

# best model
text_clf = grid_search.best_estimator_

# predict the sentiment of the test set
y_pred = text_clf.predict(X_test)

# print the accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

# print the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

# print the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

{'clf__alpha': 1, 'vect__analyzer': 'word', 'vect__max_df': 0.5, 'vect__max_features': None, 'vect__ngram_range': (2, 2)}
0.8900750000000001
0.8996
[[2259  236]
 [ 266 2239]]
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      2495
           1       0.90      0.89      0.90      2505

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



In [None]:
# Experiment with different expressions of the Bag-of-Words (BOW). These include the discrete values of the absolute word frequencies and the TF-IDF scores of the words. Consider the complete training set training set as a corpus and the respective reviews as individual documents. as individual documents. You are allowed to use the TF-IDF Vectorizer methods to be used.

# create a pipeline that transforms the text into a vector representation and trains a classifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('vect', TfidfVectorizer()),
                        ('chi2', SelectKBest(chi2)),
                        ('clf', MultinomialNB()),
])

# train the classifier with grid search
from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__analyzer': ['word'],
    'vect__ngram_range':[(2, 2)],
    'vect__max_df': [0.5],
    'vect__max_features': [None],
    'clf__alpha': [1],
    'chi2__k': [ 5000 ]
}

# best parameters: {'chi2__k': 5000, 'clf__alpha': 1, 'vect__analyzer': 'word', 'vect__max_df': 0.5, 'vect__max_features': None, 'vect__ngram_range': (2, 2)}

grid_search = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [None]:
# print the best parameters
print(grid_search.best_params_)
print(grid_search.best_score_)

# best model
text_clf = grid_search.best_estimator_

# predict the sentiment of the test set
y_pred = text_clf.predict(X_test)

# print the accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

# print the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

# print the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [55]:
# python -m spacy download en_core_web_sm
import spacy

# POS filter
nlp = spacy.load('en_core_web_sm')
X_NOUN = []
X_ADJ = []
X_ADV = []
for text in X_train:
    doc = nlp(text)
    X_NOUN.append(' '.join([token.text for token in doc if token.pos_ in ['NOUN']]))
    X_ADJ.append(' '.join([token.text for token in doc if token.pos_ in ['ADJ']]))
    X_ADV.append(' '.join([token.text for token in doc if token.pos_ in ['ADV']]))

In [57]:
import pickle

with open('./Data/X_NOUN.pkl', 'wb') as f:
    pickle.dump(X_NOUN, f)

with open('./Data/X_ADJ.pkl', 'wb') as f:
    pickle.dump(X_ADJ, f)

with open('./Data/X_ADV.pkl', 'wb') as f:
    pickle.dump(X_ADV, f)

# load the data
# with open('X_NOUN.pkl', 'rb') as f:
#     X_NOUN = pickle.load(f)

# with open('X_ADJ.pkl', 'rb') as f:
#     X_ADJ = pickle.load(f)

# with open('X_ADV.pkl', 'rb') as f:
#     X_ADV = pickle.load(f)


print(X_NOUN[:1], X_ADJ[:1], X_ADV[:1], sep='\n')

['thunderbirds mates school thunderbirds school lunch school scott one art form children movie glimpse child point theme tune score thunderbirds mornings television channel reruns series wife jonatha frakes directors chair version waste film remake marionettes homo subsp error judgment']
['virgil disappointing only high snappy original early hopeless utter acceptable sapiens huge']
['bitterly thankfully still completely']


In [60]:
# Check if using a POS tag filter gives better results (e.g. only nouns, adjectives and adverbs). results (e.g. only nouns, adjectives and adverbs). The spaCy library can be used for this purpose

# create a pipeline that transforms the text into a vector representation and trains a classifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin



text_clf = Pipeline([('vect', TfidfVectorizer()),
                        ('chi2', SelectKBest(chi2)),
                        ('clf', MultinomialNB()),
])

# train the classifier with grid search
from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__analyzer': ['word'],
    'vect__ngram_range':[(2, 2)],
    'vect__max_df': [0.3, 0.5, 0.7],
    'vect__max_features': [None, 5000],
    'clf__alpha': [0.1, 0.5, 1],
    'chi2__k': [ 2000, 5000, 7500 ]
}

grid_search_NOUN = grid_search_ADJ = grid_search_ADV = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
grid_search_NOUN.fit(X_NOUN, y_train)
grid_search_ADJ.fit(X_ADJ, y_train)
grid_search_ADV.fit(X_ADV, y_train)

# best model for each POS tag
text_clf_NOUN = grid_search_NOUN.best_estimator_
text_clf_ADJ = grid_search_ADJ.best_estimator_
text_clf_ADV = grid_search_ADV.best_estimator_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


45 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Programme\GitKrakenRepos\WK_2626_Applied_Natural_Language_Processing\venv_nlp\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Programme\GitKrakenRepos\WK_2626_Applied_Natural_Language_Processing\venv_nlp\Lib\site-packages\sklearn\pipeline.py", line 402, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Programme\GitKrakenRepos\WK_2626_Applied_Natural_Language_Processing\venv_nlp\Lib\site-packages\sklearn\pipeline

Fitting 5 folds for each of 54 candidates, totalling 270 fits


45 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Programme\GitKrakenRepos\WK_2626_Applied_Natural_Language_Processing\venv_nlp\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Programme\GitKrakenRepos\WK_2626_Applied_Natural_Language_Processing\venv_nlp\Lib\site-packages\sklearn\pipeline.py", line 402, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Programme\GitKrakenRepos\WK_2626_Applied_Natural_Language_Processing\venv_nlp\Lib\site-packages\sklearn\pipeline

Fitting 5 folds for each of 54 candidates, totalling 270 fits


45 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Programme\GitKrakenRepos\WK_2626_Applied_Natural_Language_Processing\venv_nlp\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Programme\GitKrakenRepos\WK_2626_Applied_Natural_Language_Processing\venv_nlp\Lib\site-packages\sklearn\pipeline.py", line 402, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Programme\GitKrakenRepos\WK_2626_Applied_Natural_Language_Processing\venv_nlp\Lib\site-packages\sklearn\pipeline

In [62]:
# best parameters for each POS tag
print(grid_search_NOUN.best_params_)
print(grid_search_ADJ.best_params_)
print(grid_search_ADV.best_params_)

{'chi2__k': 7500, 'clf__alpha': 1, 'vect__analyzer': 'word', 'vect__max_df': 0.3, 'vect__max_features': None, 'vect__ngram_range': (2, 2)}
{'chi2__k': 7500, 'clf__alpha': 1, 'vect__analyzer': 'word', 'vect__max_df': 0.3, 'vect__max_features': None, 'vect__ngram_range': (2, 2)}
{'chi2__k': 7500, 'clf__alpha': 1, 'vect__analyzer': 'word', 'vect__max_df': 0.3, 'vect__max_features': None, 'vect__ngram_range': (2, 2)}


In [61]:
# compare the accuracy of the models
y_pred_NOUN = text_clf_NOUN.predict(X_NOUN)
y_pred_ADJ = text_clf_ADJ.predict(X_ADJ)
y_pred_ADV = text_clf_ADV.predict(X_ADV)



from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# print the accuracy
# print the confusion matrix

print('NOUN')
print(accuracy_score(y_train, y_pred_NOUN))
print(confusion_matrix(y_train, y_pred_NOUN), '\n')

print('ADJ')
print(confusion_matrix(y_train, y_pred_ADJ))
print(accuracy_score(y_train, y_pred_ADJ), '\n')

print('ADV')
print(accuracy_score(y_train, y_pred_ADV))
print(confusion_matrix(y_train, y_pred_ADV), '\n')


NOUN
0.500475
[[20017     2]
 [19979     2]] 

ADJ
[[19376   643]
 [19139   842]]
0.50545 

ADV
0.778075
[[16675  3344]
 [ 5533 14448]] 

