In [1]:
import pandas as pd
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [2]:
# Fetch the NLTK resources this notebook relies on. Packages that are already
# present are reported as up-to-date instead of being downloaded again.
# English stop-word list read through nltk.corpus.stopwords.
nltk.download('stopwords')
# WordNet data; presumably what WordNetLemmatizer looks lemmas up in.
nltk.download('wordnet')
# English perceptron tagger model; presumably the model behind pos_tag.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
# Load only the first 3500 rows of the dataset CSV (presumably a subsample to
# keep tagging/lemmatization time manageable -- TODO confirm).
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the file exists at exactly this location.
df = pd.read_csv('/workspaces/NLP-sesion/IMDBDataset.csv', nrows=3500)

In [4]:
def data_cleaning(text_list, stopwords_rem=False):
    """Lowercase, tokenize and lemmatize every text in ``text_list``.

    Each text is lowercased, split into tokens with ``TweetTokenizer``,
    tagged with ``pos_tag`` and every token is lemmatized with
    ``WordNetLemmatizer``. The lemmas of one text are joined back into a
    single space-separated string.

    Parameters
    ----------
    text_list : iterable of str
        Raw texts to clean (for example a list or a pandas Series).
    stopwords_rem : bool, default False
        When True, lemmas that appear in NLTK's English stop-word list are
        dropped. The default keeps every token, which is what the function
        did while this flag was a hard-coded local set to False.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order.
    """
    # Load the stop-word corpus only when filtering is requested; a set makes
    # the per-token membership test O(1) instead of a linear scan of a list.
    stopwords_en = set(stopwords.words('english')) if stopwords_rem else frozenset()
    lemmatizer = WordNetLemmatizer()
    tokenizer = TweetTokenizer()

    reconstructed_list = []
    for each_text in text_list:
        # Lowercase first so that tagging and lemmatization see normalized tokens.
        tokens = tokenizer.tokenize(each_text.lower())
        lemmatized_tokens = []
        # pos_tag yields (token, tag) pairs; the tag prefix picks the
        # part of speech handed to the lemmatizer.
        for each_token, tag in pos_tag(tokens):
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                # Every tag that is neither noun nor verb is lemmatized as an adjective.
                pos = 'a'
            lemmatized_token = lemmatizer.lemmatize(each_token, pos)
            if stopwords_rem and lemmatized_token in stopwords_en:
                continue
            lemmatized_tokens.append(lemmatized_token)
        reconstructed_list.append(' '.join(lemmatized_tokens))
    return reconstructed_list

In [5]:
# Preprocessing pipeline: `data_cleaning` normalizes the raw text, then the
# TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer turns it into a
# matrix with one row per text and one column per feature (unigram or bigram).
estimators = [
    ('cleaner', FunctionTransformer(data_cleaning)),
    ('vectorizer', TfidfVectorizer(max_features=100000, ngram_range=(1, 2))),
]
preprocessing_pipeline = Pipeline(estimators)

X = df['review']
y = df['sentiment']

# Fixed seed: without it the train/test split -- and therefore every metric
# computed from it -- changes on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit the pipeline on the training split only, so the TF-IDF vocabulary and
# weights are never learned from the test texts.
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)

In [6]:
# Train a logistic-regression classifier on the TF-IDF training matrix.
# max_iter=1000 sets the solver's iteration cap (presumably raised to avoid
# convergence warnings on this sparse, high-dimensional input -- TODO confirm).
model = LogisticRegression(max_iter=1000)
model.fit(X_train_transformed, y_train)

In [7]:
# Use transform (not fit_transform) so the test texts are encoded with the
# pipeline exactly as it was fitted on the training split.
X_test_transformed=preprocessing_pipeline.transform(X_test)
# Predicted sentiment label for each test review.
y_pred = model.predict(X_test_transformed)

### Results

In [8]:
# Accuracy of the predictions against the true test labels, printed as a
# percentage with two decimals.
acuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acuracy * 100:.2f}%')

Accuracy: 84.00%


In [9]:
# Per-class precision, recall, F1-score and support on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.83      0.84      0.84       426
    positive       0.85      0.84      0.84       449

    accuracy                           0.84       875
   macro avg       0.84      0.84      0.84       875
weighted avg       0.84      0.84      0.84       875



In [35]:
# Sanity check on a hand-written review: it must be wrapped in a list because
# the pipeline's cleaner iterates over a collection of texts.
test_sentence = ["Sexist, creepy garbage. The writer was obviously obsessed with breasts! Full of locker room jokes. Feels extremely dated."]
# Encode with the already-fitted pipeline, then classify with the trained model.
test_sentence_tfidf  = preprocessing_pipeline.transform(test_sentence)
predictions = model.predict(test_sentence_tfidf)

print(predictions)

['negative']
