Positive/Negative Liste bereitgestellt von:
;   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." 
;       Proceedings of the ACM SIGKDD International Conference on Knowledge 
;       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, 
;       Washington, USA, 
;   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing 
;       and Comparing Opinions on the Web." Proceedings of the 14th 
;       International World Wide Web conference (WWW-2005), May 10-14, 
;       2005, Chiba, Japan.

In [192]:
#Load the libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import re
# https://online.stat.psu.edu/stat504/lesson/1/1.7
from sklearn.metrics import classification_report,accuracy_score
import swifter
import os
import warnings

In [15]:
positive_words = pd.read_csv('data/positive-words.txt', skiprows=29, header=None)
positive_words

Unnamed: 0,0
0,a+
1,abound
2,abounds
3,abundance
4,abundant
...,...
2001,youthful
2002,zeal
2003,zenith
2004,zest


In [21]:
negative_words = pd.read_csv('data/negative-words.txt', skiprows=29, header=None)
negative_words

Unnamed: 0,0
0,2-faced
1,2-faces
2,abnormal
3,abolish
4,abominable
...,...
4778,zaps
4779,zealot
4780,zealous
4781,zealously


In [22]:
#importing the training data
imdb_data=pd.read_csv('data/IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [26]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
imdb_data.head(10)
imdb_data['review']=imdb_data['review'].apply(denoise_text)

In [27]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)
imdb_data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive
5,Probably my alltime favorite movie a story of ...,positive
6,I sure would like to see a resurrection of a u...,positive
7,This show was an amazing fresh innovative ide...,negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [None]:
#Stemming the text
'''
Stemming (stem form reduction, normal form reduction) is the term used in information retrieval as well as in linguistic computer science to describe a procedure 
by which different morphological variants of a word are reduced to their common root, e.g. the declension of Wortes or words to Wort and conjugation of "gesehen" or "sah" to "seh". 
'''
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(simple_stemmer)
imdb_data.head(10)

In [29]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [134]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token.lower() for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_stopwords)
imdb_data.head(10)

{'where', 'wouldn', 'didn', 'below', 'with', 'her', 'on', 'herself', 'themselves', 'before', 'only', 'because', "won't", 'to', 'd', 'some', "isn't", 'as', 'needn', 'being', 'under', 'once', 'same', 'hers', 'is', 'those', "should've", 'me', 'she', 'there', 'when', 'so', 'after', 'and', 'such', 'shouldn', "wasn't", 'than', 'ain', 'myself', 'how', 'aren', 'over', 'an', 'our', 'until', 'from', 'why', 'further', 've', 'will', 'mustn', 'theirs', 'by', 'in', "it's", "mustn't", 'what', 'we', "weren't", 'y', 'who', 'been', "that'll", 'm', 'not', 'own', "don't", "she's", 'these', 'o', "you'd", 'don', 'but', 'my', "you'll", 're', 'll', "hadn't", "shan't", 'other', 'of', 'am', 'ourselves', "aren't", "mightn't", "wouldn't", 'whom', 'isn', 'which', 'then', 'them', 'most', "needn't", 'can', 'through', 'against', 'up', 'between', 's', 'ma', 'have', 'their', 'yourself', 'does', 'weren', 'he', 'a', 'ours', 'or', 'had', 'too', 't', "you've", 'no', 'yourselves', 'it', 'yours', 'into', 'be', 'this', 'itsel

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
5,probably alltime favorite movie story selfless...,positive
6,sure would like see resurrection dated seahunt...,positive
7,show amazing fresh innovative idea 70s first a...,negative
8,encouraged positive comments film looking forw...,negative
9,like original gut wrenching laughter like movi...,positive


In [135]:
norm_train_reviews=imdb_data.iloc[:40000]
norm_test_reviews=imdb_data.iloc[40000:]

In [200]:
np.intersect1d(norm_train_reviews.loc[2, 'review'].split(), positive_words.values)

array(['great', 'hot', 'impressed', 'interesting', 'likable', 'right',
       'sexy', 'spirited', 'well', 'witty', 'wonderful'], dtype=object)

In [213]:
def check_sentiment_by_counting(tokens, positive=True, negative=True, return_as_str=False, threshold=0):
    if positive:
        positive_n = len(np.intersect1d(tokens.split(), positive_words.values))
    if negative: 
        negative_n = len(np.intersect1d(tokens.split(), negative_words.values))
    if return_as_str:
        return 'positive' if positive_n - negative_n > threshold else 'negative'
    if positive:
        return positive_n
    if negative:
        return negative_n

def count_positive_negative_words(df):
    positive = df['review'].swifter.apply(check_sentiment_by_counting, positive=True, negative=False)
    negative = df['review'].swifter.apply(check_sentiment_by_counting, positive=False, negative=True)
    print("Positive and Negative Words: ", positive.sum(), negative.sum())
    return positive, negative


In [None]:
positive, negative = count_positive_negative_words(norm_test_reviews)

In [215]:
norm_train_reviews['sentiment_pred'] = norm_train_reviews['review'].swifter.apply(check_sentiment_by_counting, return_as_str=True, threshold=81185/80551)

Pandas Apply: 100%|██████████| 40000/40000 [03:34<00:00, 186.37it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  norm_train_reviews['sentiment_pred'] = norm_train_reviews['review'].swifter.apply(check_sentiment_by_counting, return_as_str=True, threshold=81185/80551)


In [217]:
accuracy_score(norm_train_reviews['sentiment_pred'], norm_train_reviews['sentiment'])

0.727125

In [218]:
norm_train_reviews['sentiment_pred'].value_counts()

negative    23856
positive    16144
Name: sentiment_pred, dtype: int64

In [219]:
#Classification report for tfidf features
lr_tfidf_report=classification_report(norm_train_reviews['sentiment_pred'], norm_train_reviews['sentiment'],target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.82      0.69      0.75     23856
    Negative       0.63      0.78      0.70     16144

    accuracy                           0.73     40000
   macro avg       0.73      0.74      0.72     40000
weighted avg       0.75      0.73      0.73     40000



In [230]:
#importing the training data
imdb_data=pd.read_csv('data/IMDB Dataset.csv')
print(imdb_data.shape)

(50000, 2)


In [236]:
idx = norm_train_reviews[norm_train_reviews['sentiment_pred']!= norm_train_reviews['sentiment']].sample(1).index
imdb_data.iloc[:40000].iloc[idx].values

        'positive']], dtype=object)

In [239]:
norm_train_reviews.iloc[idx].values

        'positive', 'negative']], dtype=object)