In [1]:
### IMPORTING DATASET

# We will use the load_files function from the sklearn_datasets library to import the dataset into our application. The load_files function automatically divides the dataset into data and target sets. The load_files will treat each folder inside the "txt_sentoken" folder as one category.

from sklearn.datasets import load_files

movie_data = load_files("../Desktop/GermCor")
x, y = movie_data.data, movie_data.target

# In the code above, the load_files function loads the data from both "neg" and "pos" folders into the x variable, while the target categories are stored in y. Here x is a list of 2000 string type elements where each element corresponds to single user review. Similarly, y is a numpy array of size 2000. If you print y on the screen, you will see an array of 1s and 0s. This is because, for each category, the load_files function adds a number to the target numpy array. We have two categories: "neg" and "pos", therefore 1s and 0s have been added to the target array.


# =====================================================


### TEXT PREPROCESSING

# Once the dataset has been imported, the next step is to preprocess the text. Text may contain numbers, special characters, and unwanted spaces. Depending upon the problem we face, we may or may not need to remove these special characters and numbers from text. However, for the sake of explanation, we will remove all the special characters, numbers, and unwanted spaces from our text.

# First download wordnet:
import nltk
nltk.download("wordnet")

documents = []

import re
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(x)):  
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(x[sen]))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Tokenization
    document = document.split()

    # Lemmatization
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)

# In the code above we use Regex Expressions from Python re library to perform different preprocessing tasks. We start by removing all non-word characters such as special characters, numbers, etc.
# Next, we remove all the single characters. For instance, when we remove the punctuation mark from "David's" and replace it with a space, we get "David" and a single character "s", which has no meaning. To remove such single characters we use \s+[a-zA-Z]\s+ regular expression which substitutes all the single characters having spaces on either side, with a single space.
# Next, we use the \^[a-zA-Z]\s+ regular expression to replace a single character from the beginning of the document, with a single space. Replacing single characters with a single space may result in multiple spaces, which is not ideal.
# We again use the regular expression \s+ to replace one or more spaces with a single space. When you have a dataset in bytes format, the alphabet letter "b" is appended before every string. The regex ^b\s+ removes "b" from the start of a string. The next step is to convert the data to lower case so that the words that are actually the same but have different cases can be treated equally.
# The final preprocessing step is the lemmatization. In lemmatization, we reduce the word into dictionary root form. For instance "cats" is converted into "cat". Lemmatization is done in order to avoid creating features that are semantically similar but syntactically different. For instance, we don't want two different features named "cats" and "cat", which are semantically similar, therefore we perform lemmatization.


# =====================================================


### TEXT TO NUMBERS

# Machines, unlike humans, cannot understand the raw text. Machines can only see numbers. Particularly, statistical techniques such as machine learning can only deal with numbers. Therefore, we need to convert our text into numbers.

# The following code uses the bag of words model to convert text documents into corresponding numerical features:

from sklearn.feature_extraction.text import CountVectorizer
# Download stopwords corpus:
nltk.download('stopwords')
from nltk.corpus import stopwords

vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
x = vectorizer.fit_transform(documents).toarray()

# The code above uses CountVectorizer class from the sklearn.feature_extraction.text library.
# The first parameter is the max_features parameter, which is set to 1500. This is because when you convert words to numbers using the bag of words approach, all the unique words in all the documents are converted into features. All the documents can contain tens of thousands of unique words. But the words that have a very low frequency of occurrence are unusually not a good parameter for classifying documents. Therefore we set the max_features parameter to 1500, which means that we want to use 1500 most occurring words as features for training our classifier.
# The next parameter is min_df and it has been set to 5. This corresponds to the minimum number of documents that should contain this feature. So we only include those words that occur in at least 5 documents. Similarly, for the max_df, feature the value is set to 0.7; in which the fraction corresponds to a percentage. Here 0.7 means that we should include only those words that occur in a maximum of 70% of all the documents. Words that occur in almost every document are usually not suitable for classification because they do not provide any unique information about the document.
# Finally, we remove the stop words from our text since, in the case of sentiment analysis, stop words may not contain any useful information. To remove the stop words we pass the stopwords object from the nltk.corpus library to the stop_wordsparameter.
# The fit_transform function of the CountVectorizer class converts text documents into corresponding numeric features.

# The bag of words approach works fine for converting text to numbers. However, it has one drawback. It assigns a score to a word based on its occurrence in a particular document. It doesn't take into account the fact that the word might also be having a high frequency of occurrence in other documents as well. TFIDF resolves this issue by multiplying the term frequency of a word by the inverse document frequency. The TF stands for "Term Frequency" while IDF stands for "Inverse Document Frequency".
# The term frequency is calculated as: (Number of Occurrences of a word)/(Total words in the document)  
# And the Inverse Document Frequency is calculated as: Log( (Total number of documents)/(Number of documents containing the word) )
# The TFIDF value for a word in a particular document is higher if the frequency of occurrence of that word is higher in that specific document but lower in all the other documents.

# The following code uses TFIDF to convert text documents into corresponding numerical features:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
x = tfidfconverter.fit_transform(documents).toarray()


# =====================================================


### SPLITING DATA INTO TRAINING AND TESTING SETS

# Like any other supervised machine learning problem, we need to divide our data into training and testing sets:

import math
import random
import numpy as np

perc = 0.8
limit = math.floor(len(x)*perc)
subset_indexes = random.sample(range( 0, len(x) ), limit)
train_mask = np.zeros(len(x), np.bool)
train_mask[subset_indexes] = 1
test_mask = np.ones(len(x), np.bool)
test_mask[subset_indexes] = 0

x_train = x[train_mask]
x_test  = x[test_mask]
y_train = y[train_mask]
y_test  = y[test_mask]

# The above code divides data into 20% test set and 80% training set.


# =====================================================


### TRAINING NAIVE BAYES

# To train our machine learning model using the Naive Bayes algorithm we will use GaussianNB class from the sklearn.naive_bayes library. The fit method of this class is used to train the algorithm. We need to pass the training data and training target sets to this method.

from sklearn.naive_bayes import GaussianNB

gnb_classifier = GaussianNB()
gnb_classifier.fit(x_train, y_train)

# Finally, to predict the sentiment for the documents in our test set we can use the predict method of the GaussianNB class as shown below:

y_pred = gnb_classifier.predict(x_test)

# print(y_pred)  # predicted sentiments
# print(y_test)  # real sentiments
# print(y_pred-y_test)  # zeros are correctly classified instances


# =====================================================


### EVALUATION

# We can use confusion_matrix, classification_report, and accuracy_score utilities from the sklearn.metrics library:

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("\n============ Naive Bayes")
print("---- confusion_matrix:")
print(confusion_matrix(y_test,y_pred))
print("---- classification_report:")
print(classification_report(y_test,y_pred))
print("---- accuracy_score:")
print(accuracy_score(y_test,y_pred))


# =====================================================


### SAVING AND LOADING THE MODEL

# In the script above, our machine learning model did not take much time to execute. One of the reasons for the quick training time is the fact that we had a relatively smaller training set. We had 2000 documents, of which we used 80% (1600) for training. However, in real-world scenarios, there can be millions of documents. In such cases, it can take hours or even days (if you have slower machines) to train the algorithms. Therefore, it is recommended to save the model once it is trained.

# We can save our model as a pickle object in Python:

import pickle

with open('text_classifier', 'wb') as picklefile:
    pickle.dump(gnb_classifier, picklefile)
	
# Once you execute the above code, you can see the text_classifier file in your working directory. We have saved our trained model and we can use it later for directly making predictions, without training.

# To load the model, we can use the following code:

with open('text_classifier', 'rb') as training_model:
    gnb_classifier_loaded = pickle.load(training_model)


# =====================================================


### OTHER CLASSIFICATION ALGORITHMS


# Decision Tree

from sklearn.tree import DecisionTreeClassifier

tree_classifier = DecisionTreeClassifier(random_state=0)
tree_classifier.fit(x_train, y_train)

y_pred = tree_classifier.predict(x_test)
print("\n============ Decision Tree")
print("---- confusion_matrix:")
print(confusion_matrix(y_test,y_pred))
print("---- classification_report:")
print(classification_report(y_test,y_pred))
print("---- accuracy_score:")
print(accuracy_score(y_test,y_pred))


# Random Forest

from sklearn.ensemble import RandomForestClassifier

forest_classifier = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
forest_classifier.fit(x_train, y_train)

y_pred = forest_classifier.predict(x_test)
print("\n============ Random Forest")
print("---- confusion_matrix:")
print(confusion_matrix(y_test,y_pred))
print("---- classification_report:")
print(classification_report(y_test,y_pred))
print("---- accuracy_score:")
print(accuracy_score(y_test,y_pred))


# Support Vector Machine

from sklearn.svm import SVC

svm_classifier = SVC(gamma='scale')
svm_classifier.fit(x_train, y_train)

y_pred = svm_classifier.predict(x_test)
print("\n============ Support Vector Machine")
print("---- confusion_matrix:")
print(confusion_matrix(y_test,y_pred))
print("---- classification_report:")
print(classification_report(y_test,y_pred))
print("---- accuracy_score:")
print(accuracy_score(y_test,y_pred))




# =====================================================

### ZADATAK

# Kopirajte nekoliko movie review-ova sa imdb-a, te prediktajte sa nekim od algoritama dali su ti review-ovi pozitivni ili negativni.

r = """This is probably the highest 8/10 I could give for an 8/10 movie.

There are some very good moments in Endgame and that's coming from a long standing MCU fan growing up with these movies. In creating an extensive and emotional journey for a lot of the MCU characters, it absolutely succeeds.

Endgame ultimately is an excellent conclusion to over 10 years of film but I feel the plot's pacing and direction is really lacking to Infinity War, which 'trimmed the fat' and was much more focused and energetic. Granted, Endgame takes risks but this might be a disappointment to some fans with its slower pace that focuses on character development over action and intense sequences. The run time probably should be shorter or streamlined. Less cuts and motion blur during fight scenes would've made it easier to track fights as well.

Respect for the Russo Brothers for taking the direction they did with Endgame and it really does satisfy but Infinity War is overall a more entertaining movie. It's not the best MCU movie but I can't dispute its great conclusion."""


FileNotFoundError: [WinError 3] The system cannot find the path specified: '../Desktop/GermCor'