# NATURAL LANGUAGE PROCESSING

### Importing libraries

In [1]:
import nltk
import spacy # MAY BE NOT NEEDED
import string
import re # MAY BE NOT NEEDED

## Loading and preparing the files

I have found two ways of doing this so far. The first one does not close the files, so it is much less efficient, but it is simpler and really useful for understanding what we are doing, so I will show both:

### First way (Easy one):

In [2]:
# The encoding is passed explicitly (the original author notes the read
# does not work without it).
# NOTE: nothing in this cell closes these handles; they stay open.
train_file = open('../data/movie_data/full_train.txt', mode='r', encoding='utf8')
test_file = open('../data/movie_data/full_test.txt', mode='r', encoding='utf8')

### Transforming the files into readable ones

`train_file` is an `_io.TextIOWrapper` object, which cannot be indexed (it does not support `train_file[0]`). It can, however, be iterated line by line, so we can build a list of lines from it:

In [3]:
# Iterating a text file yields one line at a time; str.strip removes the
# surrounding whitespace (including the trailing newline) from each line.
reviews_train = list(map(str.strip, train_file))
reviews_test = list(map(str.strip, test_file))

### Printing the list results

Now we will print the first item of each of the new lists, "reviews_train" and "reviews_test". Each item is one complete review, so the length of each list should equal the total number of reviews:

In [4]:
# Show the first review of each split together with the size of the split.
# Adjacent f-string literals replace the backslash continuation that sat
# inside the string and leaked the source indentation into the output.
print(
    f'TRAINING DATA: \n\n    First paragraph:\n\n{reviews_train[0]}'
    f'\n\n Number of training reviews: {len(reviews_train)}'
)
print('\n')
print(
    f'TESTING DATA: \n\n    First paragraph:\n\n{reviews_test[0]}'
    f'\n\n Number of testing reviews: {len(reviews_test)}'
)

TRAINING DATA: 

    First paragraph:

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!                

 Number of training reviews: 25000


TESTING DATA: 

    First paragraph:

I went and saw this movie last night after being coaxed to by 

### Second way (Efficient one):

So in this case, we will create the lists in the same cell by using the `with` statement (a context manager), which closes the file automatically once the block has finished.

In [5]:
# Each `with` block closes its file as soon as the lines have been read.
with open('../data/movie_data/full_train.txt', mode='r', encoding='utf8') as train_file:
    reviews_train = list(map(str.strip, train_file))

with open('../data/movie_data/full_test.txt', mode='r', encoding='utf8') as test_file:
    reviews_test = list(map(str.strip, test_file))

So we just need to print it:

In [6]:
# Show the first review of each split together with the size of the split.
# Adjacent f-string literals replace the backslash continuation that sat
# inside the string and leaked the source indentation into the output.
print(
    f'TRAINING DATA: \n\n    First paragraph:\n\n{reviews_train[0]}'
    f'\n\n Number of training reviews: {len(reviews_train)}'
)
print('\n')
print(
    f'TESTING DATA: \n\n    First paragraph:\n\n{reviews_test[0]}'
    f'\n\n Number of testing reviews: {len(reviews_test)}'
)

TRAINING DATA: 

    First paragraph:

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!                

 Number of training reviews: 25000


TESTING DATA: 

    First paragraph:

I went and saw this movie last night after being coaxed to by 

# Warning!!

### *Run the first approach only to understand how the code works; once you have done that, ignore it and run the second one for better performance.*

# COPIED

In [None]:
# The 'en' shortcut link was removed in spaCy v3, so the English pipeline is
# loaded by its package name instead.
# Install it once with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [None]:
# Read one review per line, stripping the surrounding whitespace.
# The `with` blocks close each file once it has been read; the previous
# `for line in open(...)` form never closed the handles.
with open('../data/movie_data/full_train.txt', 'r', encoding="utf8") as train_file:
    reviews_train = [line.strip() for line in train_file]

with open('../data/movie_data/full_test.txt', 'r', encoding="utf8") as test_file:
    reviews_test = [line.strip() for line in test_file]

In [None]:
# Display the first training review as a quick check of the loaded data.
reviews_train[0]

In [None]:


# Raw strings keep the regex escapes (\[, \], \s, \-, \/) intact. The
# previous non-raw literals relied on Python passing unknown string escapes
# through unchanged, which emits a warning on current Python versions.

# Punctuation characters that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# Doubled HTML line breaks, hyphens and slashes, which become a single space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def preprocess_reviews(reviews):
    """Lower-case every review and strip punctuation and line-break markup.

    Each review is lower-cased, the characters matched by REPLACE_NO_SPACE
    are removed, and every match of REPLACE_WITH_SPACE is replaced by one
    space. The removal runs first; it leaves '<', '>' and '/' untouched, so
    the '<br /><br />' markers are still intact for the second step.

    Args:
        reviews: iterable of review strings.

    Returns:
        A new list with one cleaned string per input review, in input order.
    """
    cleaned = []
    for line in reviews:
        without_punctuation = REPLACE_NO_SPACE.sub("", line.lower())
        cleaned.append(REPLACE_WITH_SPACE.sub(" ", without_punctuation))
    return cleaned

# Clean both splits with the same routine, training split first.
reviews_train_clean, reviews_test_clean = (
    preprocess_reviews(split) for split in (reviews_train, reviews_test)
)

In [None]:
# Display the first cleaned training review.
reviews_train_clean[0]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records only whether a word occurs in a review, not how often.
# The vocabulary is learned from the cleaned training reviews alone and is
# then applied unchanged to both splits.
cv = CountVectorizer(binary=True).fit(reviews_train_clean)
X, X_test = (
    cv.transform(docs) for docs in (reviews_train_clean, reviews_test_clean)
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split, and therefore the accuracies
# printed below, are the same on every run.
RANDOM_STATE = 42

# Label 1 for the first 12,500 reviews and 0 for the remaining 12,500.
# NOTE(review): presumably the data file lists all positive reviews first,
# then all negative ones -- confirm against the dataset.
target = [1 if i < 12500 else 0 for i in range(25000)]

# Hold out 25% of the training matrix for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=RANDOM_STATE
)

# Compare several values of C (inverse regularisation strength) on the
# validation split.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))

# Accuracies recorded from an earlier run made without a fixed seed:
#     Accuracy for C=0.01: 0.87472
#     Accuracy for C=0.05: 0.88368
#     Accuracy for C=0.25: 0.88016
#     Accuracy for C=0.5: 0.87808
#     Accuracy for C=1: 0.87648

In [None]:
# Refit on the full training matrix with C=0.05, then score the test matrix.
# NOTE(review): `target` was built for the training set and is reused here as
# the test labels -- this assumes the test file has the same size and label
# order; confirm.
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)
test_predictions = final_model.predict(X_test)
print("Final Accuracy: %s" % accuracy_score(target, test_predictions))

In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); fall back to the old name only on
# releases that predate the new one.
_get_feature_names = (
    getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
)

# Map every vocabulary word to its coefficient in the fitted model.
# Converting to plain str/float keeps the printed tuples free of NumPy
# scalar wrappers.
feature_to_coef = {
    str(word): float(coef)
    for word, coef in zip(_get_feature_names(), final_model.coef_[0])
}

# Five words with the largest (most positive) coefficients.
for best_positive in sorted(
        feature_to_coef.items(),
        key=lambda item: item[1],
        reverse=True)[:5]:
    print(best_positive)

# Output recorded from an earlier run:
#     ('excellent', 0.9288812418118644)
#     ('perfect', 0.7934641227980576)
#     ('great', 0.675040909917553)
#     ('amazing', 0.6160398142631545)
#     ('superb', 0.6063967799425831)

# Five words with the smallest (most negative) coefficients.
for best_negative in sorted(
        feature_to_coef.items(),
        key=lambda item: item[1])[:5]:
    print(best_negative)

# Output recorded from an earlier run:
#     ('worst', -1.367978497228895)
#     ('waste', -1.1684451288279047)
#     ('awful', -1.0277001734353677)
#     ('poorly', -0.8748317895742782)
#     ('boring', -0.8587249740682945)