# Libraries and functions

In [None]:
# CSV files read with pandas further down: the development set (has a 'class'
# column) and the evaluation set whose labels are predicted at the end.
DEVELOPMENT_FILE = "./dataset/development.csv"
EVALUATION_FILE = "./dataset/evaluation.csv"

# JSON lists of Italian given names; each entry is read through its 'name' key.
MALE_NAMES_FILE = "./dataset/ITGivenMale.json"
FEMALE_NAMES_FILE = "./dataset/ITGivenFemale.json"

import re
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
from sklearn.decomposition import TruncatedSVD
import random
import time
import pickle
import string

# Tokenization
from sklearn.feature_extraction.text import TfidfVectorizer
# from stop_words import get_stop_words
from string import punctuation

from nltk.corpus import stopwords

# Plot
from wordcloud import WordCloud

# Spacy
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
# Ask spaCy to use the GPU when one is available.
spacy.prefer_gpu()
# Italian pipeline loaded with the tagger, parser and NER components disabled.
nlp = spacy.load('it_core_news_sm', disable=["tagger", "parser", "ner"])
# The sentencizer provides the sentence boundaries read later through `doc.sents`.
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# Stand-alone tokenizer used by `tokenizer()` below.
spacy_tokenizer = nlp.Defaults.create_tokenizer(nlp)

# Download the NLTK resources; the Italian stop-word list used below comes from 'stopwords'.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
def printElapsedTime(starting_time, task = ''):
    """Print the time elapsed since `starting_time` as ``Elapsed: MM:SS``.

    Parameters
    ----------
    starting_time : float
        Timestamp previously obtained from ``time.time()``.
    task : str, optional
        Label appended as `` - task: <task>``; nothing is appended when empty.
    """
    minutes, seconds = divmod(time.time() - starting_time, 60)
    suffix = f' - task: {task}' if task != '' else ''
    print(f'Elapsed: {int(minutes):02d}:{int(seconds):02d}{suffix}')

def getFeatures(data, tfidf, features = None, verbose=True):
    """Build the model input: the TF-IDF matrix plus the enabled extra columns.

    Which columns are added is controlled by the module-level switches
    (DIM_REDUCTION, TEXT_LENGTH, SENTENCES_CNT, FIRST_NAMES_CNT,
    EXCLAMATION_MARKS_CNT, QUESTION_MARKS_CNT, PUNCTUATION_CNT, WORD_TYPES_CNT).

    Parameters
    ----------
    data : mapping/DataFrame
        Its 'text' entry is iterated for the name and word-type counts.
    tfidf : matrix
        Vectorizer output; `.toarray()` is called on it unless SVD is selected.
    features : dict, optional
        Pre-computed per-document lists keyed by 'text_len', 'sentences_cnt',
        'exclam_mark_cnt', 'question_mark_cnt' and 'punctuation_cnt'. Only
        read when the corresponding switch is enabled.
    verbose : bool
        When true, print the time spent in each stage.

    Returns
    -------
    pandas.DataFrame
        One row per document; TF-IDF columns first, extra columns after.
    """
    start_time = time.time()
    # NOTE(review): the reducers are fitted again on every call, so two calls
    # (e.g. development and evaluation data) do not share one projection.
    if DIM_REDUCTION:
        if DIM_REDUCTION == 'SVD':
            svd = TruncatedSVD(n_components = 10000, n_iter = 10, random_state = 42)
            tfidf = svd.fit_transform(tfidf)
        elif DIM_REDUCTION == 'PCA':
            from sklearn.decomposition import SparsePCA
            pca = SparsePCA(n_components = 5000)
            tfidf = pca.fit_transform(tfidf.toarray())
    else:
        tfidf = tfidf.toarray()

    if verbose:
        printElapsedTime(start_time, 'dim reduction: ' + str(DIM_REDUCTION))

    start_time = time.time()
    X = pd.DataFrame(tfidf)
    if verbose:
        printElapsedTime(start_time, 'generate DataFrame')

    # Text length
    if TEXT_LENGTH:
        X['text_len'] = features['text_len']

    # Sentences count
    if SENTENCES_CNT:
        X['sentences_cnt'] = features['sentences_cnt']

    # Person names
    start_time = time.time()
    # Usually, on good reviews (and when they feel at home), people tend to write name of the staff
    # Despite this initial idea, names tend to be more frequent on negative reviews
    if FIRST_NAMES_CNT:
        # Number of entries of italian_firstnames occurring as a substring of
        # the text. Built as one column instead of one `iloc` write per row.
        X['names_cnt'] = np.array(
            [sum(name in text for name in italian_firstnames) for text in data['text']],
            dtype=float,
        )

    if verbose:
        printElapsedTime(start_time, 'person names')

    # Number of exclamation marks
    if EXCLAMATION_MARKS_CNT:
        X['exclam_mark_cnt'] = features['exclam_mark_cnt']

    # Number of question marks
    if QUESTION_MARKS_CNT:
        X['question_mark_cnt'] = features['question_mark_cnt']

    # Punctuation count
    if PUNCTUATION_CNT:
        X['punctuation_cnt'] = features['punctuation_cnt']

    # Share of each word type among the tokens of a document
    start_time = time.time()
    if WORD_TYPES_CNT:
        word_types = ['PART', 'SCONJ', 'DET', 'PUNCT', 'INTJ', 'ADV', 'SYM', 'NOUN', 'PROPN', 'NUM', 'ADP', 'X', 'VERB', 'AUX', 'CONJ', 'SPACE', 'ADJ', 'PRON']
        ratios = {word_type: [] for word_type in word_types}

        for text in data['text']:
            counts = dict.fromkeys(word_types, 0)
            tokens_number = 0
            for token in getNlpFromText(text):
                tokens_number += 1
                # Tags outside `word_types` still count as tokens but get no
                # column of their own (previously they raised KeyError).
                if token.pos_ in counts:
                    counts[token.pos_] += 1

            for word_type in word_types:
                # A document without tokens gets 0.0 instead of dividing by zero.
                ratios[word_type].append(
                    counts[word_type] / tokens_number if tokens_number else 0.0
                )

        for word_type in word_types:
            X['cnt_' + word_type] = ratios[word_type]

    if verbose:
        printElapsedTime(start_time, 'number of word types')

    return X

In [None]:
def lemmatizer(text):
    """Return the lemma of every token in `text`, stripped of punctuation and spaces.

    The substitutions in `replace_before_lemmatization` are applied first, in
    the order the dict lists them. Tokens whose lemma consists only of
    punctuation/spaces come back as empty strings.
    """
    for needle, replacement in replace_before_lemmatization.items():
        text = text.replace(needle, replacement)

    strip_table = str.maketrans('', '', string.punctuation + ' ')
    return [str(token.lemma_).translate(strip_table) for token in getNlpFromText(text)]

def tokenizer(text):
    """Split `text` with the spaCy tokenizer and return the tokens as strings."""
    return [str(token) for token in spacy_tokenizer(text)]

WORD = re.compile(r'\w+')
def regTokenizer(text):
    """Return every maximal run of word characters in `text`, in order of appearance."""
    return [match.group() for match in WORD.finditer(text)]

# Restore RAM

In [None]:
# Keep the in-memory cache of parsed documents when the cell is re-run;
# otherwise try to restore it from disk and fall back to an empty cache.
try:
    nlp_documents
    print("nlp_documents is already initialized")
except NameError:
    start_time = time.time()
    try:
        # NOTE: unpickling can execute arbitrary code; only load a cache file
        # that was written by this notebook.
        with open('./cache/nlp_documents.p', 'rb') as fp:
            nlp_documents = pickle.load(fp)
    except FileNotFoundError:
        nlp_documents = {}
        print("nlp_documents.p not found")
    except Exception as exc:
        # Unreadable/corrupt cache: stay best-effort, but say what happened.
        nlp_documents = {}
        print(f"nlp_documents.p could not be loaded ({exc!r}); starting with an empty cache")
    else:
        # Reported outside the guarded section so that a failure here cannot
        # throw away a cache that was loaded successfully.
        printElapsedTime(start_time, 'load nlp documents')

def getNlpFromText(text):
    """Return the spaCy document for `text`, parsing it only on first use.

    Parsed documents are memoised in the module-level `nlp_documents` dict,
    keyed by the raw text.
    """
    if text not in nlp_documents:
        nlp_documents[text] = nlp(text)
    return nlp_documents[text]

# Data

### Italian names

In [None]:
import json

# Load Italian names.
# The encoding is stated explicitly: JSON is UTF-8, while open() would
# otherwise fall back to the platform default.
with open(MALE_NAMES_FILE, 'r', encoding='utf-8') as f:
    data_male_names = json.load(f)
with open(FEMALE_NAMES_FILE, 'r', encoding='utf-8') as f:
    data_female_names = json.load(f)
# Lower-cased 'name' field of every entry; the combined list is what
# getFeatures() scans when FIRST_NAMES_CNT is enabled.
male_names = [entry['name'].lower() for entry in data_male_names]
female_names = [entry['name'].lower() for entry in data_female_names]
italian_firstnames = male_names + female_names

### Reviews

In [None]:
"""Given data"""
development_data = pd.read_csv(DEVELOPMENT_FILE)
evaluation_data = pd.read_csv(EVALUATION_FILE)

"""External data"""
# Use data from external datasets
USE_EXTERNAL_DATA = True

# If USE_EXTERNAL_DATA, the original proportion between positive and negative has to be maintained?
MAINTAIN_PROPORTION = True

# 50% pos, 50% neg
BALANCE_REVIEWS_PERFECTLY = False

if USE_EXTERNAL_DATA:
    external_data = pd.read_csv('./dataset/external_reviews.csv') # 50% pos, 50% neg
    dev_pos_prop = development_data[development_data['class'] == 'pos'].shape[0] / development_data.shape[0]
    dev_neg_prop = development_data[development_data['class'] == 'neg'].shape[0] / development_data.shape[0]

    # Labels 40/50 are taken as positive, 10/20 as negative; other labels are dropped.
    external_pos = external_data[(external_data['label'] == 50) | (external_data['label'] == 40)]
    external_pos = external_pos.drop(columns=['label'])
    external_pos = external_pos.rename(columns={'review':'text'})
    external_pos['class'] = 'pos'

    external_neg = external_data[(external_data['label'] == 10) | (external_data['label'] == 20)]
    external_neg = external_neg.drop(columns=['label'])
    external_neg = external_neg.rename(columns={'review':'text'})
    external_neg['class'] = 'neg'

    # I should only add negative reviews such that the proportion is maintained
    if MAINTAIN_PROPORTION:
        # All external positives are added, so keeping the development neg/pos
        # ratio requires ratio * len(external_pos) negatives. The count is
        # capped because sample(n=...) cannot draw more rows than exist.
        neg_to_add = int((dev_neg_prop / dev_pos_prop) * len(external_pos))
        neg_to_add = min(neg_to_add, len(external_neg))
    else:
        neg_to_add = len(external_neg)

    # Merge and shuffle (sample(frac=1) returns all rows in random order).
    development_data = pd.concat([development_data, external_neg.sample(n=neg_to_add), external_pos],ignore_index=True).sample(frac=1)

if BALANCE_REVIEWS_PERFECTLY:
    neg_reviews = development_data[development_data['class'] == 'neg'].sample(frac=1)
    pos_reviews = development_data[development_data['class'] == 'pos'].sample(frac=1)

    # Truncate the larger class to the size of the smaller one.
    min_len = min(neg_reviews.shape[0], pos_reviews.shape[0])
    neg_reviews = neg_reviews[:min_len]
    pos_reviews = pos_reviews[:min_len]

    development_data = pd.concat([neg_reviews,pos_reviews],ignore_index=True).sample(frac=1)

In [None]:
features_to_normalize = ['text_len', 'exclam_mark_cnt', 'question_mark_cnt', 'punctuation_cnt']
# 'sentences_cnt' is collected as well, so it needs its own list (it used to be
# missing, which made the first append fail); it is not z-scored below.
collected_features = features_to_normalize + ['sentences_cnt']
features = {
    'development': {
        feature: [] for feature in collected_features
    },
    'evaluation': {
        feature: [] for feature in collected_features
    }
}

punctuation_chars = set(punctuation)

# Same per-document statistics for both datasets, in dataset row order.
for split_name, dataset in (('development', development_data), ('evaluation', evaluation_data)):
    start_time = time.time()
    split_features = features[split_name]
    for text in dataset['text']:
        split_features['text_len'].append(len(text))
        split_features['exclam_mark_cnt'].append(text.count("!"))
        split_features['question_mark_cnt'].append(text.count("?"))
        split_features['punctuation_cnt'].append(sum(char in punctuation_chars for char in text))
        split_features['sentences_cnt'].append(len(list(getNlpFromText(text).sents)))
    printElapsedTime(start_time, split_name)

# Normalize values: z-score computed over development + evaluation together
start_time = time.time()
for feature in features_to_normalize:
    dev_count = len(features['development'][feature])
    values = np.array(features['development'][feature] + features['evaluation'][feature])
    spread = values.std()
    # A constant feature has zero spread; use zeros rather than dividing by 0.
    z = (values - values.mean()) / spread if spread else np.zeros(len(values))

    features['development'][feature] = list(z[:dev_count])
    features['evaluation'][feature] = list(z[dev_count:])
printElapsedTime(start_time, 'normalization')

### Stop words

In [None]:
# Words dropped from / added to the NLTK Italian stop-word list.
words_to_remove_from_stopwords = ['non', 'più', 'sei', 'no']
words_to_add_to_stopwords = ['essere', 'avere']

# Literal substitutions applied by lemmatizer() before parsing, in this order.
replace_before_lemmatization = {
    '’': "'",
    'wi fi': 'wifi',
    'wi - fi': 'wifi',
    'wi-fi': 'wifi',
    "week-end": "weekend",
    'week end': 'weekend',
    'check in': 'checkin',
    'check out': 'checkout',
    'check-in': 'checkin',
    'check-out': 'checkout',
    'check - in': 'checkin',
    'check - out': 'checkout',
    '\n': ' ',
    "a'": 'à',
    "e'": 'è',
    "é": 'è',
    "i'": 'ì',
    "o'": 'ò',
    "u'": 'ù',

    " €": "€",
    " euro": "€",
    "euro": "€",
}

# Final, de-duplicated stop-word list: NLTK list minus removals plus additions.
stop_words = list(
    (set(stopwords.words('italian')) - set(words_to_remove_from_stopwords))
    | set(words_to_add_to_stopwords)
)

# Vectorization

In [None]:
# TF-IDF settings; every option not listed here keeps the scikit-learn default.
vectorizer_settings = dict(
    tokenizer = lemmatizer,
    stop_words = stop_words,
    ngram_range = (1, 3),
    max_df = 0.45,
    min_df = 2,
    max_features = 35000,  # best: 40000
)
vectorizer = TfidfVectorizer(**vectorizer_settings)

# Learn the vocabulary on the development texts and vectorize them.
start_time = time.time()
tfidf_X = vectorizer.fit_transform(development_data['text'])
printElapsedTime(start_time, 'fit-transform')

# Plot vectorization results

In [None]:
terms = vectorizer.get_feature_names()

# Total TF-IDF weight of each term over all documents (one row, one column per term).
sums = tfidf_X.sum(axis=0)

# Pair every term with its total weight.
data_to_plot = [(term, sums[0, col]) for col, term in enumerate(terms)]

ranking = pd.DataFrame(data_to_plot, columns=['term', 'rank'])
ranking = ranking.sort_values('rank', ascending=False)

ranking.head(100)

In [None]:
# Term -> total TF-IDF weight, used as the word-cloud frequencies.
wc_freq = dict(zip(ranking['term'], ranking['rank']))
wordcloud = WordCloud().generate_from_frequencies(wc_freq)

# Display the generated image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Features

In [None]:
# Switches read by getFeatures().
DIM_REDUCTION = False  # False, 'SVD' (TruncatedSVD) or 'PCA' (SparsePCA)
TEXT_LENGTH = False  # add the 'text_len' column
FIRST_NAMES_CNT = False  # add 'names_cnt' (Italian first names found in the text)
EXCLAMATION_MARKS_CNT = False  # add 'exclam_mark_cnt'
QUESTION_MARKS_CNT = False  # add 'question_mark_cnt'
SENTENCES_CNT = False  # add 'sentences_cnt'
PUNCTUATION_CNT = False  # add 'punctuation_cnt'
WORD_TYPES_CNT = False  # add one 'cnt_<POS>' ratio column per word type

In [None]:
from sklearn.preprocessing import label_binarize

start_time = time.time()
# Feature matrix for the development set: TF-IDF plus the enabled extra columns.
X = getFeatures(development_data, tfidf_X, features['development'])
# Target labels, taken from the 'class' column.
y = development_data['class']
# y = label_binarize(['pos'], classes=y)[0]

printElapsedTime(start_time, "get all features")

# Normalization

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every column of X; the fitted scaler stays available as `scaler`.
scaler = StandardScaler()
start_time = time.time()
X_norm = scaler.fit_transform(X)
printElapsedTime(start_time)
# Wrap the scaled array in a DataFrame again (column labels become 0..n-1).
X = pd.DataFrame(X_norm)
X

# Hyperparameters tuning

In [None]:
from hyperopt import hp, fmin, tpe, space_eval, Trials
# Imported here so the cell also works when run before the "Test classifier"
# cell, which is where these names were first imported.
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
TEXT_LENGTH = False

# Random subset of the development data to keep each trial fast.
speedy_data = development_data.sample(n=5000)
def objective(args):
    """Hyperopt objective: 1 - weighted F1 of a LinearSVC on a TF-IDF model.

    `args` provides 'max_df', 'min_df' and 'max_features' for the vectorizer.
    The returned loss is 1 - (0.3 * F1_neg + 0.7 * F1_pos), measured on a
    random 30% hold-out of `speedy_data`.
    """
    vectorizer = TfidfVectorizer(
        strip_accents = None,
        lowercase = True,
        preprocessor = None,
        tokenizer = lemmatizer,
        analyzer = 'word', # callable - extract the sequence of features out of the raw, unprocessed input
        stop_words = stop_words,
        ngram_range = (1, 5),
        max_df = args['max_df'],
        min_df = args['min_df'],
        max_features = int(args['max_features']),
        norm = 'l2', # l2
        use_idf = True,
        sublinear_tf = False,
    )
    tfidf_X = vectorizer.fit_transform(speedy_data['text'])
    # NOTE(review): features['development'] describes the full development set,
    # not this 5000-row sample; it only matters if an extra-feature switch is on.
    X = getFeatures(speedy_data, tfidf_X, features['development'], verbose=False)
    y = speedy_data['class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    clf = LinearSVC(
        penalty='l2', # l1, l2
        loss='squared_hinge', # hinge, squared_hinge
        dual=False, # Prefer dual=False when n_samples > n_features
        tol=1e-4,
        C=1.0,
        multi_class='ovr',
        fit_intercept=True,
        intercept_scaling=100,
        class_weight=None,
        verbose=0,
        random_state=42,
        max_iter=1000
    )
    clf.fit(X_train, y_train)

    y_test_pred = clf.predict(X_test)

    # Label order is pinned so f[0] is always 'neg' and f[1] always 'pos'.
    p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred, labels=['neg', 'pos'])

    f1_neg = f[0]
    f1_pos = f[1]

    weighted_avg_f1 = 0.3*f1_neg + 0.7*f1_pos

    # hyperopt minimizes, so return the complement of the score.
    return 1-weighted_avg_f1

space = {
    'max_df': hp.uniform('max_df', 0.2, 1),
    'min_df': hp.uniform('min_df', 0, 0.2),
#     'use_idf': hp.choice('use_idf', [True]),
    'max_features': hp.quniform('max_features', 100, 50000, 1),
}

# minimize the objective over the space
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=1000, trials=trials)

print(best)

In [None]:
import ast


def _parse_cell(value):
    """Decode a CSV cell holding the repr of a Python literal; pass other values through."""
    return ast.literal_eval(value) if isinstance(value, str) else value


def _first_value(value):
    """Unwrap a one-element list/tuple; return anything else unchanged."""
    if isinstance(value, (list, tuple)) and len(value) == 1:
        return value[0]
    return value


def _trial_loss(result):
    """Return result['loss'], or NaN when the cell is unreadable or has no loss."""
    try:
        return _parse_cell(result)['loss']
    except (KeyError, TypeError, ValueError, SyntaxError):
        return np.nan


# `pd_read_csv` was a typo for `pd.read_csv`.
trials_df = pd.read_csv('trials.csv').drop(columns=['exp_key', 'owner', 'version'])

# NOTE(review): after a CSV round trip 'misc'/'result' are presumably strings
# holding dict reprs, and the sampled values may be one-element lists --
# confirm against the actual trials.csv.
trial_vals = trials_df['misc'].map(lambda cell: _parse_cell(cell)['vals'])

book_time = trials_df['book_time']
refresh_time = trials_df['refresh_time']
if not pd.api.types.is_numeric_dtype(book_time):
    # Timestamps read back from CSV as text must be parsed before subtracting.
    book_time = pd.to_datetime(book_time)
    refresh_time = pd.to_datetime(refresh_time)

trials_df['time'] = refresh_time - book_time
for param in ('max_features', 'max_df', 'min_df'):
    trials_df[param] = trial_vals.map(lambda vals, param=param: _first_value(vals[param]))
trials_df['loss'] = trials_df['result'].map(_trial_loss)

# Keep the 20 trials with the lowest loss.
trials_df = trials_df.drop(columns=['misc', 'result', 'spec', 'state', 'book_time', 'refresh_time']).sort_values(by=['loss'], ascending=True).head(20)
trials_df

In [None]:
# Summary of the best trials: 90th percentile of max_df, medians of min_df and max_features.
print(trials_df['max_df'].quantile(q=0.9))
print(trials_df['min_df'].quantile(q=0.5))
print(trials_df['max_features'].quantile(q=0.5))

# Test classifier

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import sklearn

# Per-run F1 scores: index 0 / 1 of the array returned by
# precision_recall_fscore_support.
f1_neg = []
f1_pos = []
for i in range(1):
    start_time = time.time()
    # shuffle=False: the last 30% of the rows form the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle = False)
    printElapsedTime(start_time, 'train_test_split')

    # Neighbors
#     clf = KNeighborsClassifier();
    
    # Naive Bayes
#     clf = GaussianNB()
#     clf = MultinomialNB()
#     clf = CategoricalNB()
#     clf = BernoulliNB()
#     clf = ComplementNB()
    
#     clf = RandomForestClassifier(n_estimators = 10)
    
#     clf = MLPClassifier(verbose=True, max_iter=10)
#     clf = SVC(gamma='auto', verbose=1)
    clf = sklearn.svm.LinearSVC(
        penalty='l2', # l1, l2
        loss='squared_hinge', # hinge, squared_hinge
        dual=False, # Prefer dual=False when n_samples > n_features
        tol=1e-4,
        C=0.4,
        multi_class='ovr',
        fit_intercept=True,
#         intercept_scaling=1,
        class_weight='balanced',
        verbose=1,
#         random_state=42,
        max_iter=1000
    )
#     clf = sklearn.linear_model.SGDClassifier()
#     clf = LogisticRegression(solver='liblinear', n_jobs=-1)
    
    start_time = time.time()
    clf.fit(X_train, y_train)
    printElapsedTime(start_time, 'fit classifier')
    
    y_test_pred = clf.predict(X_test)

    # p, r, f, s: precision, recall, F-score and support, one entry per class.
    p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)
    
    f1_neg.append(f[0])
    f1_pos.append(f[1])
    print(f[0], f[1])
print("Mean F1:", np.array(f1_neg).mean(), np.array(f1_pos).mean())

In [None]:
# clf.intercept_

In [None]:
from sklearn.metrics import roc_curve, auc
"""COMPUTE ROC CURVE"""
# `y_test_pred_proba` was never computed, and the LinearSVC fitted above has no
# predict_proba. Its decision_function (signed distance to the hyperplane,
# larger = more 'pos') is a valid ranking score for a ROC curve.
y_test_scores = clf.decision_function(X_test)
fpr, tpr, threshold = roc_curve(y_true = y_test, y_score = y_test_scores, pos_label = 'pos')
roc_auc = auc(fpr, tpr)

# Compute micro-average ROC curve and ROC area
# fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
# roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

"""PLOT ROC CURVE"""
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

"""Youden's J statistic"""
# Threshold that maximizes TPR - FPR.
J = tpr - fpr
best_threshold = threshold[J.argmax()]

# roc_curve counts a sample as positive when score >= threshold, so the same
# comparison is used here to reproduce the selected operating point.
y_pred_from_treshold = ['pos' if score >= best_threshold else 'neg' for score in y_test_scores]

p, r, f, s = precision_recall_fscore_support(y_test, y_pred_from_treshold)
f1_neg.append(f[0])
f1_pos.append(f[1])
print(f[0], f[1])

In [None]:
# One record per misclassified test sample: its position in the test split,
# its index label in X_test, the true label and the predicted label.
wrong_X = []
for position, (expected, predicted) in enumerate(zip(y_test, y_test_pred)):
    if expected == predicted:
        continue
    wrong_X.append({
        'iteration': position,
        'id': X_test.index[position],
        'y_true': expected,
        'y_pred': predicted
    })
wrong_X[4]

In [None]:
# Pick one misclassified sample to inspect by hand.
wrong_X_curr = wrong_X[9]
# wrong_X_curr = {'id': 23858, 'y_true': 'neg', 'y_pred': 'pos'}


print("TRUE:", wrong_X_curr['y_true'], "\t\tPREDICTED:", wrong_X_curr['y_pred'])
# 'id' is used as a row position in development_data to show the review text.
development_data.iloc[wrong_X_curr['id']]['text']

### Features importances

In [None]:
# Linear-model coefficient of every TF-IDF term, with its absolute value.
term_weights = clf.coef_[0]
features_weights = pd.DataFrame({
    'names': vectorizer.get_feature_names(),
    'weights': term_weights,
    'abs_weights': abs(term_weights),
})
# Sorted by decreasing magnitude; tail(20) shows the 20 smallest ones.
features_weights.sort_values(by=['abs_weights'], ascending=False).tail(20)

In [None]:
from matplotlib import pyplot as plt

def f_importances(coef, names):
    """Draw a horizontal bar chart of `coef`, labelled by `names`, sorted ascending.

    Coefficients and names are paired positionally; surplus entries of the
    longer sequence are ignored.
    """
    ordered_pairs = sorted(zip(coef, names))
    weights, labels = zip(*ordered_pairs)
    positions = range(len(labels))
    plt.barh(positions, weights, align='center')
    plt.yticks(positions, labels)
    plt.show()

f_importances(clf.coef_[0], np.array(vectorizer.get_feature_names() + ['text_len']))

In [None]:
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import train_test_split

# Fresh random 70/30 split (this replaces X_train/X_test/y_train/y_test from earlier cells).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
svc = SVC(random_state=42)
svc.fit(X_train, y_train)

# ROC curve of the SVC on the test split.
svc_disp = plot_roc_curve(svc, X_test, y_test)
plt.show()

rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(X_train, y_train)
# Draw both ROC curves on the same axes for comparison.
ax = plt.gca()
rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=ax, alpha=0.8)
svc_disp.plot(ax=ax, alpha=0.8)
plt.show()

# Evaluation

In [None]:
# Train the final classifier (an MLP with default settings) on the whole X / y.
clf = MLPClassifier(verbose=True)
start_time = time.time()
clf.fit(X, y)
printElapsedTime(start_time)

In [None]:
# Vectorize the evaluation texts with the already fitted vectorizer
# (transform only, so the vocabulary learned on the development data is reused).
start_time = time.time()
eval_tfidf = vectorizer.transform(evaluation_data['text'])
printElapsedTime(start_time)

In [None]:
eval_X = getFeatures(evaluation_data, eval_tfidf, features['evaluation'])
# The classifier was fitted on features standardized by `scaler`, so the
# evaluation features must go through the same fitted transformation
# (transform only, never re-fit) before predicting.
eval_X = pd.DataFrame(scaler.transform(eval_X))

In [None]:
# Predict the label of every evaluation row.
eval_y = clf.predict(eval_X)

# Save solution on file: header line, then one "<row position>,<label>" line per prediction.
solution_lines = [f"{row_id},{label}\n" for row_id, label in enumerate(eval_y)]
with open("solution.csv", "w+") as f:
    f.write("Id,Predicted\n")
    f.writelines(solution_lines)

# Save RAM

In [None]:
# Persist the cache of parsed documents so a later session can restore it.
SAVE_RAM = True

if SAVE_RAM:
    import os

    start_time = time.time()
    # The cache directory may not exist yet on a first run; without it open()
    # would fail with FileNotFoundError.
    os.makedirs('./cache', exist_ok=True)
    with open('./cache/nlp_documents.p', 'wb') as fp:
        pickle.dump(nlp_documents, fp, protocol=pickle.HIGHEST_PROTOCOL)
    printElapsedTime(start_time)
else:
    print("NOT SAVING!!")

EOF