# Loading data by using pandas dataframe

In [None]:
import os
folder = os.getcwd()+'/'
folderData = 'datasets/'
fileDev = 'development.csv'
fileEval = 'evaluation.csv'

import numpy as np

import matplotlib.pyplot as plt
from nltk.corpus import stopwords as sw
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize

import pandas as pd

reviews_dev_df = pd.read_csv(folder + folderData + fileDev)
reviews_eval_df = pd.read_csv(folder + folderData + fileEval)


# DATA CLEANING - Text tokenization 

In [None]:
import string
import re
from nltk.corpus import stopwords
italian_stopwords = stopwords.words('italian')

def listToString(lista):
    s = ''
    for word in lista:
        s += word + ' '
    
    return s

def remove_punctuation(text):
    for p in (string.punctuation + "'" +"\n"):
        text = text.replace(p,' ')
        
    
    return text

def extract_tokens(text,italian_stopwords):
    clean_text=[]
    other_stopwords = italian_stopwords +\
                    ['stare','trovare','soggiornare','volere','fare','stare','dire','potere'] +\
                    ['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +\
                    ['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +\
                    #['stella','stelle','ristorante'] +\
                    ['venezia','san','milano','napoli'] +\
                    ['per','su','tra','fra','quando','quindi','così','solo'] 
    for w in text.split(' '):
        if ( len(w)<=2 )  or (bool(re.search(r'\d', w)) == True) :
            continue
        w=w.lower()
        if w not in other_stopwords:
            clean_text.append(w)
    return listToString(clean_text)

def tokenize(text,stopwords):
    text=remove_punctuation(text)
    tokens = extract_tokens(text,stopwords)
    return tokens
    
reviews_dev_df['tokens'] = reviews_dev_df['text'].apply(lambda doc : tokenize(doc,italian_stopwords))
reviews_eval_df['tokens'] = reviews_eval_df['text'].apply(lambda doc : tokenize(doc,italian_stopwords))

# DATA EXPLORATION - Word cloud generation

In [None]:
from wordcloud import WordCloud

def get_positive_reviews(reviews_dev_df):
    positives = reviews_dev_df[reviews_dev_df['class']=='pos']['tokens']
    return listToString(positives)

def get_negative_reviews(reviews_dev_df):
    negatives = reviews_dev_df[reviews_dev_df['class']=='neg']['tokens']
    return listToString(negatives)
    

wordcloud_pos=WordCloud(background_color='white',width=1200,height=1000).generate_from_text(get_positive_reviews(reviews_dev_df))
wordcloud_neg=WordCloud(background_color='white',width=1200,height=1000).generate_from_text(get_negative_reviews(reviews_dev_df))

#PLOTS 
%matplotlib notebook
print("Positives :\n",wordcloud_pos.words_)
print("Negatives :\n",wordcloud_neg.words_)


plt.imshow(wordcloud_pos, interpolation="bilinear")
plt.figure()
plt.imshow(wordcloud_neg, interpolation="bilinear")
#sns.distplot(pos_lengths)
#sns.distplot(neg_lengths)

# PREPROCESSING - Stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('italian')

def stem(tokens):
    stemmed_tokens=[]
    for x in tokens.split(' '):
        stemmed_tokens.append(stemmer.stem(x))
    
    return listToString(stemmed_tokens)
        
    
reviews_dev_df['stemmed_tokens'] = reviews_dev_df['tokens'].apply(lambda token : stem(token) )
reviews_eval_df['stemmed_tokens'] = reviews_eval_df['tokens'].apply(lambda token : stem(token))
reviews_dev_df['stemmed_tokens']



# PREPROCESSING - TF-IDF

In [193]:
%matplotlib notebook
# 3143 components - min_dv = 30 max_df=5 
reviews_vectorizer = TfidfVectorizer(min_df=1,max_df=0.7,ngram_range=(1,2),norm='l2',sublinear_tf=True)
reviews_vectorizer.fit(reviews_dev_df['stemmed_tokens'])
tfidf_reviews_dev = reviews_vectorizer.transform(reviews_dev_df['stemmed_tokens'])
tfidf_reviews_eval =reviews_vectorizer.transform(reviews_eval_df['stemmed_tokens'])
print(tfidf_reviews_dev.shape)




(28754, 707614)


# PREPROCESSING - Singular value decomposition analysys

In [197]:
svd_vectorizer = TruncatedSVD(n_components=2,random_state=41)
svd_vectorizer.fit(tfidf_reviews_dev)
tfidf_svd_dev = svd_vectorizer.transform(tfidf_reviews_dev)
tfidf_svd_eval = svd_vectorizer.transform(tfidf_reviews_eval)

MemoryError: Unable to allocate array with shape (707614, 12) and data type float64

In [183]:
print(np.argmax(np.cumsum(svd_vectorizer.explained_variance_ratio_) > 0.0315))

29


# PREPROCESSING - Singular value decomposition - Visualization

In [None]:
svd = TruncatedSVD(n_components=2,random_state=41)
svd.fit(tfidf_reviews_dev)

tfidf_svd = svd.transform(tfidf_reviews_dev)

reviews_dev_df['num_class']=reviews_dev_df['class'].apply(lambda x : 1 if x == 'pos' else 0)

# Data for a three-dimensional line
fig,ax=plt.subplots()
ax.scatter(tfidf_svd[:,0],tfidf_svd[:,1] ,c=reviews_dev_df['num_class'],s=0.1)

# TUNING - Linear SVC ( computed on TF-IDF )

In [198]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,confusion_matrix
import seaborn as sns
def gridSearchCV_SVC(Reviews_train,reviews_labels):
    classifier = LinearSVC(fit_intercept=True,tol=1e-5)
    param_grid = { 
                  'C':[1.5,2.0,2.5],
                  'penalty' : ['l2'],
                  'loss' : ['hinge','squared_hinge'],
                  'class_weight' : ['balanced',None]
                  
                   
                   }
    
    gridsearch = GridSearchCV(classifier, param_grid, scoring='f1_weighted', cv=5)
    gridsearch.fit(Reviews_train,reviews_labels)
    return gridsearch
    
X_train,X_test,y_train,y_test= train_test_split(tfidf_reviews_dev,reviews_dev_df['class'],train_size=0.85,stratify=reviews_dev_df['class'],shuffle=True)
gridsearchCV_SVC_=gridSearchCV_SVC(X_train,y_train)

y_pred=gridsearchCV_SVC_.best_estimator_.predict(X_test)
print('Best estimator return by a GridSearchCV applied to a Linear SVC:\n')
print('\t-best_parameters:',gridsearchCV_SVC_.best_estimator_,'\n')
print('\t-best_score:',gridsearchCV_SVC_.best_score_,'\n')
print('\t-f1_score computed on evaluation dataset:',f1_score(y_pred,y_test,average = 'weighted'),'\n')

print('Classification report:\n')
print(classification_report(y_pred,y_test))

conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, fmt='d')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

MemoryError: Unable to allocate array with shape (2521387,) and data type int32

# TUNING - Linear SVC ( computed on TF-IDF after singular value decomposition analysis)

In [185]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
def gridSearchCV_SVC_svd(Reviews_train,reviews_labels):
    classifier = LinearSVC(fit_intercept=True,random_state=42,tol=1e-5)
    param_grid = { 
                  'C':[1.5,2.0,2.5],
                  'penalty' : ['l2'],
                  'loss' : ['hinge','squared_hinge'],
                  'class_weight' : ['balanced',None]
                  
                   
                   }
    
    gridsearch = GridSearchCV(classifier, param_grid, scoring='f1_weighted', cv=5)
    gridsearch.fit(Reviews_train,reviews_labels)
    return gridsearch
    
Reviews_train,Reviews_test,reviews_labels,test_labels= train_test_split(tfidf_svd_dev,reviews_dev_df['class'],train_size=0.85,stratify=reviews_dev_df['class'],shuffle=True)
gridsearchCV_SVC_svd_=gridSearchCV_SVC_svd(Reviews_train,reviews_labels)

y_pred=gridsearchCV_SVC_svd_.best_estimator_.predict(Reviews_test)
print(gridsearchCV_SVC_svd_.best_estimator_)
print(gridsearchCV_SVC_svd_.best_score_)
print(f1_score(y_pred,test_labels,average = 'weighted'))
print(classification_report(y_pred,test_labels))



LinearSVC(C=2.5, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=42, tol=1e-05,
          verbose=0)
0.9444919336998773
0.9455548569479536
              precision    recall  f1-score   support

         neg       0.90      0.93      0.91      1336
         pos       0.97      0.95      0.96      2978

    accuracy                           0.95      4314
   macro avg       0.93      0.94      0.94      4314
weighted avg       0.95      0.95      0.95      4314



# VALIDATION - Linear SVC ( computed on TF-IDF)

In [186]:
best_estimator = gridsearchCV_SVC_.best_estimator_
y_pred=best_estimator.predict(tfidf_reviews_eval)
print(np.unique(y_pred,return_counts=True))

with open(folder + 'res_tfidf_SVC.csv', 'w') as file:
    file.write('Id,Predicted\n')
    for i,l in enumerate(y_pred):
        file.write(str(i) + ',' + l +'\n')
    file.close()

(array(['neg', 'pos'], dtype=object), array([3929, 8394], dtype=int64))


# VALIDATION - Linear SVC ( computed on TF-IDF after singular value decomposition)

In [189]:
best_estimator = gridsearchCV_SVC_svd_.best_estimator_
y_pred=best_estimator.predict(tfidf_svd_eval)
print(np.unique(y_pred,return_counts=True))

with open(folder + 'res_tfidf_SVC_svd.csv', 'w') as file:
    file.write('Id,Predicted\n')
    for i,l in enumerate(y_pred):
        file.write(str(i) + ',' + l +'\n')
    file.close()

(array(['neg', 'pos'], dtype=object), array([3838, 8485], dtype=int64))


# Results

In [None]:
#SVD:
#1.    # min_df:30    max_df:0.7    score:0.965    y_pred: array([3896, 8427] n_components=4000
       # stop_words:['stare','trovare','soggiornare','volere','fare','stare','dire','potere'] +\
                    #['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +\
                    #['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +\
                    #['per','su','tra','fra','quando','quindi','così'] 
                    #['venezia','san','milano','napoli'] +\
                    
#2.    # min_df:10    max_df:0.7    score:0.967    y_pred: array([3892, 8431] n_components=3000
       # stop_words:['stare','trovare','soggiornare','volere','fare','stare','dire','potere'] +\
                    #['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +\
                    #['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +\
                    #['per','su','tra','fra','quando','quindi','così'] 
                    #['venezia','san','milano','napoli'] +\

#3.    # min_df:3    max_df:0.7    score:null    y_pred: array([3838, 8485] n_components=30
       # stop_words:['stare','trovare','soggiornare','volere','fare','stare','dire','potere'] +\
                    #['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +\
                    #['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +\
                    #['per','su','tra','fra','quando','quindi','così'] 
                    #['venezia','san','milano','napoli'] +\
                    
#4.    # min_df:1    max_df:0.7    score:null    y_pred: array([3838, 8485] n_components=30
       # stop_words:['stare','trovare','soggiornare','volere','fare','stare','dire','potere'] +\
                    #['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +\
                    #['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +\
                    #['per','su','tra','fra','quando','quindi','così'] 
                    #['venezia','san','milano','napoli'] +\

                
                
#TF-IDF:
#1.    # min_df:30    max_df:0.7    score:0.967    y_pred: array([3908, 8415] 
       # stop_words:['stare','trovare','soggiornare','volere','fare','stare','dire','potere'] +\
                    #['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +\
                    #['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +\
                    #['per','su','tra','fra','quando','quindi','così']
                
#2.    # min_df:10    max_df:0.7    score: null    y_pred: array([3981, 8342] 
       # stop_words:['stare','trovare','soggiornare','volere','fare','stare','dire','potere'] +\
                    #['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +\
                    #['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +\
                    #['per','su','tra','fra','quando','quindi','così']

#3.    # min_df:3    max_df:0.7    score:0.967    y_pred: array([3929, 8394]
       # stop_words:['stare','trovare','soggiornare','volere','fare','stare','dire','potere'] +\
                    #['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +\
                    #['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +\
                    #['per','su','tra','fra','quando','quindi','così'] 
                    #['venezia','san','milano','napoli'] +\
                    
#4.    # min_df:1    max_df:0.7    score:null    y_pred: array([3838, 8485] n_components=30
       # stop_words:['stare','trovare','soggiornare','volere','fare','stare','dire','potere'] +\
                    #['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +\
                    #['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +\
                    #['per','su','tra','fra','quando','quindi','così'] 
                    #['venezia','san','milano','napoli'] +\
