# Loading the data into pandas DataFrames

In [None]:
%matplotlib notebook
import os
# Base directory: the current working directory with a trailing '/', so that
# paths below can be built by plain string concatenation.
folder = os.getcwd()+'/'
# Sub-directory of `folder` that holds the CSV files (also ends with '/').
folderData = 'datasets/'
# File names of the development set and of the evaluation set.
fileDev = 'development.csv'
fileEval = 'evaluation.csv'

import numpy as np

import matplotlib.pyplot as plt
from nltk.corpus import stopwords as sw
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize

import pandas as pd

# Read the development and the evaluation CSV files, in that order, from
# <folder><folderData> into two DataFrames.
reviews_dev_df, reviews_eval_df = (
    pd.read_csv(folder + folderData + csv_name) for csv_name in (fileDev, fileEval)
)

# DATA EXPLORATION 

# Handling missing values

In [None]:
# DataFrame.dropna returns a new frame and leaves the original untouched, so
# the result has to be assigned back for the rows to actually disappear.
# The index is rebuilt afterwards so that row labels stay equal to row
# positions (0..n-1), which keeps positional and label-based lookups in sync.
reviews_dev_df = reviews_dev_df.dropna().reset_index(drop=True)
print(reviews_dev_df.shape)

# Outliers Detection

In [None]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomised by default; a fixed seed makes the set of flagged
# reviews identical on every run.
DetectorFactory.seed = 0

# Collect every review that is not recognised as Italian, together with its
# index *label* (not its position) so that .loc and .drop address the same rows.
docs=[]
indices=[]
for ind,doc in reviews_dev_df['text'].items():
    try:
        is_italian = detect(doc) == 'it'
    except LangDetectException:
        # Raised when the text has no usable features (e.g. only digits or
        # symbols); such a review is treated as an outlier as well.
        is_italian = False
    if not is_italian:
        docs.append(doc)
        indices.append(ind)

for i,d in zip(indices,docs):
    print('Review:\n',d,'\nIndex:',i,'\tClass:',reviews_dev_df.loc[i,'class'],'\n\n\n')

# DataFrame.drop returns a new frame: assign it back, otherwise the outliers
# stay in the development set. The index is rebuilt to keep it contiguous.
reviews_dev_df = reviews_dev_df.drop(index=indices).reset_index(drop=True)
print(reviews_dev_df.shape)

# Statistics about the length of reviews

In [None]:
import seaborn as sns
%matplotlib notebook

reviews_dev_df['length']=reviews_dev_df['text'].apply( lambda x : len(x))
reviews_eval_df['length']=reviews_eval_df['text'].apply( lambda x : len(x))

sns.distplot(reviews_dev_df[reviews_dev_df['class']=='pos']['length'],axlabel='Length of positive reviews')
sns.distplot(reviews_dev_df[reviews_dev_df['class']=='neg']['length'],axlabel='Length of negative reviews')

print('Percentage of pos reviews :',reviews_dev_df[reviews_dev_df['class']=='pos'].shape[0]/reviews_dev_df.shape[0]*100)
print('Percentage of pos reviews :',reviews_dev_df[reviews_dev_df['class']=='neg'].shape[0]/reviews_dev_df.shape[0]*100,'\n')
print('Length of pos reviews (mean) :',int(reviews_dev_df[reviews_dev_df['class']=='pos']['length'].mean()))
print('Length of neg reviews (mean) :',int(reviews_dev_df[reviews_dev_df['class']=='neg']['length'].mean()),'\n')
print('Minimum length of pos reviews:',int(reviews_dev_df[reviews_dev_df['class']=='pos']['length'].min()))
print('Minimum length of neg reviews:',int(reviews_dev_df[reviews_dev_df['class']=='neg']['length'].min()),'\n')
print('Length of pos reviews (median) :',int(reviews_dev_df[reviews_dev_df['class']=='pos']['length'].quantile(0.5)))
print('Length of neg reviews (median) :',int(reviews_dev_df[reviews_dev_df['class']=='neg']['length'].quantile(0.5)),'\n')
print('Length of pos reviews (95% quantile) :',int(reviews_dev_df[reviews_dev_df['class']=='pos']['length'].quantile(0.95)))
print('Length of neg reviews (95% quantile) :',int(reviews_dev_df[reviews_dev_df['class']=='neg']['length'].quantile(0.95)),'\n')
print('Length of pos reviews (5% quantile) :',int(reviews_dev_df[reviews_dev_df['class']=='pos']['length'].quantile(0.05)))
print('Length of neg reviews (5% quantile) :',int(reviews_dev_df[reviews_dev_df['class']=='neg']['length'].quantile(0.05)))




# PREPROCESSING

# Tokenization

In [None]:
import string
import re
from nltk.corpus import stopwords
# Start from NLTK's Italian stopword list.
italian_stopwords = stopwords.words('italian')
# Keep the negation 'non' as a regular token by taking it out of the stopword
# list (list.remove raises ValueError if the word is not in the list).
italian_stopwords.remove('non')
def listToString(lista):
    """Join a sequence of words into a single space-separated string.

    Parameters
    ----------
    lista : iterable of str
        The words to join.

    Returns
    -------
    str
        The words separated by single spaces, with no leading or trailing
        space; an empty input gives ''.
    """
    # str.join avoids the trailing space (and the quadratic string building)
    # of a `s += word + ' '` loop, so splitting the result on spaces never
    # yields a spurious empty token.
    return ' '.join(lista)

def remove_punctuation(text):
    """Return `text` with every punctuation character and newline replaced by a space.

    The characters replaced are those in string.punctuation plus the
    apostrophe and '\n'. Each one becomes exactly one space, so the length of
    the string is unchanged.
    """
    separators = string.punctuation + "'" +"\n"
    # One translation table instead of one str.replace call per character.
    to_space = str.maketrans({char: ' ' for char in separators})
    return text.translate(to_space)

# Words removed on top of the caller-supplied stopword list. Built once, as a
# set, so membership tests are O(1) instead of a list scan per token.
_EXTRA_STOPWORDS = frozenset(
    ['stare','trovare','volere','fare','dire','potere'] +
    ['stato','stati','stata','dato','fatto','chiesto','visto','trovato','detto'] +
    ['colazione','camera','camere','hotel', 'albergo','stanza','struttura','reception'] +
    ['per','su','tra','fra','quando','quindi','così'] +
    ['venezia','milano','napoli'] +
    ['gennaio','febbraio','marzo','aprile','maggio','giugno','luglio','agosto','settembre','ottobre','novembre','dicembre'] +
    ['lunedì','martedì','mercoledì','giovedì','venerdì','sabato','domenica']
)
_HAS_DIGIT = re.compile(r'\d')


def extract_tokens(text,italian_stopwords):
    """Filter `text` down to its informative, lower-cased words.

    A word is discarded when it is at most two characters long, contains a
    digit, or (after lower-casing) belongs to `italian_stopwords` or to the
    extra stopwords defined in `_EXTRA_STOPWORDS`.

    Parameters
    ----------
    text : str
        Text to filter; words are delimited by whitespace.
    italian_stopwords : iterable of str
        Lower-case stopwords to remove in addition to the built-in extras.

    Returns
    -------
    str
        The kept words, in their original order, joined by `listToString`.
    """
    stopword_set = _EXTRA_STOPWORDS.union(italian_stopwords)
    clean_text=[]
    # split() with no argument splits on any run of whitespace, so tabs,
    # carriage returns and repeated spaces all delimit words.
    for raw_word in text.split():
        if len(raw_word) <= 2 or _HAS_DIGIT.search(raw_word):
            continue
        word = raw_word.lower()
        if word not in stopword_set:
            clean_text.append(word)
    return listToString(clean_text)

def tokenize(text,stopwords):
    """Turn a raw review into its filtered token string.

    Punctuation is stripped with `remove_punctuation`, then the result is
    filtered by `extract_tokens` using `stopwords` as the stopword list.
    The return value is whatever `extract_tokens` returns.
    """
    return extract_tokens(remove_punctuation(text), stopwords)
    
# Add, to both frames, the filtered token string of every review and the
# number of tokens it contains.
for frame in (reviews_dev_df, reviews_eval_df):
    frame['tokens'] = frame['text'].apply(tokenize, args=(italian_stopwords,))
    # Whitespace-aware split: leading/trailing/repeated spaces do not produce
    # empty strings, so an empty token string counts 0 words rather than 1.
    frame['num_of_words'] = frame['tokens'].str.split().str.len()



# Stemming

In [None]:
from nltk.stem.snowball import SnowballStemmer
# Single Snowball stemmer instance for Italian, shared by every call to stem().
stemmer = SnowballStemmer('italian')

def stem(tokens):
    """Stem every word of a whitespace-separated token string.

    Parameters
    ----------
    tokens : str
        Words separated by whitespace.

    Returns
    -------
    str
        The stemmed words, in the same order, joined by `listToString`.
    """
    # split() with no argument ignores leading/trailing/repeated whitespace,
    # so no empty string is ever handed to the stemmer or re-joined.
    return listToString([stemmer.stem(word) for word in tokens.split()])
        
    
# Stem the token string of every review, in the development and the evaluation set.
for frame in (reviews_dev_df, reviews_eval_df):
    frame['stemmed_tokens'] = frame['tokens'].apply(stem)

# TF-IDF

In [None]:
# TF-IDF over unigrams and bigrams; terms that appear in fewer than 5
# documents are dropped and every row is L2-normalised.
reviews_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5, norm='l2')
# The vocabulary and IDF weights are learnt on the development set only (the
# vectorizer is unsupervised, so no labels are passed); the evaluation set is
# then projected onto that same vocabulary.
tfidf_reviews_dev = reviews_vectorizer.fit_transform(reviews_dev_df['stemmed_tokens'])
tfidf_reviews_eval = reviews_vectorizer.transform(reviews_eval_df['stemmed_tokens'])

# Tomek-links

In [None]:
from imblearn.under_sampling import TomekLinks,AllKNN

# Under-sample the development set with Tomek links. sampling_strategy='all'
# asks imblearn to resample every class rather than only the majority one.
# NOTE(review): the resampling is applied to the whole development set, i.e.
# before the train/test split done further down, so the held-out split is
# cleaned too and its scores may be optimistic - consider resampling the
# training portion only.
tm = TomekLinks(sampling_strategy='all')
# Resampled TF-IDF matrix and the matching class labels.
tfidf_reviews_dev_us,labels_us = tm.fit_resample(tfidf_reviews_dev,reviews_dev_df['class'])

# Linear SVC ( computed on TF-IDF )

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
def gridSearchCV_SVC_US(Reviews_train,reviews_labels):
    """Grid-search the regularisation strength of a class-balanced LinearSVC.

    C is searched over 1.0, 2.0 and 3.0 with 10-fold cross-validation,
    scored by weighted F1.

    Parameters
    ----------
    Reviews_train : feature matrix the search is fitted on.
    reviews_labels : class labels matching the rows of `Reviews_train`.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    svc = LinearSVC(
        loss='squared_hinge',
        fit_intercept=True,
        class_weight='balanced',
        tol=1e-6,
    )
    search = GridSearchCV(svc, {'C': [1.0, 2.0, 3.0]}, scoring='f1_weighted', cv=10)
    search.fit(Reviews_train, reviews_labels)
    return search
    
# Hold out 15% of the resampled development set, stratified by class.
# A fixed random_state makes the split, and therefore every score printed
# below, reproducible across runs.
Reviews_train,Reviews_test,reviews_labels,test_labels = train_test_split(
    tfidf_reviews_dev_us,
    labels_us,
    train_size=0.85,
    stratify=labels_us,
    shuffle=True,
    random_state=42,
)
gridsearchCV_SVC_US_=gridSearchCV_SVC_US(Reviews_train,reviews_labels)

y_pred=gridsearchCV_SVC_US_.best_estimator_.predict(Reviews_test)
print(gridsearchCV_SVC_US_.best_estimator_)
print(gridsearchCV_SVC_US_.best_score_)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall (and the support counts) in the report.
print(classification_report(test_labels,y_pred))

# Validation (predictions on the evaluation set)

In [None]:
# Predict the class of every evaluation review with the best model found by
# the grid search, and show how many reviews fall in each class.
best_estimator = gridsearchCV_SVC_US_.best_estimator_
y_pred=best_estimator.predict(tfidf_reviews_eval)
print(np.unique(y_pred,return_counts=True))

# Write the predictions as CSV: a header line, then one "<row id>,<label>"
# line per evaluation review. The `with` block closes the file on exit, so no
# explicit close() is needed; str.format also accepts non-string labels.
with open(folder + 'res.csv', 'w') as submission:
    submission.write('Id,Predicted\n')
    submission.writelines(
        '{},{}\n'.format(row_id, label) for row_id, label in enumerate(y_pred)
    )

In [None]:
model = best_estimator

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when available and fall back
# to the old one on older installations.
if hasattr(reviews_vectorizer, 'get_feature_names_out'):
    feature_names = reviews_vectorizer.get_feature_names_out()
else:
    feature_names = reviews_vectorizer.get_feature_names()

# Map every TF-IDF term to its weight in the linear model.
feature_to_coef = dict(zip(feature_names, model.coef_[0]))

# (term, coefficient) pairs, from the largest to the smallest coefficient.
feature_sorted =  sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)

print('Positive features:', end = "\n\n")
for best_positive in feature_sorted[0:10]:
    print (best_positive)
print('')
print('Negative features:', end = "\n\n")
# The ten smallest coefficients, most negative first. (Calling sorted() on the
# pairs would order them alphabetically by term, not by coefficient.)
for best_negative in reversed(feature_sorted[-10:]):
    print (best_negative)