# 02 Models

#### Træn og sammenlign fastText med sklearn på samme datasæt, og se hvilken model der er mest præcis.

In [7]:
import pandas as pd
import fasttext

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from tabulate import tabulate

# TF-IDF (Term Frequency — Inverse Document Frequency) vectorizer shared by the notebook:
# it is fitted inside sklearn_model() and reused by model_comparison() to transform reviews.
tfidf = TfidfVectorizer()

In [8]:
def calc_accuracy(data):
    """Format a percentage with two decimals, mirroring values above 100.

    A value greater than 100 is reflected around 100 (``100 - data + 100``),
    so overshooting by x points reads the same as undershooting by x points.
    Any other value is kept (``data - 100 + 100``).

    Args:
        data: Percentage as a number.

    Returns:
        The adjusted value as a string with two decimal places.
    """
    # The arithmetic order of both branches is kept so float rounding stays the same.
    mirrored = 100 - data + 100 if data > 100 else data - 100 + 100
    return format(mirrored, '.2f')

### Train with Sklearn

In [9]:
def sklearn_model(file):
    """Train a linear SVM rating classifier on TF-IDF features of a review CSV.

    Reads ``./data/<file>.csv``, takes its ``review`` and ``rating`` columns,
    fits the module-level ``tfidf`` vectorizer on every review and trains an
    ``SVC`` on a fixed 80% split of the vectorized data.

    Args:
        file: CSV base name (no directory, no ``.csv`` suffix).

    Returns:
        The fitted ``SVC`` classifier.
    """
    frame = pd.read_csv('./data/' + file + '.csv')
    review_column, targets = frame['review'], frame['rating']

    # The vectorizer is fitted on the whole file, i.e. before the split below.
    features = tfidf.fit_transform(review_column)

    # The 20% test portion is produced by the split but is not used in this function.
    train_features, _test_features, train_targets, _test_targets = train_test_split(
        features, targets, test_size=0.2, random_state=5
    )

    svm_settings = {
        'kernel': 'linear',
        'probability': True,
        'random_state': 32,
        'decision_function_shape': 'ovr',
        'max_iter': 10000,
    }
    classifier = SVC(**svm_settings)
    classifier.fit(train_features, train_targets)

    return classifier

In [10]:
# Train the sklearn SVM on ./data/combined_csv.csv; this call also fits the shared `tfidf` vectorizer.
clf = sklearn_model('combined_csv')

### Train with FastText

In [11]:
# Train the fastText supervised classifier on the same file as the SVM (word n-grams up to 2, 'ova' loss).
# NOTE(review): train_supervised presumably expects one "__label__X text" example per line —
# confirm the CSV is laid out that way and that a header row, if any, does not end up as a training example.
model = fasttext.train_supervised(input="./data/combined_csv.csv", epoch=32, lr=0.5, wordNgrams=2, bucket=200000, dim=30, loss='ova')

### Model Comparison

In [12]:
def model_comparison(file, clf):
    """Print a per-rating table comparing sklearn and fastText predictions.

    Reads ``./data/<file>.csv`` (``review`` and ``rating`` columns), predicts a
    label for every review with both the given sklearn classifier and the
    module-level fastText ``model``, and prints one table row per label in
    ``model.labels``.

    The "accuracy" columns are ``calc_accuracy(predicted / actual * 100)``:
    they compare how often a label was predicted with how often it occurs in
    the file. They do not check whether individual reviews were classified
    correctly.

    Args:
        file: CSV base name (no directory, no ``.csv`` suffix).
        clf: Fitted sklearn classifier whose input features come from the
            module-level ``tfidf`` vectorizer.

    Returns:
        None. The table is written to stdout.
    """
    from collections import Counter

    df = pd.read_csv('./data/' + file + '.csv')
    reviews = df['review']

    # fastText: predict() returns (labels, probabilities); take the top label
    # directly instead of parsing the tuple's string representation, and count
    # once instead of rebuilding the prediction list for every label.
    fasttext_counts = Counter(model.predict(review)[0][0] for review in reviews)

    # sklearn: vectorize and classify all reviews in one batch. An empty file
    # is skipped because predicting on zero samples is an error.
    if len(reviews):
        sklearn_predictions = clf.predict(tfidf.transform(reviews)).tolist()
    else:
        sklearn_predictions = []
    sklearn_counts = Counter(sklearn_predictions)

    actual_counts = Counter(df['rating'])

    # Table
    table = [['rating ','total reviews ', 'sklearn predict',' sklearn accuracy (%)', 'fastText predict', ' fastText accuracy (%)' ]]

    for label in model.labels:
        # Values in the CSV 'rating' column (and therefore the sklearn
        # predictions) are matched as the fastText label plus a trailing space.
        csv_label = label + ' '

        rating = label.replace('__label__', '')
        total_reviews = actual_counts[csv_label]
        sklearn_predicts = sklearn_counts[csv_label]
        # Exact label match: a label that is a prefix of another one
        # (e.g. 1 vs 10) is no longer counted for both.
        fasttext_predicts = fasttext_counts[label]

        if total_reviews:
            sklearn_accuracy = calc_accuracy(sklearn_predicts / total_reviews * 100)
            fasttext_accuracy = calc_accuracy(fasttext_predicts / total_reviews * 100)
        else:
            # No review carries this rating, so the ratio is undefined.
            sklearn_accuracy = fasttext_accuracy = 'n/a'

        table.append([rating, total_reviews, sklearn_predicts, sklearn_accuracy, fasttext_predicts, fasttext_accuracy])

    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))
    
# Print the comparison table. NOTE: this evaluates on the same file both models were trained on
# (the SVM on an 80% split of it), so the figures are not held-out results.
model_comparison('combined_csv', clf)

╒═══════════╤══════════════════╤═══════════════════╤═════════════════════════╤════════════════════╤══════════════════════════╕
│   rating  │   total reviews  │   sklearn predict │    sklearn accuracy (%) │   fastText predict │    fastText accuracy (%) │
╞═══════════╪══════════════════╪═══════════════════╪═════════════════════════╪════════════════════╪══════════════════════════╡
│         5 │             5311 │              5866 │                   89.55 │               5331 │                    99.62 │
├───────────┼──────────────────┼───────────────────┼─────────────────────────┼────────────────────┼──────────────────────────┤
│         1 │             2133 │              2536 │                   81.11 │               2132 │                    99.95 │
├───────────┼──────────────────┼───────────────────┼─────────────────────────┼────────────────────┼──────────────────────────┤
│         4 │              855 │               401 │                   46.9  │                837 │            