# Fine Food Review

Assignment for Swisscom interview<br>
Ludovic Herbelin, 2020

**Goals : Predict review score from the text of the review.**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from tqdm.notebook import tqdm
import sklearn

## Loading data

In [2]:
DATASET_PATH = 'Reviews.csv'

dataset = pd.read_csv(DATASET_PATH, nrows=10000)

print(len(dataset))

10000


In [3]:
df = dataset.copy()
df = df.drop(columns=['ProductId', 'UserId', 'ProfileName', 'Time'])
text_columns = ['Text', 'Summary']

df = df.drop(columns = ['Summary'])
text_columns = ['Text']
df.head()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Text
0,1,1,1,5,I have bought several of the Vitality canned d...
1,2,0,0,1,Product arrived labeled as Jumbo Salted Peanut...
2,3,1,1,4,This is a confection that has been around a fe...
3,4,3,3,2,If you are looking for the secret ingredient i...
4,5,0,0,5,Great taffy at a great price. There was a wid...


In [4]:
for column_name in text_columns:
    df[column_name].fillna(" ", inplace=True)

print(f"Any null values left : {len(df[df.isna().any(axis=1)]) > 0}")

Any null values left : False


In [5]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
   
ps = PorterStemmer() 
stop_words = set(stopwords.words('english'))  

def __stem_word__(word):
    return ps.stem(word)

def __tokenize_text__(text):
    return word_tokenize(text)

def preproces_text(text, stopwords):
    words = __tokenize_text__(text)
    # set words to lowercase and remove punctuation
    words = [word.lower() for word in words if word.isalpha()]
    words = [__stem_word__(word) for word in words]
    words = list(filter(lambda w: w not in stopwords, words))
    
    return words


print(preproces_text(df['Text'][0], stopwords=stop_words))

['bought', 'sever', 'vital', 'dog', 'food', 'product', 'found', 'good', 'qualiti', 'product', 'look', 'like', 'stew', 'process', 'meat', 'smell', 'better', 'labrador', 'finicki', 'appreci', 'thi', 'product', 'better']


### Dataset preprocessing

In [6]:
X = df.loc[:, df.columns != 'Score']
Y = df['Score']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
def preprocess_text_ds(X, text_columns):
    X_processed = X.copy()
    tfidf_vectorizer = TfidfVectorizer()
    for column_name in tqdm(text_columns):
        tfidf_dense = tfidf_vectorizer.fit_transform(X_processed[column_name]).todense()
        new_cols = tfidf_vectorizer.get_feature_names()

        # remove the text column as the column's label (EG 'text') may exist in the words and there would be an error
        X_processed = X_processed.drop(column_name,axis=1)
        X_processed = X_processed.join(pd.DataFrame(tfidf_dense, columns=new_cols))

    
    X_processed.fillna(0, inplace=True)
    return X_processed

In [8]:
from sklearn.model_selection import train_test_split
TRAIN_RATIO = 0.8

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,stratify=Y, train_size=TRAIN_RATIO, random_state=42)

print(f"Train size : {len(Y_train)}, test set size : {len(Y_test)}")

Train size : 8000, test set size : 2000


In [9]:
X_train = preprocess_text_ds(X_train, text_columns=text_columns)
X_test = preprocess_text_ds(X_test, text_columns=text_columns)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




### Model

In [10]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression(n_jobs=8)
reg.fit(X_train, Y_train)

LinearRegression(n_jobs=8)

In [19]:
from sklearn.metrics import mean_squared_error, r2_score
def test_compute_metrics(model, X, Y):
    print(X.shape)
    print(Y.shape)
    y_pred = model.predict(X)
    r2 = r2_score(Y, y_pred)
    mse = mean_squared_error(Y, y_pred)
    
    return {'r2':r2, 'mse':mse}

In [20]:
train_metrics = test_compute_metrics(reg, X_train, Y_train)

print(train_metrics)

(8000, 16782)
(8000,)
{'r2': 0.8018175285191979, 'mse': 0.34914339854805454}


### Test set

In [21]:
print(X_test.shape)
test_metrics = test_compute_metrics(reg, X_test, Y_test)
print(test_metrics)

(2000, 9261)
(2000, 9261)
(2000,)


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 16782 is different from 9261)

### Plot metrics

In [None]:
sns.set_theme()

def plot_bar(results_dict, title, ylabel):
    plt.bar(results_dict.keys(), results_dict.values())
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xticks(rotation=30)
    plt.ylim((0,1))
    plt.show()

results_dict = {'LinearReg Train':train_acc, 'LinearReg Test':test_acc}
plot_bar(results_dict, "Accuracies plot for score prediction", "Accuracy")