In [120]:
import pandas as pd
import numpy as np
from joblib import load, dump
from copy import deepcopy
from statistics import mean

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

In [121]:
df = pd.read_csv('data/Features.csv')

In [122]:
df

Unnamed: 0,product,answer_option,label,review_len,Rn,Rp,Rs,Rc,Rd,Rsc
0,Accucheck,Fast and accurate delivery,0,4,0.232859,0.3,0.616667,0.00542,1.0,0.0
1,Accucheck,Expected a longer expire date. Your Product Li...,0,14,1.017469,-0.1,0.4,0.017615,1.0,0.0
2,Accucheck,I liked the prompt service,0,5,0.319747,0.6,0.8,0.006775,1.0,0.4215
3,Accucheck,Good product,0,2,0.545833,0.7,0.6,0.00271,0.0,0.4404
4,Accucheck,I not needed,0,3,0.0,0.0,0.0,0.004065,0.0,0.0
5,Accucheck,Not received the product.,0,4,0.392072,0.0,0.0,0.00542,0.0,0.0
6,Accucheck,The chip is not given that I actually was looking,0,10,0.227264,0.0,0.1,0.01355,0.0,0.0
7,Accucheck,Damage in your accu chek active strip,0,7,0.565731,-0.1333333,0.6,0.009485,1.0,-0.128
8,Accucheck,The reading is very accurate,1,5,0.544357,0.52,0.823333,0.006775,0.0,0.0
9,Accucheck,I PREFER A STRIP THAT DRAW BLOOD .INSTEAD OF A...,0,19,0.55039,0.5,0.5,0.02439,0.0,-0.2732


### Ranking is a canonical problem for humans. It is easy to classify whether a review is useful (informative) or not. However, ranking reviews on the basis of usefulness, is a complex task. Our ranking methodology is based on this simple education.

#### Pairwise ranking approach is applied to rank reviews in the semi-supervised learning method. The pairwise ranking approach looks at a pair of documents at a time in a loss function and predicts a relative ordering. The objective is not to determine the relevance score but to find which document is more relevant than others. This relevance is developed to judge the preference of one review over another.
#### In the semi-supervised learning method, mapping is constructed between input and output. This input-output pair in the training model is used to learn the system.
#### Review Segregation: We segregated two sets of reviews on which we train our model.
+ Set 0 represents reviews with label 0, i.e., ones that are not informative. These include reviews based on delivery, customer support, packaging, etc. These reviews do not describe the product.
+ Set 1 represents reviews with label 1, i.e., reviews that are informative and are better than all reviews of Set 0;
#### How we segregated and determined labels for reviews:
### `Our entire review ranking system is based on the idea that it is easier for humans to binary classify reviews which we call Set 0 and Set 1.`

For each product 'Accucheck', 'Becadexamin', 'Evion', 'Neurobion','SevenseascodLiverOil', 'Shelcal', 'Supradyn','shampoo', we asked 10 different people to label reviews as a 1 (informative review) and 0 ( not informative review). Different participants were asked to label so that there is no bias and the model learns to its best.

In [123]:
data_split = pd.crosstab(df['product'],df['label'])
data_split

label,0,1
product,Unnamed: 1_level_1,Unnamed: 2_level_1
Accucheck,310,85
Becadexamin,53,27
Evion,89,33
Neurobion,280,136
SevenseascodLiverOil,59,22
Shelcal,259,124
Supradyn,50,23
shampoo,56,49


In [154]:
 data_split[data_split.index == 'Accucheck'][1]

product
Accucheck    85
Name: 1, dtype: int64

## Building the training set:
#### We pairwise compared each review of set1 with all reviews of set0 and vice-versa
+ (Rx, Ry,1) where x∈Set1 and y∈Set0 → Rx is better than Ry
+ (Ry, Rx, 0) where x∈Set1 and y∈Set0 → Ry is worst than Rx
<br>

#### This now becomes a classification problem.

<hr>

![PairwiseRanking](Photos/PairwiseRanking.png)

In [124]:
def building_training_data(df):
    A = df[df['label']==1]
    A.loc[df['label']==1,'join'] = 'j'
    B = df[df['label']==0]
    B.loc[df['label']==0,'join'] = 'j'
    trainset1 = pd.merge(A,B,how='outer',on='join')
    trainset2 = pd.merge(B,A,how='outer',on ='join')

    trainset = pd.merge(trainset1,trainset2,how='outer')
    return trainset,trainset1,trainset2,A,B

In [125]:
product_list = df['product'].unique()
data_stack = []
for product in product_list:
    temp = deepcopy(df[df['product']==product].iloc[:,2:])
    build_data,t1,t2,A,B = building_training_data(temp)
    print(product, len(temp), len(build_data))
    build_data.drop(columns = ['join','label_y'],inplace=True)
    data = build_data.iloc[:,1:]
    data['target'] = build_data.iloc[:,0]
    data_stack.append(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Accucheck 395 52700
Becadexamin 80 2862
Evion 122 5874


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Neurobion 416 76160
SevenseascodLiverOil 81 2596


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Shelcal 383 64232
Supradyn 73 2300
shampoo 105 5488


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [126]:
#build_data

#pd.set_option = (1000,1000)

In [127]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [160]:
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 200


In [129]:
t2.head()

Unnamed: 0,label_x,review_len_x,Rn_x,Rp_x,Rs_x,Rc_x,Rd_x,Rsc_x,join,label_y,review_len_y,Rn_y,Rp_y,Rs_y,Rc_y,Rd_y,Rsc_y
0,0,7,0.540337,1.0,0.3,0.016706,0.0,0.7717,j,1,12,0.681312,-0.1,0.4,0.026253,0.0,0.1891
1,0,7,0.540337,1.0,0.3,0.016706,0.0,0.7717,j,1,8,0.407447,0.2,0.2,0.019093,0.0,0.6865
2,0,7,0.540337,1.0,0.3,0.016706,0.0,0.7717,j,1,9,0.486926,0.0,0.0,0.02148,0.0,0.5574
3,0,7,0.540337,1.0,0.3,0.016706,0.0,0.7717,j,1,6,0.499672,0.7,0.6,0.01432,0.0,0.0516
4,0,7,0.540337,1.0,0.3,0.016706,0.0,0.7717,j,1,12,0.408704,0.0,0.0,0.02864,0.0,0.1027


In [130]:
train = pd.concat(data_stack).reset_index(drop = True)

In [131]:
data_stack[7].shape

(5488, 15)

In [132]:
X = train.iloc[:,:-1].values
y = train.iloc[:,-1].values

from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,shuffle = True, stratify = y) 
print("Test Len:",len(X_test)," ",len(y_test))

Test Len: 42443   42443


In [133]:
train

Unnamed: 0,review_len_x,Rn_x,Rp_x,Rs_x,Rc_x,Rd_x,Rsc_x,review_len_y,Rn_y,Rp_y,Rs_y,Rc_y,Rd_y,Rsc_y,target
0,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,4,0.232859,0.300000,0.616667,0.005420,1.0,0.0000,1
1,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,14,1.017469,-0.100000,0.400000,0.017615,1.0,0.0000,1
2,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,5,0.319747,0.600000,0.800000,0.006775,1.0,0.4215,1
3,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,2,0.545833,0.700000,0.600000,0.002710,0.0,0.4404,1
4,5,0.544357,0.52,0.823333,0.006775,0.0,0.0000,2,0.545833,0.700000,0.600000,0.002710,0.0,0.4404,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212207,10,0.418828,0.50,0.500000,0.023866,0.0,-0.1263,15,0.857801,0.700000,0.600000,0.035800,0.0,0.1531,0
212208,10,0.418828,0.50,0.500000,0.023866,0.0,-0.1263,7,0.223776,0.316667,0.600000,0.016706,0.0,0.2382,0
212209,10,0.418828,0.50,0.500000,0.023866,0.0,-0.1263,25,0.578632,0.233333,0.255556,0.059666,0.0,0.5927,0
212210,10,0.418828,0.50,0.500000,0.023866,0.0,-0.1263,10,0.475974,0.500000,0.500000,0.023866,0.0,0.1779,0


# Spot Checking-
+ Linear Model
+ Non-Linear Model
+ Ensemble Model

<hr>

## Linear Model: Logistic Regression

In [134]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
print("Training Accuracy\n", accuracy_score(y_train,classifier.predict(X_train)))
print("Test Accuracy\n", accuracy_score(y_test,classifier.predict(X_test)))

print('CLASSIFICATION REPORT')
print("Training\n", classification_report(y_train,classifier.predict(X_train)))
print("Test \n", classification_report(y_test,classifier.predict(X_test)))



Training Accuracy
 0.8499431580559466
Test Accuracy
 0.8513771411068963
CLASSIFICATION REPORT
Training
               precision    recall  f1-score   support

           0       0.85      0.85      0.85     84885
           1       0.85      0.85      0.85     84884

    accuracy                           0.85    169769
   macro avg       0.85      0.85      0.85    169769
weighted avg       0.85      0.85      0.85    169769

Test 
               precision    recall  f1-score   support

           0       0.85      0.85      0.85     21221
           1       0.85      0.85      0.85     21222

    accuracy                           0.85     42443
   macro avg       0.85      0.85      0.85     42443
weighted avg       0.85      0.85      0.85     42443



### Accuracy: 85%
### F1-score: 85%

## Non-Linear Model: DecisionTree

In [135]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier()
classifier.fit(X_train,y_train)

print("Training Accuracy\n", accuracy_score(y_train,classifier.predict(X_train)))
print("Test Accuracy\n", accuracy_score(y_test,classifier.predict(X_test)))

print('CLASSIFICATION REPORT')
print("Training\n", classification_report(y_train,classifier.predict(X_train)))
print("Test \n", classification_report(y_test,classifier.predict(X_test)))

Training Accuracy
 0.9967485229930082
Test Accuracy
 0.9862874914591334
CLASSIFICATION REPORT
Training
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     84885
           1       1.00      0.99      1.00     84884

    accuracy                           1.00    169769
   macro avg       1.00      1.00      1.00    169769
weighted avg       1.00      1.00      1.00    169769

Test 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     21221
           1       0.99      0.99      0.99     21222

    accuracy                           0.99     42443
   macro avg       0.99      0.99      0.99     42443
weighted avg       0.99      0.99      0.99     42443



## Ensemble Model: RandomForest

In [136]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=50, n_jobs = -1, oob_score = True,random_state=42)
classifier.fit(X_train,y_train)

print("Training Accuracy\n", accuracy_score(y_train,classifier.predict(X_train)))
print("Test Accuracy\n", accuracy_score(y_test,classifier.predict(X_test)))

print('CLASSIFICATION REPORT')
print("Training\n", classification_report(y_train,classifier.predict(X_train)))
print("Test \n", classification_report(y_test,classifier.predict(X_test)))

print("Test\nConfusion Matrix: \n", confusion_matrix(y_test, classifier.predict(X_test)))

Training Accuracy
 0.9967485229930082
Test Accuracy
 0.989986570223594
CLASSIFICATION REPORT
Training
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     84885
           1       1.00      1.00      1.00     84884

    accuracy                           1.00    169769
   macro avg       1.00      1.00      1.00    169769
weighted avg       1.00      1.00      1.00    169769

Test 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     21221
           1       0.99      0.99      0.99     21222

    accuracy                           0.99     42443
   macro avg       0.99      0.99      0.99     42443
weighted avg       0.99      0.99      0.99     42443

Test
Confusion Matrix: 
 [[21013   208]
 [  217 21005]]


In [137]:
## Score of the training dataset obtained using an out-of-bag estimate. This attribute exists only when oob_score is True.
classifier.oob_score_

0.9892147565220977

In [138]:
feature_importances = pd.DataFrame(classifier.feature_importances_,
                                   index = train.iloc[:,:-1].columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances

Unnamed: 0,importance
Rd_x,0.188866
Rd_y,0.186374
review_len_x,0.117197
review_len_y,0.106647
Rc_y,0.074865
Rc_x,0.069707
Rsc_y,0.043662
Rn_y,0.042769
Rn_x,0.038853
Rsc_x,0.038356


In [139]:
dump(classifier, 'randomforest.joblib', compress = 2)

['randomforest.joblib']

## RandomForest Classifier Weights Saved. 
### Accuracy: 0.98
### oob_score: 0.98

+ Note, if in your usecase data is too small to split to train-test-split then one can train model on entire data and measure out of bag score. 

<hr>

## PART 2. Model Ranking Metric

### Accuracy of Ranking Methodology
+ After sorting the reviews by the review score, we wanted all reviews in Set 1 to be above all reviews of Set 0.
+ To test this hypothesis, we developed the following Ranking Metric
+ Let the number of 1s in our Dataset be x.
### `Ranking Accuracy on Single Product = Number of 1s found in first x positions / x`

In [140]:
classifier = load('randomforest.joblib')

In [159]:
classifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [149]:
product_list = df['product'].unique()
df['win']=0
df['lose']=0
df['review_score'] = 0.0
df.reset_index(inplace = True, drop = True)


def score_giver(C,D):
    E = pd.merge(C,D,how='outer',on='j')
    E.drop(columns=['j'],inplace = True)
    q= classifier.predict(E.values)
    return Counter(q),E

for product in product_list:
    data = df[df['product']==product]
    for indx in data.index:
        review = df.iloc[indx, 3:-3]
        review['j'] = 'jn'
        C = pd.DataFrame([review])
        D = data[data.index!=indx].iloc[:,3:-3]
        D['j'] = 'jn'
        score,E = score_giver(C,D)
        df.at[indx, 'win'] = 0 if score.get(1) is None else score.get(1)
        df.at[indx, 'lose'] = 0 if score.get(0) is None else score.get(0)
        df.at[indx, 'review_score'] = float(0 if score.get(1) is None else score.get(1)) / len(data) * 1.0

df = df.sort_values(by = ['product','review_score'], ascending = False)

r_accuracy =[]
for product in product_list:
    x = data_split[data_split.index == product][1][0]
    number_of_1_in_x = Counter(df[df['product']==product].iloc[:x, ]['label']).get(1)
    rank_accuracy = float(number_of_1_in_x*1.0 / x*1.0)
    print("Product: {} | Rank Accuracy: {}".format(product, rank_accuracy))
    r_accuracy.append(rank_accuracy)
print("Mean Rank Accuracy: {}".format(mean(r_accuracy)))

Product: shampoo | Rank Accuracy: 1.0
Product: Supradyn | Rank Accuracy: 1.0
Product: Shelcal | Rank Accuracy: 0.9516129032258065
Product: SevenseascodLiverOil | Rank Accuracy: 1.0
Product: Neurobion | Rank Accuracy: 0.9191176470588235
Product: Evion | Rank Accuracy: 1.0
Product: Becadexamin | Rank Accuracy: 1.0
Product: Accucheck | Rank Accuracy: 0.9647058823529412
Mean Rank Accuracy: 0.9794295540796965


In [163]:
len(df[df['product']==product].iloc[:x, ]['label'])

85

In [156]:
df.iloc[:, [0,1,-1]].to_csv('data/train_ranked_output.csv',index = False)

In [157]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [158]:
t = pd.read_csv('data/test.csv')