In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import pipeline,metrics,decomposition
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.linear_model import LogisticRegression

In [3]:
def confusion_matrix(rate_a,rate_b,min_rating=None,max_rating=None):
    """Count how often each pair of ratings occurs across two raters.

    Parameters
    ----------
    rate_a, rate_b : sequences of integer ratings with the same length
        (plain lists, tuples or numpy integer arrays).
    min_rating, max_rating : optional inclusive bounds of the rating scale.
        When omitted they are taken from the smallest / largest value found
        in either sequence.

    Returns
    -------
    list of list of int
        Square matrix ``m`` where ``m[i][j]`` is the number of positions at
        which ``rate_a`` holds ``min_rating + i`` and ``rate_b`` holds
        ``min_rating + j``.

    Raises
    ------
    AssertionError
        If the two sequences differ in length.
    ValueError
        If a bound has to be inferred from an empty sequence.
    """
    assert(len(rate_a)==len(rate_b))
    # Take the bounds over each sequence separately. `rate_a+rate_b` only
    # concatenates plain lists; for numpy arrays it adds element-wise, which
    # yields wrong bounds and therefore a wrongly sized matrix.
    if min_rating is None:
        min_rating=min(min(rate_a),min(rate_b))
    if max_rating is None:
        max_rating=max(max(rate_a),max(rate_b))
    num_rating=int(max_rating-min_rating+1)
    confu_matrix=[[0]*num_rating for _ in range(num_rating)]
    for a,b in zip(rate_a,rate_b):
        # Shift by min_rating so the lowest rating maps to row/column 0.
        confu_matrix[a-min_rating][b-min_rating]+=1
    return confu_matrix

In [4]:
def histogram(rating,min_rating=None,max_rating=None):
    """Count the occurrences of every rating value on the scale.

    Parameters
    ----------
    rating : sequence of integer ratings.
    min_rating, max_rating : optional inclusive bounds of the scale; each
        defaults to the smallest / largest value present in ``rating``.

    Returns
    -------
    list of int
        ``counts[k]`` is the number of entries equal to ``min_rating + k``.
    """
    low=min(rating) if min_rating is None else min_rating
    high=max(rating) if max_rating is None else max_rating
    counts=[0]*int(high-low+1)
    for value in rating:
        # Offset by the lower bound so the smallest rating lands in slot 0.
        counts[value-low]+=1
    return counts

In [5]:
def quadratic_weighted_kappa(y,y_pred):
    """Quadratic weighted kappa between two sets of integer ratings.

    Both inputs are converted to integer numpy arrays. The rating scale runs
    from the smallest to the largest value seen in either input. Disagreements
    are weighted by ``(i - j)**2 / (num_rating - 1)**2`` and the observed
    weighted disagreement is compared with the one expected from the two
    marginal histograms.

    Parameters
    ----------
    y : sequence of true ratings.
    y_pred : sequence of predicted ratings, same length as ``y``.

    Returns
    -------
    float
        ``1.0 - observed / expected``; 1.0 means complete agreement. When
        every rating in both inputs is the same single value, 1.0 is returned
        directly, because the weights are undefined on a one-point scale.

    Raises
    ------
    AssertionError
        If the inputs differ in length.
    ValueError
        If the inputs are empty.
    """
    rate_a=np.array(y,dtype=int)
    rate_b=np.array(y_pred,dtype=int)
    assert(len(rate_a)==len(rate_b))
    min_rating=min(min(rate_a),min(rate_b))
    max_rating=max(max(rate_a),max(rate_b))
    conf_max=confusion_matrix(rate_a,rate_b,min_rating,max_rating)
    num_rating=len(conf_max)
    if num_rating==1:
        # Only one distinct rating overall: both raters agree everywhere and
        # the weight formula below would divide 0.0 by 0.0.
        return 1.0
    num_scores_item=float(len(rate_a))
    hist_a=histogram(rate_a,min_rating,max_rating)
    hist_b=histogram(rate_b,min_rating,max_rating)

    numerator=0.0
    denominator=0.0
    for i in range(num_rating):
        for j in range(num_rating):
            # Quadratic penalty, normalised so the largest gap weighs 1.
            d=pow(i-j,2.0)/pow(num_rating-1,2.0)
            # Count expected in cell (i, j) if the two raters were independent.
            expected=(hist_a[i]*hist_b[j]/num_scores_item)
            numerator+=d*conf_max[i][j]/num_scores_item
            denominator+=d*expected/num_scores_item
    return (1.0-numerator/denominator)

In [8]:
if __name__=='__main__':
    # Load the training and test tables from the working directory.
    train=pd.read_csv('train.csv')
    test=pd.read_csv('test.csv')

    # Keep the test ids for the submission file, then drop the id columns.
    idx=test.id.values.astype(int)
    train=train.drop(labels='id',axis=1)
    test=test.drop(labels='id',axis=1)

    # Target is the median relevance; both label columns leave the features.
    y=train.median_relevance.values
    train=train.drop(['median_relevance','relevance_variance'],axis=1)

    # One text document per row: the query followed by the product title.
    join_text=lambda row:'%s %s' % (row['query'],row['product_title'])
    traindata=list(train.apply(join_text,axis=1))
    testdata=list(test.apply(join_text,axis=1))

    # Boolean flags are passed as real booleans: integer 0/1 values are
    # rejected by the parameter validation of recent scikit-learn releases.
    tfv=TfidfVectorizer(strip_accents='unicode',analyzer='word',stop_words='english',
                        token_pattern=r'\w{1,}',ngram_range=(1,5),min_df=3,max_features=None,
                        use_idf=True,smooth_idf=True,sublinear_tf=True)
    x=tfv.fit_transform(traindata)
    x_test=tfv.transform(testdata)

    # Fixed seed so the randomized SVD gives the same result on every run.
    svd=TruncatedSVD(random_state=0)
    scl=StandardScaler()
    svm_model=SVC()

    clf=pipeline.Pipeline([('svd',svd),
                          ('scl',scl),
                          ('svm',svm_model)])

    param_grid={'svd__n_components':[200,400],
               'svm__C':[10,12]}

    kappa_scorer=metrics.make_scorer(quadratic_weighted_kappa,greater_is_better=True)

    # `iid` is no longer passed: the argument was removed from GridSearchCV
    # in scikit-learn 0.24 and raises TypeError there.
    model=GridSearchCV(estimator=clf,param_grid=param_grid,scoring=kappa_scorer,
                       n_jobs=-1,cv=2,refit=True,verbose=10)
    model.fit(x,y)
    print('Best scores : %0.3f' % model.best_score_)
    print('Best parameter :')
    best_param=model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_param[param_name]))

    # refit=True has already retrained the best pipeline on all of (x, y),
    # so it can predict directly without a second fit.
    best_model=model.best_estimator_
    pred=best_model.predict(x_test)
    solution = pd.DataFrame({"id": idx, "prediction": pred})
    solution.to_csv("solution.csv", index=False)

  (0, 15657)	0.261742125188547
  (0, 15656)	0.261742125188547
  (0, 15655)	0.1956813789955167
  (0, 15652)	0.1900282943976819
  (0, 13949)	0.2377917675212798
  (0, 13946)	0.14414965478495423
  (0, 12714)	0.1557160855343555
  (0, 8359)	0.2685203837650798
  (0, 8355)	0.1811466520220755
  (0, 5546)	0.18429960671355355
  (0, 5476)	0.261742125188547
  (0, 5475)	0.261742125188547
  (0, 5472)	0.194998738484414
  (0, 3083)	0.261742125188547
  (0, 3082)	0.261742125188547
  (0, 3081)	0.1956813789955167
  (0, 3079)	0.1956813789955167
  (0, 3078)	0.19433110228300043
  (0, 2531)	0.09900061546307047
  (0, 1490)	0.261742125188547
  (0, 1489)	0.2083450387130452
  (1, 18641)	0.21354786834034664
  (1, 17565)	0.19012113556081284
  (1, 15165)	0.11336238974799137
  (1, 12035)	0.2508803544565089
  :	:
  (10156, 18162)	0.3363994745465213
  (10156, 16052)	0.30536722776026076
  (10156, 12860)	0.39873593048708295
  (10156, 6952)	0.3525856061591716
  (10156, 6950)	0.32808536255146276
  (10156, 3180)	0.5803297497

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   41.0s remaining:  2.0min
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:   41.3s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:   42.1s remaining:   42.1s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:  2.0min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:  2.1min remaining:   41.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  2.1min finished


Best scores : 0.556
Best parameter :
	svd__n_components: 400
	svm__C: 12
