In [42]:
#create_folds.py
import pandas as pd
from sklearn import model_selection

# Load the IMDB reviews and keep only the first 10,000 rows.
df = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df = df.head(10000)

# Keep the original row position as a stable key; the blending step merges on it.
df['id'] = df.index

# Encode the label column only. A frame-wide replace would also rewrite any
# review whose entire text is "positive" or "negative".
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# -1 marks "no fold assigned yet"; every row is overwritten in the loop below.
df.loc[:,'kfold'] = -1

# Shuffle with a fixed seed so the fold assignment is reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

y = df.sentiment.values
skf = model_selection.StratifiedKFold(n_splits=5)

# v_ holds the positional indices of the validation rows of fold f; after
# reset_index these positions coincide with the index labels used by .loc.
for f,(t_,v_) in enumerate(skf.split(X=df,y=y)):
    df.loc[v_,'kfold'] = f


In [None]:
# Number of rows assigned to each fold.
df['kfold'].value_counts()

In [None]:
# Total number of rows in the fold-annotated frame.
df.shape[0]

In [None]:
#logistic regression model based on TfidfVectorizer features
#lr.py

import pandas as pd
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

def run_training(fold):
    """Fit TF-IDF + logistic regression on all folds except `fold`, then score `fold`.

    Reads the module-level ``df`` (columns ``id``, ``review``, ``sentiment``,
    ``kfold``) and leaves it unmodified. Prints the validation AUC.

    Parameters
    ----------
    fold : int
        ``kfold`` value of the rows held out for validation.

    Returns
    -------
    pandas.DataFrame
        The held-out rows with columns ``id``, ``sentiment``, ``kfold`` and
        ``lr_pred`` (second column of ``predict_proba``).
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Cast to str on the local text arrays instead of writing back into the
    # shared frame, so calling this function has no side effect on `df`.
    train_text = df_train.review.apply(str).values
    valid_text = df_valid.review.apply(str).values

    # Vocabulary and IDF weights come from the training folds only.
    tfv = TfidfVectorizer()
    xtrain = tfv.fit_transform(train_text)
    xvalid = tfv.transform(valid_text)

    ytrain = df_train.sentiment.values
    yvalid = df_valid.sentiment.values

    clf = linear_model.LogisticRegression()
    clf.fit(xtrain,ytrain)
    pred = clf.predict_proba(xvalid)[:,1]

    auc = metrics.roc_auc_score(yvalid,pred)
    print(f"fold={fold},auc={auc}")

    df_valid["lr_pred"] = pred
    return df_valid[['id','sentiment','kfold','lr_pred']]
# Gather the held-out predictions of every fold and stack them into one frame.
dfs = [run_training(fold_id) for fold_id in range(5)]
fin_valid_df = pd.concat(dfs)
print(fin_valid_df.shape)

In [None]:
# The concatenated index is just each fold's 0..n-1 repeated, so leave it out;
# otherwise it is read back as a stray "Unnamed: 0" column.
fin_valid_df.to_csv('./lr.csv', index=False)

In [None]:
#logistic regression model based on CountVectorizer features
#lr_cnt.py


import pandas as pd
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

def run_training(fold):
    """Fit bag-of-words counts + logistic regression on all folds except `fold`, then score `fold`.

    Reads the module-level ``df`` (columns ``id``, ``review``, ``sentiment``,
    ``kfold``) and leaves it unmodified. Prints the validation AUC.

    Parameters
    ----------
    fold : int
        ``kfold`` value of the rows held out for validation.

    Returns
    -------
    pandas.DataFrame
        The held-out rows with columns ``id``, ``sentiment``, ``kfold`` and
        ``lr_cnt_pred`` (second column of ``predict_proba``).
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Cast to str on the local text arrays instead of writing back into the
    # shared frame, so calling this function has no side effect on `df`.
    train_text = df_train.review.apply(str).values
    valid_text = df_valid.review.apply(str).values

    # Vocabulary is learned from the training folds only.
    count_vec = CountVectorizer()
    xtrain = count_vec.fit_transform(train_text)
    xvalid = count_vec.transform(valid_text)

    ytrain = df_train.sentiment.values
    yvalid = df_valid.sentiment.values

    clf = linear_model.LogisticRegression()
    clf.fit(xtrain,ytrain)
    pred = clf.predict_proba(xvalid)[:,1]

    auc = metrics.roc_auc_score(yvalid,pred)
    print(f"fold={fold},auc={auc}")

    df_valid["lr_cnt_pred"] = pred
    return df_valid[['id','sentiment','kfold','lr_cnt_pred']]
# Gather the held-out predictions of every fold and stack them into one frame.
dfs = [run_training(fold_id) for fold_id in range(5)]
fin_valid_df = pd.concat(dfs)
print(fin_valid_df.shape)

In [None]:
# The concatenated index is just each fold's 0..n-1 repeated, so leave it out;
# otherwise it is read back as a stray "Unnamed: 0" column.
fin_valid_df.to_csv('./lr_cnt.csv', index=False)

In [None]:
#Random Forest model based on SVD-reduced TF-IDF features
#rf_svd.py


import pandas as pd
from sklearn import decomposition 
from sklearn import ensemble
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics


def run_training(fold):
    """Fit TF-IDF -> TruncatedSVD -> random forest on all folds except `fold`, then score `fold`.

    Reads the module-level ``df`` (columns ``id``, ``review``, ``sentiment``,
    ``kfold``) and leaves it unmodified. Prints the validation AUC.

    Parameters
    ----------
    fold : int
        ``kfold`` value of the rows held out for validation.

    Returns
    -------
    pandas.DataFrame
        The held-out rows with columns ``id``, ``sentiment``, ``kfold`` and
        ``rf_svd_pred`` (second column of ``predict_proba``).
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Cast to str on the local text arrays instead of writing back into the
    # shared frame, so calling this function has no side effect on `df`.
    train_text = df_train.review.apply(str).values
    valid_text = df_valid.review.apply(str).values

    # Vocabulary and IDF weights come from the training folds only.
    tfv = TfidfVectorizer()
    xtrain = tfv.fit_transform(train_text)
    xvalid = tfv.transform(valid_text)

    # Project the sparse TF-IDF matrix onto 120 components fitted on the
    # training folds. The seed is fixed so repeated runs give the same features.
    svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
    svd.fit(xtrain)
    xtrain_svd = svd.transform(xtrain)
    xvalid_svd = svd.transform(xvalid)

    ytrain = df_train.sentiment.values
    yvalid = df_valid.sentiment.values

    # Seeded as well, so the saved predictions can be regenerated exactly.
    clf = ensemble.RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=42)
    clf.fit(xtrain_svd,ytrain)
    pred = clf.predict_proba(xvalid_svd)[:,1]

    auc = metrics.roc_auc_score(yvalid,pred)
    print(f"fold={fold},auc={auc}")

    df_valid["rf_svd_pred"] = pred
    return df_valid[['id','sentiment','kfold','rf_svd_pred']]
# Gather the held-out predictions of every fold and stack them into one frame.
dfs = [run_training(fold_id) for fold_id in range(5)]
fin_valid_df = pd.concat(dfs)
print(fin_valid_df.shape)


In [None]:
# The concatenated index is just each fold's 0..n-1 repeated, so leave it out;
# otherwise it is read back as a stray "Unnamed: 0" column.
fin_valid_df.to_csv('./rf_svd.csv', index=False)

In [43]:
#blending.py
# Out-of-fold predictions of the three base models, one CSV per model.
df = pd.read_csv('../input/model-predblending-eg/lr.csv')
df2 = pd.read_csv('../input/model-predblending-eg/lr_cnt.csv')
df3 = pd.read_csv('../input/model-predblending-eg/rf_svd.csv')

# Take only the key and the prediction column from the extra files. Merging the
# whole frames makes the shared columns (sentiment, kfold, ...) collide and get
# _x/_y suffixes, so `df.sentiment` / `df.kfold` would exist only by accident of
# how many files are merged.
df = df.merge(df2[['id','lr_cnt_pred']],on='id',how='left')
df = df.merge(df3[['id','rf_svd_pred']],on='id',how='left')

In [7]:
# AUC of each base model's predictions over the whole merged frame.
pred_cols = ['lr_pred','lr_cnt_pred','rf_svd_pred']
for col in pred_cols:
    auc = metrics.roc_auc_score(
        df['sentiment'].values,
        df[col].values,
    )
    print(f'pred_cols={col},overall auc={auc}')

In [8]:
#average
import numpy as np
# Unweighted blend: row-wise mean of the three model predictions.
targets = df['sentiment'].values
avg_pred = df[['lr_pred','lr_cnt_pred','rf_svd_pred']].values.mean(axis=1)
avg_auc = metrics.roc_auc_score(targets,avg_pred)
print(avg_auc)

In [9]:
#weighted averaging with hand-picked weights (2:1:1)
# Pull the three prediction vectors out of the merged frame.
lr_pred, lr_cnt_pred, rf_svd_pred = (
    df[name].values for name in ('lr_pred','lr_cnt_pred','rf_svd_pred')
)
# The TF-IDF logistic regression counts twice; divide by the weight total.
avg_pred = (2*lr_pred+lr_cnt_pred+rf_svd_pred)/4
print(metrics.roc_auc_score(targets,avg_pred))

In [10]:
#weighted averaging with hand-picked weights (4:1:1)
# Pull the three prediction vectors out of the merged frame.
lr_pred, lr_cnt_pred, rf_svd_pred = (
    df[name].values for name in ('lr_pred','lr_cnt_pred','rf_svd_pred')
)
# The TF-IDF logistic regression counts four times; divide by the weight total.
avg_pred = (4*lr_pred+lr_cnt_pred+rf_svd_pred)/6
print(metrics.roc_auc_score(targets,avg_pred))

In [11]:
#rank averaging
# Replace every prediction by its rank within its own column, then average the
# ranks, so each model contributes on the same scale.
lr_pred, lr_cnt_pred, rf_svd_pred = (
    df[name].rank().values for name in ('lr_pred','lr_cnt_pred','rf_svd_pred')
)
avg_pred = (lr_pred+lr_cnt_pred+rf_svd_pred)/3
print(metrics.roc_auc_score(targets,avg_pred))

In [21]:
#finding best weights--optimization function
from scipy.optimize import fmin
from functools import partial
class OptimizeAUC:
    """Search for per-column weights whose weighted row sum maximises ROC AUC.

    The weights are found with ``scipy.optimize.fmin`` starting from a random
    Dirichlet draw, minimising the negated AUC.

    Parameters
    ----------
    random_state : int or None, default None
        Seed for the starting weights. ``None`` draws from NumPy's global
        random state, exactly as before this parameter existed.

    Attributes
    ----------
    coef_ : numpy.ndarray
        One weight per input column, set by ``fit`` (``0`` until then).
    """

    def __init__(self, random_state=None):
        self.coef_ = 0
        self.random_state = random_state

    def _auc(self,coef,X,y):
        """Return the negated AUC of ``X`` weighted column-wise by ``coef`` and summed per row."""
        x_coef = X*coef
        predictions = np.sum(x_coef,axis=1)
        auc_score = metrics.roc_auc_score(y,predictions)
        # fmin minimises, so hand it the negative of the score to maximise.
        return -1.0*auc_score

    def fit(self,X,y):
        """Optimise the weights on ``X`` (one column per model) against labels ``y``.

        Returns
        -------
        OptimizeAUC
            ``self``, so calls can be chained.
        """
        partial_loss = partial(self._auc,X=X,y=y)
        # Use a private generator only when a seed was requested; otherwise keep
        # drawing from the global state so existing callers behave the same.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        init_coef = rng.dirichlet(np.ones(X.shape[1]))
        self.coef_ = fmin(partial_loss,init_coef,disp=True)
        return self

    def predict(self,X):
        """Return the per-row weighted sum of ``X`` using the fitted ``coef_``."""
        x_coef = X * self.coef_
        predictions = np.sum(x_coef,axis=1)
        return predictions
    

In [34]:
def run_training(pred_df,fold):
    """Fit blending weights on every fold but `fold` and print the AUC on `fold`.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Frame with ``kfold``, ``sentiment`` and the three prediction columns
        ``lr_pred``, ``lr_cnt_pred``, ``rf_svd_pred``.
    fold : int
        ``kfold`` value of the rows held out for evaluation.

    Returns
    -------
    The ``coef_`` found by ``OptimizeAUC`` on the remaining folds.
    """
    feature_cols = ['lr_pred','lr_cnt_pred','rf_svd_pred']

    held_out = pred_df.kfold == fold
    valid_df = pred_df[held_out].reset_index(drop=True)
    train_df = pred_df[pred_df.kfold != fold].reset_index(drop=True)

    # Weights are searched on the training folds only.
    opt = OptimizeAUC()
    opt.fit(train_df[feature_cols].values, train_df.sentiment.values)

    # Score the blend on the held-out fold.
    preds = opt.predict(valid_df[feature_cols].values)
    auc = metrics.roc_auc_score(valid_df.sentiment.values,preds)
    print(f"{fold},{auc}")

    return opt.coef_

In [39]:
# One weight vector per held-out fold, stacked as rows of a 2-D array.
coefs = np.array([run_training(df,fold_id) for fold_id in range(5)])
print(coefs)

In [40]:
# Average the per-fold weight vectors into one set of blend weights.
# atleast_2d makes the cell safe to re-run: an already-averaged 1-D vector is
# treated as a single row and comes back unchanged instead of collapsing to a
# scalar (which would break the coefs[i] indexing that follows).
coefs = np.mean(np.atleast_2d(coefs),axis=0)
print(coefs)

In [45]:
# Blend the three models with the averaged weights and score the result.
wt_avg = (
    coefs[0]*df['lr_pred'].values
    + coefs[1]*df['lr_cnt_pred'].values
    + coefs[2]*df['rf_svd_pred'].values
)
print("optimal auc after optimize coefs")
print(metrics.roc_auc_score(targets,wt_avg))

In [None]:
#xgb_model.py

import xgboost as xbg
