In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC, LinearSVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, log_loss
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.calibration import CalibratedClassifierCV
from sklearn.externals import joblib
import pickle
import jieba

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def save_model(Vectorizer, Vectorizer_filename, Model, Model_filename):
    print('Save model...')
    print(joblib.dump(Vectorizer, 'model/' + Vectorizer_filename + '.model'))
    print(joblib.dump(Model, 'model/'+ Model_filename +'.model'))

In [3]:
df_train = pd.read_csv('data/train.csv', lineterminator='\n')
df_train.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,Negative
1,2,ya Allah meri sister Affia ki madad farma,Positive
2,3,Yeh khud chahta a is umar main shadi krna. ha...,Negative
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,Negative
4,5,Good,Positive


In [4]:
print(len(df_train['review']))
type(df_train['review'])

6328


pandas.core.series.Series

In [5]:
# jieba.enable_parallel(64)
# df_train['review_cut'] = df_train['review'].apply(lambda x : jieba.cut(x))
# df_train['review'] = [' '.join(x) for x in df_train['review_cut']]

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
stoplist = stopwords.words('english')
sent_list = []
for sent in df_train['review']:
    word_list = word_tokenize(sent.lower())
    word_list = [word for word in word_list if word not in stoplist]
    sent_list.append(' '.join(word_list))
df_train['review'] = sent_list

In [8]:
#df_train.to_csv('train_after_preprocess.csv')

In [9]:
X = np.array([review for review in df_train['review']])
Y = np.array([1 if label == 'Positive' else 0 for label in df_train['label']])

In [10]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.3, shuffle=True)
print(X_train.shape)
print(Y_train.shape)
print(X_valid.shape)
print(Y_valid.shape)

(4429,)
(4429,)
(1899,)
(1899,)


In [11]:
tfidf = TfidfVectorizer(min_df=2, max_features=3000, ngram_range=(1, 5), use_idf=True, smooth_idf=True, token_pattern=r"(?u)\b\w+\b")
tfidf.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=2,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
#print(tfidf.vocabulary_)

In [13]:
X_train_vec = tfidf.transform(X_train)
X_valid_vec = tfidf.transform(X_valid)
X_train_all = tfidf.transform(X)
print(X_train_vec.shape)
print(X_valid_vec.shape)
print(X_train_all.shape)

(4429, 3000)
(1899, 3000)
(6328, 3000)


In [14]:
clf =  CalibratedClassifierCV(LinearSVC(), cv = 5) 
clf.fit(X_train_vec, Y_train)

CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=5, method='sigmoid')

In [15]:
print('The ACC on train set is ' + str(clf.score(X_train_vec, Y_train)))
Y_train_predict = clf.predict_proba(X_train_vec)
Y_train_predict_postive = np.array([item[1] for item in Y_train_predict])
print('AUC = ' + str(roc_auc_score(Y_train, Y_train_predict_postive)))

print('The ACC on test set is ' + str(clf.score(X_valid_vec, Y_valid)))
Y_valid_predict = clf.predict_proba(X_valid_vec)
Y_valid_predict_postive = np.array([item[1] for item in Y_valid_predict])
print('AUC = ' + str(roc_auc_score(Y_valid, Y_valid_predict_postive)))

The ACC on train set is 0.9286520659291037
AUC = 0.9802420463264945
The ACC on test set is 0.7398630858346498
AUC = 0.8173589131648424


In [16]:
#save_model(Vectorizer=tfidf, Vectorizer_filename= 'tfidf', Model= clf, Model_filename= 'SVC')

In [17]:
clf = MultinomialNB()
clf.fit(X_train_vec, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
print('The ACC on train set is ' + str(clf.score(X_train_vec, Y_train)))
Y_train_predict = clf.predict_proba(X_train_vec)
Y_train_predict_postive = np.array([item[1] for item in Y_train_predict])
print('AUC = ' + str(roc_auc_score(Y_train, Y_train_predict_postive)))

print('The ACC on test set is ' + str(clf.score(X_valid_vec, Y_valid)))
Y_valid_predict = clf.predict_proba(X_valid_vec)
Y_valid_predict_postive = np.array([item[1] for item in Y_valid_predict])
print('AUC = ' + str(roc_auc_score(Y_valid, Y_valid_predict_postive)))

The ACC on train set is 0.8475953939941296
AUC = 0.9266698935140368
The ACC on test set is 0.760926803580832
AUC = 0.8426547879332472


In [19]:
clf = xgb.XGBClassifier(max_depth=10, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.7, nthread=10, learning_rate=0.1)
clf.fit(X_train_vec, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=10, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

In [20]:
print('The ACC on train set is ' + str(clf.score(X_train_vec, Y_train)))
Y_train_predict = clf.predict_proba(X_train_vec)
Y_train_predict_postive = np.array([item[1] for item in Y_train_predict])
print('AUC = ' + str(roc_auc_score(Y_train, Y_train_predict_postive)))

print('The ACC on test set is ' + str(clf.score(X_valid_vec, Y_valid)))
Y_valid_predict = clf.predict_proba(X_valid_vec)
Y_valid_predict_postive = np.array([item[1] for item in Y_valid_predict])
print('AUC = ' + str(roc_auc_score(Y_valid, Y_valid_predict_postive)))

The ACC on train set is 0.8859787762474599
AUC = 0.9562609920036182
The ACC on test set is 0.7230121116377041
AUC = 0.7902631011189971


  if diff:
  if diff:


In [21]:
clf = MultinomialNB()
clf.fit(X_train_all, Y)
Y_train_all_predict = clf.predict_proba(X_train_all)
Y_train_all_predict_postive = np.array([item[1] for item in Y_train_all_predict])
print('AUC = ' + str(roc_auc_score(Y, Y_train_all_predict_postive)))

AUC = 0.9125389700270364


In [22]:
df_test = pd.read_csv("data/test.csv", lineterminator='\n')
df_test.head()

Unnamed: 0,ID,review
0,1,Phr tissuw se saaf
1,2,Jail Road Per Firing Se 1 Shakhs Janbahaq
2,3,mehfil loot li aunty ne
3,4,Rehnay do butt sahb nay galiya boht deni hain
4,5,Zabardast


In [23]:
sent_list = []
for sent in df_test['review']:
    word_list = word_tokenize(sent.lower())
    word_list = [word for word in word_list if word not in stoplist]
    sent_list.append(' '.join(word_list))
df_test['review'] = sent_list

In [24]:
X_test = np.array([review.lower() for review in df_test['review']])
X_test_vec = tfidf.transform(X_test)
Y_predict = clf.predict_proba(X_test_vec)
Y_predict_positive = [item[1] for item in Y_predict]
test_ids = df_test['ID']
Data = {'ID':test_ids, 'Pred':Y_predict_positive}
result = pd.DataFrame(Data, columns=['ID', 'Pred'])
result.to_csv('test_pred.csv', header = True)
result.head()

Unnamed: 0,ID,Pred
0,1,0.338767
1,2,0.006984
2,3,0.517314
3,4,0.369707
4,5,0.877772
