In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from sklearn.metrics import precision_score
from math import sqrt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

In [2]:
# Load the raw training table and report its size before any filtering.
train_df = pd.read_csv('./train.csv', encoding='utf-8')
print(train_df.shape)

# Keep only the text column and the label column, and only the rows whose label
# is exactly the string 'True' or 'False' (labels are compared as strings here).
# .copy() makes the result an independent frame, so the column assignments done
# in later cells do not write into a slice of the original table.
train_df = train_df.loc[:, ['name', 'isOrg']]
train_df = train_df[train_df['isOrg'].isin(['True', 'False'])].copy()
print(train_df.shape)

(94880, 9)
(92997, 2)


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def remove_punctuation(line):
    """Return ``line`` with ASCII punctuation turned into word separators.

    Every character listed in ``string.punctuation`` is replaced by a space,
    then runs of whitespace are collapsed so the result contains the remaining
    words joined by single spaces, with no leading or trailing whitespace.
    Non-ASCII punctuation is left untouched.
    """
    punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    words = line.translate(punctuation_to_space).split()
    return " ".join(words)

# Normalise every name: cast to str, lower-case, then strip ASCII punctuation.
train_df['name'] = train_df['name'].map(
    lambda raw_name: remove_punctuation(str(raw_name).lower())
)

In [153]:
# Lower-case the names. When the punctuation-removal cell has already run this
# changes nothing, because that cell lower-cases the names as well.
train_df['name'] = train_df['name'].map(lambda raw_name: str(raw_name).lower())

In [154]:
# Class balance of the label column after filtering.
train_df['isOrg'].value_counts()

True     86840
False     6157
Name: isOrg, dtype: int64

In [139]:
# First whitespace-separated token of each name.
# The emptiness check is done on the token list rather than on the raw string,
# so a whitespace-only name no longer raises IndexError; names without any token
# get an empty string, which keeps the column a plain (hashable) string column.
train_df['first_word'] = train_df['name'].apply(
    lambda x: next(iter(x.split()), '')
)

In [5]:
# Hold out 20% of the names for evaluation. Stratifying on the label keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['name'],
    train_df['isOrg'],
    stratify=train_df['isOrg'],
    test_size=0.2,
    random_state=666,
)

In [69]:
# TF-IDF features fed into a linear SVM trained with SGD (hinge loss, L2 penalty).
tfidf_svm_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=0.00001,
        random_state=666,
        max_iter=15,
        class_weight=None,
    )),
])

# Search space: n-gram range, IDF weighting on/off, regularisation strength
# and class weighting (2 * 2 * 3 * 2 = 24 candidates).
parameters_svm = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'svm__alpha': (1e-4, 1e-5, 1e-6),
    'svm__class_weight': (None, "balanced"),
}

# The "average_precision" scorer is given 0/1 targets, so the string labels are
# mapped to integers just for the search.
gs_svm = GridSearchCV(
    tfidf_svm_pipeline,
    parameters_svm,
    scoring="average_precision",
    n_jobs=-1,
    cv=5,
    verbose=10,
    return_train_score=True,
).fit(X_train, y_train.map(lambda label: 1 if label == 'True' else 0))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   44.1s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   59.7s
[Parallel(n_jobs=-1)]: Done 118 out of 120 | elapsed:  1.2min remaining:    1.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.2min finished


In [70]:
# Grid-search results, best mean cross-validated score first, restricted to the
# score and the four tuned parameters.
(
    pd.DataFrame(gs_svm.cv_results_)
    .loc[:, ["mean_test_score",
             "param_tfidf__ngram_range",
             "param_tfidf__use_idf",
             "param_svm__alpha",
             "param_svm__class_weight"]]
    .sort_values(by="mean_test_score", ascending=False)
)

Unnamed: 0,mean_test_score,param_tfidf__ngram_range,param_tfidf__use_idf,param_svm__alpha,param_svm__class_weight
9,0.999991,"(1, 1)",False,1e-05,
8,0.999991,"(1, 1)",True,1e-05,
12,0.999991,"(1, 1)",True,1e-05,balanced
11,0.999991,"(1, 2)",False,1e-05,
13,0.999991,"(1, 1)",False,1e-05,balanced
4,0.99999,"(1, 1)",True,0.0001,balanced
5,0.99999,"(1, 1)",False,0.0001,balanced
10,0.99999,"(1, 2)",True,1e-05,
15,0.99999,"(1, 2)",False,1e-05,balanced
14,0.99999,"(1, 2)",True,1e-05,balanced


In [15]:
# Bag-of-words counts (unigrams) + linear SVM trained with SGD.
pipeline_cv_svm = Pipeline(steps=[
    ('cv', CountVectorizer(ngram_range=(1, 1))),
    ('svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=0.0001,
        random_state=666,
        max_iter=15,
        class_weight=None,
    )),
])

# 5-fold average precision on the training part; the scorer is given 0/1 targets.
print(
    'cross_val_score = ',
    cross_val_score(
        pipeline_cv_svm,
        X_train,
        y_train.map(lambda label: 1 if label == 'True' else 0),
        scoring="average_precision",
        cv=5,
        n_jobs=-1,
    ).mean(),
)

# Refit on the whole training part (string labels) and score the held-out part:
# precision for each class, plus the square root of their product.
pipeline_cv_svm.fit(X_train, y_train)
predictionSVM = pipeline_cv_svm.predict(X_test)
label1_score, label0_score = (
    precision_score(y_test, predictionSVM, pos_label=positive)
    for positive in ('True', 'False')
)
print('label1_score = ', label1_score)
print('label0_score = ', label0_score)
print('squared and multiplied', sqrt(label1_score*label0_score))

cross_val_score =  0.9999906322490453
label1_score =  0.9968423469973591
label0_score =  0.9949238578680203
squared and multiplied 0.9958826404556033


In [9]:
# TF-IDF (unigrams, IDF weighting on) + linear SVM trained with SGD.
pipeline_tfidf_svm = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), use_idf=True)),
    ('svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=0.00001,
        random_state=666,
        max_iter=15,
        class_weight=None,
    )),
])

# 5-fold average precision on the training part; the scorer is given 0/1 targets.
print(
    'cross_val_score = ',
    cross_val_score(
        pipeline_tfidf_svm,
        X_train,
        y_train.map(lambda label: 1 if label == 'True' else 0),
        scoring="average_precision",
        cv=5,
        n_jobs=1,
    ).mean(),
)

# Refit on the whole training part (string labels) and score the held-out part:
# precision for each class, plus the square root of their product.
pipeline_tfidf_svm.fit(X_train, y_train)
predictionSVM = pipeline_tfidf_svm.predict(X_test)
label1_score, label0_score = (
    precision_score(y_test, predictionSVM, pos_label=positive)
    for positive in ('True', 'False')
)
print('label1_score = ', label1_score)
print('label0_score = ', label0_score)
print('squared and multiplied', sqrt(label1_score*label0_score))

cross_val_score =  0.9999907595056274
label1_score =  0.997128252254322
label0_score =  0.9932716568544996
squared and multiplied 0.9951980864195238


In [10]:
from collections import defaultdict
from math import log

class NBClassifier(object):
    """Small hand-rolled Naive-Bayes-style text classifier.

    ``fit`` takes already tokenised samples, while ``predict`` takes raw strings
    and tokenises them with ``nltk.word_tokenize``.

    Attributes:
        alpha: likelihood used for a (class, token) pair never seen in training.
        classes: maps each label to its share of the training samples.
        prob: maps ``(label, token)`` to the number of occurrences of the token
            in samples of that label divided by the number of such samples.
            NOTE(review): a token repeated inside samples can push this value
            above 1 -- confirm that this estimator is the intended one.
    """

    def __init__(self, alpha=10**(-7)):
        self.alpha = alpha
        # float() == 0.0, same default as before but without a lambda factory.
        self.classes = defaultdict(float)
        self.prob = defaultdict(float)

    def fit(self, data_set):
        """Estimate class priors and per-class token scores.

        Args:
            data_set: iterable of ``(tokens, label)`` pairs; it is consumed
                once, so a generator works as well as a list.

        Returns:
            ``self``, to allow ``NBClassifier().fit(...).predict(...)``.
        """
        # Start from a clean state: without this a second fit() would add raw
        # counts on top of the already normalised values of the previous fit.
        self.classes = defaultdict(float)
        self.prob = defaultdict(float)

        # Count samples while iterating instead of calling len(), so that
        # iterables without a length are accepted too.
        n_samples = 0
        for feats, label in data_set:
            n_samples += 1
            self.classes[label] += 1
            for feat in feats:
                self.prob[label, feat] += 1

        # Normalise token counts by the per-class sample count first (the class
        # counts must still be raw here), then turn the class counts into priors.
        for label, feat in self.prob:
            self.prob[label, feat] /= self.classes[label]
        for cl in self.classes:
            self.classes[cl] /= n_samples
        return self

    def get_class(self, feats):
        """Return the label with the lowest negative log score for ``feats``.

        Args:
            feats: iterable of tokens of a single sample.

        Raises:
            ValueError: if the classifier has not been fitted on any sample.
        """
        if not self.classes:
            raise ValueError("NBClassifier has no classes: call fit() with a non-empty data set first")

        def get_log_prob(cl):
            # .get() (not indexing) so that unseen pairs fall back to alpha
            # without being inserted into the defaultdict.
            return -log(self.classes[cl]) + \
                   sum(-log(self.prob.get((cl, feat), self.alpha)) for feat in feats)

        return min(self.classes.keys(), key=get_log_prob)

    def predict(self, data_to_predict):
        """Tokenise each raw string and return the list of predicted labels."""
        return [self.get_class(nltk.word_tokenize(words)) for words in data_to_predict]

In [11]:
# Pair each tokenised training name with its label, the input format of NBClassifier.fit.
data_to_fit = list(zip(map(nltk.word_tokenize, X_train), y_train))

# Train on the training part and score the held-out part: precision for each
# class, plus the square root of their product.
NBCpredicted = (
    NBClassifier()
    .fit(data_to_fit)
    .predict(X_test)
)
label1_score, label0_score = (
    precision_score(y_test, NBCpredicted, pos_label=positive)
    for positive in ('True', 'False')
)
print('label1_score = ', label1_score)
print('label0_score = ', label0_score)
print('squared and multiplied', sqrt(label1_score*label0_score))

label1_score =  0.9978153386225135
label0_score =  0.9892205638474295
squared and multiplied 0.9935086571780724


In [75]:
from nltk.stem.snowball import RussianStemmer

stemmer = RussianStemmer()


def stem_all_words(name: str):
    """Stem every whitespace-separated token of ``name`` and re-join with single spaces."""
    return " ".join(map(stemmer.stem, name.split()))


# NOTE: this overwrites X_train / X_test, so every cell executed after this one
# works on stemmed names.
X_train = X_train.map(stem_all_words)
X_test = X_test.map(stem_all_words)

# Rebuild the (tokens, label) pairs from the stemmed names.
data_to_fit = list(zip(map(nltk.word_tokenize, X_train), y_train))

# Same evaluation as for the unstemmed run: precision for each class, plus the
# square root of their product.
NBCpredicted = (
    NBClassifier()
    .fit(data_to_fit)
    .predict(X_test)
)
label1_score, label0_score = (
    precision_score(y_test, NBCpredicted, pos_label=positive)
    for positive in ('True', 'False')
)
print('label1_score = ', label1_score)
print('label0_score = ', label0_score)
print('squared and multiplied', sqrt(label1_score*label0_score))

label1_score =  0.997127592347906
label0_score =  0.9899413243922883
squared and multiplied 0.9935279610343032


In [61]:
# Load the unlabeled names.
unknown_df = pd.read_csv('./test.csv', encoding='utf-8')

# Append the text found in the columns 'Unnamed: 2' .. 'Unnamed: 5' to the name;
# cells whose string form is 'nan' contribute nothing.
for col in ['Unnamed: {}'.format(position) for position in range(2, 6)]:
    extra_text = unknown_df[col].map(lambda cell: '' if str(cell) == 'nan' else str(cell))
    unknown_df['name'] = unknown_df['name'] + extra_text

# Apply the same normalisation as for the training names: str, lower-case, no punctuation.
unknown_names = unknown_df['name'].map(
    lambda raw_name: remove_punctuation(str(raw_name).lower())
)

In [92]:
# data_to_fit was rebuilt from stemmed training names, so the unlabeled names
# have to go through the same stemming before prediction; otherwise inflected
# tokens would not match the stemmed tokens the classifier was trained on.
unknown_names_stemmed = unknown_names.apply(stem_all_words)
NBCpredicted = NBClassifier().fit(data_to_fit).predict(unknown_names_stemmed)

# Store the (unnormalised) name next to its predicted label.
pd.DataFrame({'name': unknown_df.name, 'prediction': NBCpredicted}).to_csv('prediction.csv', index=False)

In [90]:
# Both SVM pipelines were fitted on lower-cased names with punctuation removed,
# so predict on unknown_names (same normalisation) instead of names that were
# only lower-cased.
SVMpred = pipeline_cv_svm.predict(unknown_names)
SVMpred_tfidf = pipeline_tfidf_svm.predict(unknown_names)

In [93]:
# Write the TF-IDF SVM predictions next to the (unnormalised) names.
svm_submission = pd.DataFrame({'name': unknown_df['name'], 'prediction': SVMpred_tfidf})
svm_submission.to_csv('prediction_svm.csv', index=False)

In [159]:
# TfidfTransformer is imported here because no earlier cell imports it; without
# this line the cell fails with NameError when the notebook is run top to bottom.
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Unigram counts -> TF-IDF weights -> multinomial Naive Bayes with light smoothing.
p = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer()),
        ('nb', MultinomialNB(alpha=0.0001))
    ])

# Train on the training part and score the held-out part: precision for each
# class, plus the square root of their product.
p.fit(X_train, y_train)
predicted = p.predict(X_test)
label1_score = precision_score(y_test, predicted, pos_label='True')
label0_score = precision_score(y_test, predicted, pos_label='False')
print('label1_score = ', label1_score)
print('label0_score = ', label0_score)
print('squared and multiplied', sqrt(label1_score*label0_score))

label1_score =  0.9967243261881501
label0_score =  0.9791492910758965
squared and multiplied 0.9878977261767676


In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

# Unigram counts -> TF-IDF weights -> random forest of 1000 trees.
# The forest is seeded with the same value as the other models in this notebook
# so that re-running the cell reproduces the same scores.
p = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(n_estimators=1000, random_state=666))
    ])

# Train on the training part and score the held-out part: precision for each
# class, plus the square root of their product.
p.fit(X_train, y_train)
predicted = p.predict(X_test)
label1_score = precision_score(y_test, predicted, pos_label='True')
label0_score = precision_score(y_test, predicted, pos_label='False')
print('label1_score = ', label1_score)
print('label0_score = ', label0_score)
print('squared and multiplied', sqrt(label1_score*label0_score))

label1_score =  0.9952976258745269
label0_score =  0.9888123924268503
squared and multiplied 0.9920497097513586
