# Binary classification

In [217]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
import matplotlib.pyplot as plt
import seaborn
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

In [157]:
import gensim
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.metrics import classification_report

In [178]:
# Single seed passed as random_state to the classifier and to train_test_split below.
SEED = 1337

### Home task: Spam detection

Для заданной тестовой выборки построить модель для предсказания, является ли SMS-сообщение спамом.  
На заданном разбиении (df_train, df_test) ваша модель должна превзойти baseline'ы, приведенные ниже.  

Чем больше baseline'ов вы превзойдете, тем выше ваша оценка.  
Метрика качества: F1.


baseline 1: 0.9444      bag of words + Multinomial Naive Bayes  
baseline 2: 0.9490      symbol 3-grams with IDF and l2-norm + Logistic Regression  
baseline 3: 0.9636      text stemming + baseline 2  


! Your results must be reproducible. Если ваша модель стохастическая (как, например, LogisticRegression), то вы должны явно задавать все seed и random_state в параметрах моделей.  
! Вы должны использовать df_test только для измерения качества конечной обученной модели. 

In [234]:
# Read the raw SMS corpus, keep only the label and message columns,
# and give them descriptive names.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'target', 'v2': 'text'})
)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [235]:
class TfidfEmbeddingVectorizer(object):
    """Represent each document as the mean of its IDF-weighted token embeddings.

    Parameters
    ----------
    word2vec : mapping-like
        Anything supporting ``token in obj`` and ``obj[token]`` (for example a
        plain ``dict`` of token -> vector). If the object exposes a ``wv``
        attribute (a full gensim model), lookups go through ``wv``.
    dim : int, optional
        Length of the zero vector used for documents that contain no known
        token. When omitted it is inferred from the embeddings so that every
        row returned by ``transform`` has the same length.
    """

    # Historical hard-coded dimensionality; only used when nothing can be inferred.
    _FALLBACK_DIM = 400

    def __init__(self, word2vec, dim=None):
        self.word2vec = word2vec
        self.word2weight = None
        if dim is None:
            dim = self._infer_dim(self._lookup_table(word2vec))
        self.dim = dim

    @staticmethod
    def _lookup_table(word2vec):
        """Return the object that actually maps tokens to vectors."""
        return getattr(word2vec, "wv", word2vec)

    @classmethod
    def _infer_dim(cls, vectors):
        """Best-effort guess of the embedding length for ``vectors``."""
        size = getattr(vectors, "vector_size", None)
        if size is not None:
            return int(size)
        values = getattr(vectors, "values", None)
        if callable(values):
            # Plain mapping: the length of any stored vector is the dimension.
            for vec in values():
                return len(vec)
        return cls._FALLBACK_DIM

    def fit(self, X, y=None):
        """Learn one IDF weight per token from the documents in ``X``.

        ``X`` is an iterable of documents, each itself an iterable of tokens.
        The identity analyzer means a document is consumed exactly as given,
        so a raw string is split into single characters. Tokens not seen
        during ``fit`` later receive the largest IDF observed here.
        ``y`` is ignored and only accepted for pipeline compatibility.
        """
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        idf = tfidf.idf_
        max_idf = max(idf)
        self.word2weight = defaultdict(
            lambda: max_idf,
            ((token, idf[index]) for token, index in tfidf.vocabulary_.items()))

        return self

    def transform(self, X):
        """Return an array with one averaged, IDF-weighted embedding per document.

        Tokens missing from the embeddings are skipped; a document with no
        known token at all becomes a zero vector of length ``self.dim``.
        """
        vectors = self._lookup_table(self.word2vec)
        rows = []
        for words in X:
            weighted = [vectors[w] * self.word2weight[w]
                        for w in words if w in vectors]
            if weighted:
                rows.append(np.mean(weighted, axis=0))
            else:
                rows.append(np.zeros(self.dim))
        return np.array(rows)

In [236]:
# Train the embeddings consumed by the vectorizer.
# seed=SEED together with workers=1 makes training repeatable: with several
# worker threads the update order depends on OS scheduling. For identical
# results across interpreter launches PYTHONHASHSEED must be fixed as well.
# NOTE: df['text'] holds raw strings, so each "sentence" is iterated character
# by character and the learned vocabulary consists of single characters.
# NOTE(review): the embeddings are fitted on every row of df, including the rows
# that train_test_split later assigns to the test part.
model_w2v = Word2Vec(df['text'], size=200, window=5, min_count=2, workers=1, seed=SEED)

In [237]:
# Candidate models to evaluate; the hyper-parameters were taken from the seminar.
classifiers = [
    LogisticRegression(penalty='l2', C=200000, random_state=SEED),
]

In [238]:
# Hold out 30% of the messages for evaluation; SEED fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.3,
    random_state=SEED,
)

In [239]:
# Map the string labels to integers; the encoder learns the classes from the
# training labels only and is then reused for the test labels.
label_enc = preprocessing.LabelEncoder()
y_train = label_enc.fit_transform(y_train)
y_test = label_enc.transform(y_test)

In [240]:
# y_train

In [241]:
# Fit one "embedding vectorizer + classifier" pipeline per candidate model and
# print its classification report on the held-out split.
for cl2 in classifiers:
    # Step names are plain labels; the second step is whatever estimator is
    # currently being iterated over.
    pipeline_steps = [
        ("word2vec vectorizer", TfidfEmbeddingVectorizer(model_w2v)),
        ("extra trees", cl2),
    ]
    clf_grad_w2v_tfidf = Pipeline(pipeline_steps)
    predicted = clf_grad_w2v_tfidf.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test, predicted))
    print('______________')

             precision    recall  f1-score   support

          0       0.98      0.99      0.99      1436
          1       0.94      0.88      0.91       236

avg / total       0.97      0.97      0.97      1672

______________
