In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

%matplotlib inline

In [53]:
!iconv -f cp1251 -t utf-8 vk_feed.csv -o vk_feed_utf.csv 

iconv: cannot open input file `vk_feed.csv': No such file or directory


In [58]:
data = pd.read_csv('cleaned_data.csv', index_col=0)
data.head()

Unnamed: 0,text,status,predict_proba,predict,lang
0,ребят краснодар нужны доноры положительная для...,1,0.995379,1,ru
1,волгоград прошу максимальный перепост предыдущ...,1,0.980918,1,ru
2,срочно требуются доноры крови любая цельная кр...,1,0.995874,1,ru
4,анонимно люди крик о помощи срочно а точнее в ...,1,1.0,1,ru
5,люди крик о помощи срочно а точнее в понедельн...,1,1.0,1,ru


In [3]:
from langdetect import detect

def detect_language(text):
    try:
        lang = detect(text)
        return lang
    except:
        return '-'

In [4]:
data.groupby(by='lang').count()

Unnamed: 0_level_0,text,status,predict_proba,predict
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ru,26108,26108,26108,26108


In [5]:
data = data[data['lang'] == 'ru']

In [7]:
import re
from html2text import html2text

def preprocess(text):
    text = html2text(text)
    text = re.sub('[\W_]', ' ', text)
    text = re.sub('_', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub('[A-Za-z0-9]', '', text) #removing english
    text = text.lower()
    return text

In [42]:
data['text'] = data['text'].apply(preprocess)
data = data.drop_duplicates()

In [33]:
data.groupby(by='status').count()

Unnamed: 0_level_0,text,predict_proba,predict,lang
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1677,1677,1677,1677
1,22901,22901,22901,22901


In [59]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer

class Preprocessor:
    def __init__(self, stemmer, analyser):
        self.stemmer = stemmer
        self.analyser = analyser
        
    def __call__(self, doc):
        return (self.stemmer.stem(w) for w in self.analyser(doc) if len(w) > 3)
    
preporcessor = Preprocessor(RussianStemmer(), CountVectorizer(ngram_range=(1,2)).build_analyzer())

vectorizer = CountVectorizer(stop_words=stopwords.words('russian'), analyzer=preporcessor)
V = vectorizer.fit_transform(data['text'])

print(V.shape)

(24578, 217040)


In [60]:
X = V
print(X.shape)

y = np.array(data['status'])
print(y.shape)

(24578, 217040)
(24578,)


In [63]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import SVC

kfold = StratifiedKFold(y, n_folds=4, shuffle=True)
model = SVC(kernel='linear', probability=True, C=1)

for train_index, test_index in kfold:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model.fit(X_train, y_train)
    print(model.score(X_test, y_test))

[LibSVM]0.984217377156
[LibSVM]0.98095703125
[LibSVM]0.984375
[LibSVM]0.982584635417


In [64]:
from sklearn.ensemble import RandomForestClassifier

model.fit(X, y)
print(model.score(X, y))

[LibSVM]0.999877939621


In [65]:
data['predict_proba'] = 1 - model.predict_proba(X)
data['predict'] = model.predict(X)

In [66]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y, data['predict'])

array([[ 1679,     2],
       [    1, 22896]])

In [67]:
false_negative = data[(data['predict'] == 0) & (data['status'] == 1)]
false_negative

Unnamed: 0,text,status,predict_proba,predict,lang
1991,нужно больше крови,1,0.194179,0,ru


In [68]:
false_positive = data[(data['predict'] == 1) & (data['status'] == 0)]
false_positive

Unnamed: 0,text,status,predict_proba,predict,lang
20598,срочно нужна кровь срочно,0,0.984184,1,ru
33641,мне нужна кровь,0,0.579167,1,ru


In [69]:
word_vector = vectorizer.transform(['ребят очень нужен донор крови выручите пожалуйта'])
print(model.predict_proba(word_vector)[0][1])

0.984221284183


In [70]:
data.to_csv('cleaned_data.csv')

In [23]:
from dill import dill

with open('model/svm/vectorizer.pkl', 'wb') as f:
    dill.dump(vectorizer, f)

with open('model/svm/model.pkl', 'wb') as f:
    dill.dump(model, f)