In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the tweet corpora. The files carry no header row, so the column names
# are supplied explicitly; only the `text` column is kept.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines='skip'` — confirm which pandas version this notebook targets.
n = ['id', 'date','name','text','typr','rep','rtw','faw','stcount','foll','frien','listcount']
csv_options = dict(sep=';', error_bad_lines=False, names=n, usecols=['text'])
data_positive = pd.read_csv('positive.csv', **csv_options)
data_negative = pd.read_csv('negative.csv', **csv_options)

In [None]:
# Create a balanced dataset: the same number of positive and negative tweets.
# The requested size is clamped to the smaller corpus so that both classes stay
# balanced and `labels` always has exactly one entry per row of `raw_data`
# (previously a short file produced fewer texts than labels).
sample_size = min(10000, len(data_positive), len(data_negative))
positive_texts = data_positive['text'].values[:sample_size]
negative_texts = data_negative['text'].values[:sample_size]
raw_data = np.concatenate((positive_texts, negative_texts), axis=0)
# 1 = positive, 0 = negative, in the same order as the concatenation above.
labels = [1] * len(positive_texts) + [0] * len(negative_texts)

In [None]:
import re
from sklearn.model_selection import train_test_split, GridSearchCV
def preprocess_text(text):
    """Normalise a raw tweet into lower-case, space-separated tokens.

    Steps, in order:
      1. links (``www...`` / ``http(s)://...``) are replaced by the token ``URL``;
      2. ``@mentions`` are replaced by the token ``USER``;
      3. the text is lower-cased and "ё" is folded into "е";
      4. every run of characters other than Latin letters, Cyrillic letters
         and digits becomes a single space;
      5. leading/trailing whitespace is removed.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Raw strings keep the regex escapes from being parsed as (invalid) string escapes.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub(r'@[^\s]+', 'USER', text)
    text = text.lower().replace("ё", "е")
    # The text is already lower-case, so upper-case ranges are unnecessary.
    # The digit range is 0-9: the previous 1-9 silently removed every "0"
    # (e.g. "2020" became "2 2"). Each run collapses to exactly one space,
    # so no separate space-squeezing pass is needed afterwards.
    text = re.sub(r'[^a-zа-я0-9]+', ' ', text)
    return text.strip()

# Clean every raw tweet; the order is preserved so `data` stays aligned with `labels`.
data = list(map(preprocess_text, raw_data))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=2,
)

In [None]:
from keras import backend as K


def precision(y_true, y_pred):
    """Batch-wise precision for Keras.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed, so the result is
    (true positives) / (predicted positives) for the current batch only.
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())


def recall(y_true, y_pred):
    """Batch-wise recall for Keras.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed, so the result is
    (true positives) / (actual positives) for the current batch only.
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    relevant = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (relevant + K.epsilon())


def f1(y_true, y_pred):
    """Batch-wise F1 score for Keras.

    Harmonic mean of the batch-wise ``precision`` and ``recall`` metrics
    defined at module level (the former nested copies were byte-for-byte
    duplicates of them). ``K.epsilon()`` in the denominator avoids division
    by zero when both precision and recall are 0.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        A scalar tensor ``2 * p * r / (p + r + eps)``.
    """
    # Distinct local names: rebinding `precision`/`recall` here would shadow
    # the module-level functions that are being called.
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Length passed as `maxlen` to pad_sequences and as the model's input length.
SENTENCE_LENGTH = 26
# Vocabulary cap handed to the Tokenizer (`num_words`); also the number of
# rows of the embedding matrix.
NUM = 50000

def get_sequences(tokenizer, x):
    """Turn texts into fixed-length sequences of token ids.

    Args:
        tokenizer: a fitted tokenizer exposing ``texts_to_sequences``.
        x: iterable of texts.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=SENTENCE_LENGTH``.
    """
    return pad_sequences(tokenizer.texts_to_sequences(x), maxlen=SENTENCE_LENGTH)

# The vocabulary is fitted on the training texts only; the test texts are
# encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=NUM)
tokenizer.fit_on_texts(x_train)

x_train_seq, x_test_seq = (get_sequences(tokenizer, texts) for texts in (x_train, x_test))

In [None]:
from gensim.models import Word2Vec
# Load the pre-trained Word2Vec model.
w2v_model = Word2Vec.load('tweets_model.w2v')
DIM = w2v_model.vector_size
# Start from an all-zero matrix: rows of words that have no pre-trained
# vector (and row 0, which no word index uses here) stay zero.
embedding_matrix = np.zeros((NUM, DIM))
# Copy the pre-trained vector of every tokenizer word whose index is below NUM.
for word, i in tokenizer.word_index.items():
    if i >= NUM:
        # Skip instead of break: does not rely on the iteration order of word_index.
        continue
    # Membership is tested on the KeyedVectors object itself, which works in
    # both gensim 3.x and 4.x (`wv.vocab` was removed in gensim 4).
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

In [None]:
# Hyper-parameter grids; the search loop trains one model per combination.
# Largest convolution kernel size; kernel sizes 2..value are used.
conv_grams = [3, 4, 5]
# Number of single-filter Conv1D branches created per kernel size.
conv_layers = [3, 6, 10]
# Units of the hidden Dense layer.
dense_neurons = [15, 30, 60]
# Activation used by the Conv1D branches and by the hidden Dense layer.
conv_activations = ['tanh', 'sigmoid','hard_sigmoid', 'relu']
# Dropout rate applied after the embedding and after the concatenated
# branches; 0 means no Dropout layers are added.
regularization = [0, 0.1, 0.2]

In [None]:
from keras.layers import Input
from keras.layers.embeddings import Embedding
from keras import optimizers
from keras.layers import Dense, concatenate, Activation, Dropout
from keras.models import Model
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalMaxPooling1D

# Shared model front-end: integer token ids -> frozen pre-trained embeddings.
# Every model of the grid search is built on top of these two tensors.
tweet_input = Input(shape=(SENTENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(
    NUM,
    DIM,
    input_length=SENTENCE_LENGTH,
    weights=[embedding_matrix],
    trainable=False,
)
tweet_encoder = embedding_layer(tweet_input)

In [None]:
from sklearn.metrics import classification_report
def build_cnn(max_kernel_size, branches_per_size, dense_units, activation, dropout_rate):
    """Build and compile one CNN variant on top of the shared embedding.

    Args:
        max_kernel_size: largest Conv1D kernel size; sizes 2..max_kernel_size are used.
        branches_per_size: number of single-filter Conv1D branches per kernel size.
        dense_units: units of the hidden Dense layer.
        activation: activation of the Conv1D branches and the hidden Dense layer.
        dropout_rate: Dropout rate applied after the embedding and after the
            concatenated branches; 0 adds no Dropout layers.

    Returns:
        A compiled Keras ``Model`` mapping ``tweet_input`` to a sigmoid output.
    """
    use_dropout = dropout_rate != 0
    features = Dropout(dropout_rate)(tweet_encoder) if use_dropout else tweet_encoder

    branches = []
    for kernel_size in range(2, max_kernel_size + 1):
        for _ in range(branches_per_size):
            branch = Conv1D(filters=1, kernel_size=kernel_size, padding='valid',
                            activation=activation)(features)
            branches.append(GlobalMaxPooling1D()(branch))

    merged = concatenate(branches, axis=1)
    if use_dropout:
        merged = Dropout(dropout_rate)(merged)
    hidden = Dense(dense_units, activation=activation)(merged)
    logits = Dense(1)(hidden)
    output = Activation('sigmoid')(logits)

    cnn = Model(inputs=[tweet_input], outputs=[output])
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[precision, recall, f1])
    return cnn


# Targets are converted to arrays once: plain Python lists are not a documented
# target type for `fit`, in particular together with `validation_split`.
y_train_arr = np.asarray(y_train)
y_test_arr = np.asarray(y_test)

res_arr = []
total_combinations = len(conv_grams) * len(conv_layers) * len(conv_activations) * len(regularization) * len(dense_neurons)
counter = 1

# Exhaustive grid search: one freshly built model per hyper-parameter combination.
# NOTE(review): every model is added to the same backend session; memory is
# likely to grow over the run — confirm whether the session should be reset
# (that would also require rebuilding tweet_input / tweet_encoder).
for conv_gram in conv_grams:
    for conv_layer_number in conv_layers:
        for dense_neuron_number in dense_neurons:
            for conv_activation in conv_activations:
                for reg in regularization:
                    model = build_cnn(conv_gram, conv_layer_number, dense_neuron_number,
                                      conv_activation, reg)
                    model.fit(x_train_seq, y_train_arr, batch_size=32, epochs=3, validation_split=0.25)
                    # Round the sigmoid outputs to hard 0/1 predictions.
                    predicted = np.round(model.predict(x_test_seq))
                    loss_and_metrics = classification_report(y_test_arr, predicted, output_dict=True)
                    res = {
                        'ngrams': conv_gram,
                        'layers': conv_layer_number,
                        'dense': dense_neuron_number,
                        'activation': conv_activation,
                        'regularization': reg,
                        'result': loss_and_metrics
                    }
                    res_arr.append(res)
                    print('{} / {} ready:'.format(counter, total_combinations))
                    print(res)
                    counter += 1
            

In [None]:
from pandas import DataFrame
df = DataFrame(res_arr)
# Flatten the nested classification reports: keep only the weighted-average
# precision, recall and F1 of every run.
columns_to_extract = [ 'precision', 'recall','f1-score']
for column in columns_to_extract:
    df[column] = df['result'].apply(lambda x: x['weighted avg'][column])
df = df.drop(['result'], axis=1)
print(df)
# The CSV is written with ';' separators and every '.' replaced by ','
# (decimal commas); the cell that reads the file back reverses this.
res_str = df.to_csv(sep=';').replace('.', ',')
# The context manager closes the file even if the write fails.
with open("comparison_cnn.csv", "w") as f:
    f.write(res_str)

In [None]:
import pandas
from io import StringIO
# Read the saved results back. The file uses decimal commas, so every ','
# is turned into '.' before parsing.
with open("comparison_cnn.csv", "r") as f:
    raw_df = f.read().replace(',', '.')
df = pandas.read_csv(StringIO(raw_df), sep=';', index_col=False)
# Drop the unnamed column that holds the index written by to_csv.
df = df.drop(['Unnamed: 0'], axis=1)
# Convert these parameters to strings (used as labels on the bar charts).
# `stringify` was referenced here but is not defined anywhere in this
# notebook (NameError on a fresh kernel); plain `str` is used instead.
# NOTE(review): confirm no special formatting was intended for `regularization`.
for column in ('ngrams', 'dense', 'regularization'):
    df[column] = df[column].apply(str)
print(df)

In [None]:
def _mean_by(column):
    """Mean of the numeric score columns of `df` for each value of `column`."""
    # numeric_only=True: the stringified parameter columns cannot be averaged
    # (recent pandas raises instead of silently dropping them).
    return df.groupby(column).mean(numeric_only=True).reset_index()


def _best_by(column):
    """Column-wise maximum of `df` for each value of `column`."""
    return df.groupby(column).max().reset_index()


mean_activation = _mean_by('activation')
mean_layers = _mean_by('layers')
mean_dense = _mean_by('dense')
mean_ngrams = _mean_by('ngrams')
mean_regularization = _mean_by('regularization')
best_activation = _best_by('activation')
best_layers = _best_by('layers')
best_dense = _best_by('dense')
best_ngrams = _best_by('ngrams')
# Was `.mean()` (copy-paste slip): like the other best_* frames this must hold
# the per-group maximum. NOTE(review): the y-limits of the regularization plot
# were tuned to the old mean values and may need widening.
best_regularization = _best_by('regularization')
# `layers` is still numeric at this point; convert it to strings on the
# aggregated frames.
mean_layers['layers'] = mean_layers['layers'].apply(str)
best_layers['layers'] = best_layers['layers'].apply(str)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Create a 10x12 inch figure and set the vertical spacing between subplots.
plt.figure(figsize=(10, 12))
plt.subplots_adjust(hspace=0.8)
def plot_grouped(df, group_column, xlabel, pos, ylim=(0.4, 0.83)):
    """Draw a bar chart of the ``f1-score`` column against ``group_column``.

    Args:
        df: frame containing ``group_column`` and ``f1-score`` columns.
        group_column: name of the column whose values label the bars.
        xlabel: label of the x axis.
        pos: ``(rows, columns, index)`` triple forwarded to ``plt.subplot``.
        ylim: ``(bottom, top)`` limits of the y axis.
    """
    n_rows, n_cols, index = pos
    label_style = {'fontsize': 12}

    plt.subplot(n_rows, n_cols, index)
    plt.ylim(*ylim)

    plt.bar(df[group_column], df['f1-score'])
    plt.xlabel(xlabel, **label_style)
    plt.ylabel('F-мера', **label_style)
    plt.xticks(rotation=45, **label_style)












In [None]:
# Best F1 per activation function.
plot_grouped(
    best_activation,
    group_column='activation',
    xlabel='Функция активации',
    pos=(1, 1, 1),
    ylim=(0.69, 0.74),
)

In [None]:
# Best F1 per number of convolution branches.
plot_grouped(
    best_layers,
    group_column='layers',
    xlabel='Количество сверточных слоев',
    pos=(1, 1, 1),
    ylim=(0.71, 0.74),
)

In [None]:
# Best F1 per size of the hidden dense layer.
plot_grouped(
    best_dense,
    group_column='dense',
    xlabel='Высота полносвязного слоя',
    pos=(1, 1, 1),
    ylim=(0.71, 0.74),
)

In [None]:
# Best F1 per maximum kernel size.
plot_grouped(
    best_ngrams,
    group_column='ngrams',
    xlabel='Максимальная высота фильтров',
    pos=(1, 1, 1),
    ylim=(0.71, 0.74),
)

In [None]:
# F1 per dropout rate, taken from the `best_regularization` frame.
plot_grouped(
    best_regularization,
    group_column='regularization',
    xlabel='Регуляризация',
    pos=(1, 1, 1),
    ylim=(0.68, 0.72),
)

In [None]:
# NOTE(review): `best_classifiers` is not defined in any cell visible in this
# notebook — confirm where it comes from; on a fresh kernel this cell would
# raise NameError.
plot_grouped(best_classifiers, 'classifier', 'Классификаторы', (1,1,1),  ylim=(0., 0.83))

In [None]:
# Display all currently open figures.
plt.show()