In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import pandas as pd
df = pd.read_csv('/content/gdrive/My Drive/train_tweets.csv', encoding='utf-8')

In [0]:
df.shape

(7613, 5)

In [0]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [0]:
df = df.drop(['keyword', 'location'], axis=1)

In [0]:
df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


Сделаем предобработку с помощью spacy

In [0]:
import spacy
nlp = spacy.load('en',disable=['parser', 'ner', 'textcat'])

In [0]:
import re
def reduce_to_double_max(text):
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    return re.sub(r'(\W)\1+', r'\1', text)

In [0]:
def preprocess_corpus(corpus):
    corpus = (reduce_to_double_max(s.lower()) for s in corpus)
    docs = nlp.pipe(corpus, batch_size=1000, n_threads=4)
    return [' '.join([x.lemma_ for x in doc if x.is_alpha]) for doc in docs]

In [0]:
train_processed = preprocess_corpus(df['text'])
df['text'] = train_processed
df.head()

Unnamed: 0,id,text,target
0,1,-PRON- deed be the reason of this earthquake m...,1
1,4,forest fire near la ronge sask canada,1
2,5,all resident ask to shelter in place be be not...,1
3,6,people receive wildfire evacuation order in ca...,1
4,7,just get send this photo from ruby alaska as s...,1


Поделим на тестовую и тренировочную выборки

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.3, shuffle=True)

Векторизуем твиты с помощью Tf-idf Vectorizer

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.87,
               smooth_idf=1, max_features=300000)
#tf-idf
train_vecs =  vectorizer.fit_transform(x_train)
test_vecs = vectorizer.transform(x_test)

Перед нейронными сетями посмотрим на резульаьты классификатора - логистическая регрессия 

In [0]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=12, max_iter=10000, dual=False)
logreg.fit(train_vecs, y_train)

LogisticRegression(C=12, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, logreg.predict(test_vecs)))
print(f1_score(y_test, logreg.predict(test_vecs)))

0.7880910683012259
0.7452631578947368


Как мы видим, результаты не самые хорошие, поэтому переходим к более сложным методам классификации 

##NN

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [0]:
def build_model(input_shape):
    model = Sequential()
    model.add(Dense(128, activation='sigmoid', input_dim=input_shape))
    #model.add(Dense(64, activation='sigmoid'))
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
    return model

In [0]:
input_shape = train_vecs.shape[1]

Сначала применим модель Sequential к данным, репрезентированным путём tf-idf

In [0]:
model = build_model(input_shape)
model.fit(train_vecs, y_train,
          validation_data=(test_vecs, y_test),
                    epochs=10, batch_size=32)

Train on 5329 samples, validate on 2284 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc632e9eeb8>

Качество повысили, теперь оно около 0.8. Теперь попробуем применить ту же модель, но данные представим векторами с помощью модели w2v из модуля spacy

In [0]:
vecs = pd.DataFrame(index = df.index, columns = [d for d in range(96)])
vecs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
import numpy as np
for i, t in enumerate(df['text']): # для каждой строки в датафрейме
    t_vec = (nlp(' '.join(t))).vector # векторизуем текст
    if t_vec.shape == (0,):
        vecs.values[i] = np.array(0)*128 # заменяем строчки на наш вектор
    else:
        vecs.values[i] = t_vec

In [0]:
vecs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95
0,1.99564,-0.978775,0.713813,0.235606,1.35511,0.69002,1.5457,0.202649,2.2968,0.586369,0.488076,0.832369,-0.90528,-1.43158,0.645628,-1.58916,-1.15623,0.466831,-0.911409,-2.47672,2.24286,0.997092,-0.00785534,-1.03589,-1.93833,1.14485,-0.347724,-0.574203,3.21022,-1.04248,2.3887,-0.307255,-0.381557,-1.65607,-0.868504,-0.572451,2.18708,-1.16702,-2.04382,-0.745667,...,2.3166,0.558482,-1.32048,-0.0167134,0.10212,0.8885,0.330678,-0.405777,0.93787,0.106477,1.0593,0.0289169,-1.53197,0.60788,-1.88907,0.427591,-0.323255,2.31011,-1.81129,-0.361919,-1.34435,-0.00673329,-0.340062,-1.78884,0.886302,-2.38339,-1.19216,-0.114072,1.02176,1.01238,-0.217647,1.62149,-0.0227765,-0.806597,-1.54249,-0.456288,-0.212147,-0.562891,0.535796,1.66736
1,1.77575,-1.38535,0.935037,0.535596,1.62528,0.831153,1.25036,0.122188,2.04809,0.263226,0.867552,0.998177,-0.863411,-1.58543,0.429026,-1.84525,-1.40525,0.678945,-1.20128,-2.1458,1.80976,1.10952,-0.111805,-1.11255,-1.98509,1.23007,-0.665375,0.23013,3.15798,-1.41125,2.93151,-0.797871,-0.134435,-1.853,-0.689617,-0.414588,2.14406,-0.982691,-1.94573,-0.375268,...,2.15243,0.522655,-1.26447,-0.31518,-0.00361823,1.13445,0.760572,-0.536777,1.21942,0.326831,0.729753,-0.451411,-1.37693,0.254745,-2.38411,0.750957,-0.342307,2.03838,-2.26994,-0.204573,-1.50302,-0.456392,-0.346987,-2.20158,0.71643,-2.59417,-1.2974,-0.3372,1.10837,1.53513,-0.424577,1.45173,0.338848,-0.793298,-1.97475,0.170678,0.193809,-0.490569,0.157037,2.29135
2,2.1252,-1.0275,0.914514,0.179625,1.57458,0.585166,1.21476,0.294577,2.17648,0.654426,0.409453,0.954248,-0.93579,-1.37462,0.733018,-2.08135,-1.15565,0.866707,-0.8684,-2.50949,2.23843,1.01257,-0.267528,-1.18484,-1.78756,1.14193,-0.499169,-0.339401,3.42302,-0.900547,2.57165,-0.646779,-0.082643,-1.64714,-1.17846,-0.339277,2.06671,-1.25275,-1.94586,-0.60406,...,2.31459,0.65066,-1.39184,0.0802804,0.0675406,1.07979,0.190765,-0.42645,0.790521,0.247712,0.76131,-0.368171,-1.28154,0.586111,-2.28924,0.758565,0.136997,2.40459,-2.12212,-0.545368,-1.7053,0.2491,-0.141967,-2.27683,0.896544,-2.36395,-1.29055,-0.232844,1.41859,1.36877,-0.383987,1.38272,0.0458584,-0.839032,-1.78011,-0.100456,-0.259412,-0.558393,0.084925,1.94959
3,2.13461,-0.998269,0.870933,-0.096226,1.51231,0.52014,1.56677,0.371827,1.97877,0.630475,0.902592,1.0342,-1.22156,-1.09549,1.0742,-1.98272,-1.2661,1.43237,-0.907956,-2.4002,2.25001,0.900847,-0.682805,-1.39565,-2.0085,0.847057,-1.08428,-0.166518,3.55835,-0.626476,2.71963,-0.494648,-0.419137,-1.58559,-0.581029,-0.607018,1.753,-1.08057,-1.74407,-0.680055,...,2.10882,0.522664,-1.08204,0.208593,0.38456,1.10166,0.00540756,-0.414888,0.442331,0.787829,0.271429,-1.26031,-1.13339,0.611944,-2.23911,0.411871,-0.410887,2.64542,-2.08453,-0.716343,-1.58652,0.311111,-0.0296704,-2.68417,0.458213,-2.1725,-1.46998,-0.359542,1.35469,1.67333,0.306473,0.971012,0.0816446,-0.834373,-1.75675,-0.439515,-0.44193,-0.245959,0.313535,1.93752
4,1.97374,-0.904708,0.616136,0.347864,1.30481,0.831162,1.29754,0.156093,2.27763,0.597022,0.758626,0.685767,-0.862067,-0.939773,0.855174,-1.90946,-1.05183,0.946671,-1.16525,-2.43184,2.18085,0.72194,-0.119579,-1.01934,-1.89843,1.22694,-0.57683,0.0474901,3.45255,-1.14866,2.59992,-0.570179,-0.245786,-1.58028,-1.23558,-0.346025,2.07883,-1.0698,-1.99623,-0.595953,...,2.22834,0.524332,-1.2154,-0.25409,0.0725744,0.921236,0.345316,-0.235305,0.773363,0.300911,0.587272,-0.592632,-1.54162,0.464435,-2.35285,0.81057,-0.563864,1.861,-2.07841,-0.289784,-1.67409,-0.128999,-0.339822,-1.81982,0.807932,-2.39652,-1.17855,0.0619801,1.46991,1.38961,-0.0014042,1.40236,0.0483386,-0.850057,-1.89418,-0.158109,-0.0892788,-0.273811,0.0465797,2.13367


Так же разобьём на тренировочную и тестовую выборки 

In [0]:
x_train_wv, x_test_wv, y_train_wv, y_test_wv = train_test_split(vecs, df['target'], test_size=0.33, random_state=42)

In [0]:
input_shape_1 = x_train_wv.shape[1]

In [0]:
model_1 = build_model(input_shape_1)
model_1.fit(x_train_wv, y_train_wv,
          validation_data=(x_test_wv, y_test_wv),
                    epochs=10, batch_size=32)

Train on 5100 samples, validate on 2513 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc625725278>

Качество стало намного хуже, переходим к сеткам посложнее и другой предобученной модели векторов 

#### для эмбеддингов

In [0]:
maxlen = 120
embedding_dim = 100
BatchSize = 100

Данных много - понадобиться генератор

In [0]:
from keras.utils import Sequence
from keras import layers
from keras import Model
class DataGenerator(Sequence):
    #Generates data for Keras
    def __init__(self, texts, labels, batch_size, embedding_dim, max_length, shuffle=False):
        self.labels = labels
        self.texts = texts
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.max_length = max_length
        self.shuffle = shuffle
        self.on_epoch_end()
    
    def __len__(self):
		#'Denotes the number of batches per epoch'
        return int(np.floor(len(self.texts) / self.batch_size))

    def __getitem__(self, index):
        #'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        temp_texts = [self.texts.iloc[k] for k in indexes]
        temp_labels = [self.labels.iloc[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(temp_texts, temp_labels)        
        return X, y
        
    def on_epoch_end(self):
        #'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.texts))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, temp_texts, temp_labels):
        #'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        # Создается тензор с объектами
        # Размерность: BATCH_SIZE x MAX_LENGTH x EMBEDDING_DIM
        X = np.zeros(shape=(self.batch_size, 
                            self.max_length, 
                            self.embedding_dim)) 
        y = np.zeros((self.batch_size), dtype=int) # Если класс кодируется числом, если в категориальной форма - то размер будет BATCH_SIZE x COUNT_OF_CLASSES
        # Generate data
        for instance_number in range(self.batch_size):
            try:
                for current_token in range(self.max_length):
                    if temp_texts[instance_number][current_token] in model.vocab:
                        X[instance_number,current_token,:] = model.get_vector(temp_texts[instance_number][current_token])# Эмбеддинг этого слова
            except IndexError:
                pass

            y[instance_number] = temp_labels[instance_number]
        return X, y

In [0]:
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-100")



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Возьмём сверточную нейросеть с разным числом фильтров (2-5), также используем другую репрезентацию из модели glove-wiki-gigaword-100, вектора с размерностью 100

In [0]:
def CNNModel(
    maxlen, # Максимальная длина последовательности
    embedding_dim, # разбер эмбеддинга
):

  COUNT_OF_FILTERS = 256 # Возможно, придется подобрать...
  COUNT_OF_NEURONS = 128 # Возможно, придется подобрать...

  inputs = layers.Input(shape=(maxlen, embedding_dim))
  convs = []
  for kernel_size in [2,3,4,5]:
    c = layers.Conv1D(COUNT_OF_FILTERS, kernel_size=kernel_size, activation='relu')(inputs)
    c = layers.GlobalMaxPool1D()(c)
    convs.append(c)
  x = layers.Concatenate()(convs)
  x = layers.Dropout(0.2)(x)
  x = layers.Dense(COUNT_OF_NEURONS, activation = 'tanh')(x)
  x = layers.Dropout(0.3)(x)
  output = layers.Dense(5, 
                 activation = 'softmax' # или sigmoid? выберете сами
                 )(x)
  model = Model(inputs = inputs, outputs = output)
  return model

In [0]:
import numpy as np

training_generator = DataGenerator(texts=x_train, labels=y_train, batch_size=2, embedding_dim=100, max_length=120)
validation_generator = DataGenerator(texts=x_test, labels=y_test, batch_size=2, embedding_dim=100, max_length=120)

In [0]:
cnnm = CNNModel(120, 100)
cnnm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
cnnm.fit_generator(generator=training_generator,
                    validation_data=validation_generator, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2b5e01f278>

Качество, к сожалению, уменьшилось, макисмально здесь получилось - 0.5709