<a href="https://colab.research.google.com/github/Guiillotine/SoftwareEngineering/blob/kozlova/Twitter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd # Для чтения CSV
from sklearn import feature_extraction, linear_model, model_selection, preprocessing # !!!!

In [None]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
import nltk
import re
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Download the NLTK corpora used during text cleaning.
nltk.download('wordnet')
nltk.download("stopwords")
# Lemmatizer instance used by clean() further down.
wordnet = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Загрузка данных

Загрузка нужных для обучения файлов:

In [None]:
def readCsvByLink(url):
  """Read a CSV file shared through a Google Drive link into a DataFrame.

  The file id is taken from the share link and substituted into the
  direct-download address ``https://drive.google.com/uc?id=<id>``.

  Args:
    url: Google Drive share link. Recognized forms are ``.../d/<id>[/...]``
      and ``...?id=<id>``; any other link falls back to using the
      second-to-last ``/``-separated segment as the id.

  Returns:
    pandas.DataFrame: Result of ``pd.read_csv`` on the direct-download URL.
  """
  # Locate the id by pattern rather than by position, so links without a
  # trailing "/view..." segment (or in "?id=" form) are handled too.
  match = re.search(r'/d/([^/?#]+)|[?&]id=([^&#]+)', url)
  if match:
    file_id = match.group(1) or match.group(2)
  else:
    # Legacy behavior: assume the id is the second-to-last path segment.
    file_id = url.split('/')[-2]
  return pd.read_csv('https://drive.google.com/uc?id=' + file_id)

In [None]:
# Load the training and test splits from their Google Drive share links.
train_df = readCsvByLink("https://drive.google.com/file/d/10_zwLLKTklGvnZpTXJYWouPmGmVt5Je_/view?usp=drive_link")
test_df = readCsvByLink("https://drive.google.com/file/d/17epsBjuyGCkBbNMtyyOo6DJTfeTW-am_/view?usp=drive_link")

In [None]:
# Report how many posts each split contains (row counts of the two frames).
f"Постов для обучения: {len(train_df)}, для тестирования: {len(test_df)}"

'Постов для обучения: 7613, для тестирования: 3263'

In [None]:
# Preview the two columns used for training: the post text and its label.
train_df.loc[:, ["text", "target"]]

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [None]:
# NOTE(review): this cell repeats the preview of the previous cell verbatim.
train_df[["text", "target"]]

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [None]:
# Preview the raw text column of the test set.
test_df["text"]

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


# Очистка данных

In [None]:
def clean(textStrs):
    """Normalize raw post texts before tokenization.

    For every text: replace each character that is not a Latin letter with a
    space, lower-case the result, split it into words, drop English stop
    words and words of one or two characters, lemmatize the remaining words
    with the module-level ``wordnet`` lemmatizer and join them with spaces.

    Args:
        textStrs: Iterable of strings (for example a pandas Series of posts).

    Returns:
        list[str]: One cleaned string per input text, in the input order.
    """
    # Build the stop-word lookup once. Calling stopwords.words() inside the
    # per-word filter repeated that call, and a linear list search, for every
    # token of every post.
    stopWords = set(stopwords.words('english'))
    cleanedStrs = []
    for textStr in textStrs:
        # Only letters and spaces survive the substitution, so a plain
        # split() yields exactly the non-empty words.
        words = re.sub(r'[^a-zA-Z]', ' ', textStr).lower().split()
        # The stop-word and length filters are applied to the raw word,
        # before lemmatization.
        lemmas = [wordnet.lemmatize(word) for word in words
                  if word not in stopWords and len(word) > 2]
        cleanedStrs.append(" ".join(lemmas))
    return cleanedStrs

In [None]:
# Cleaned training texts: one normalized string per training post.
x_train_text = clean(train_df['text'])

In [None]:
# Show the first ten cleaned posts.
x_train_text[:10]

['deed reason earthquake may allah forgive',
 'forest fire near ronge sask canada',
 'resident asked shelter place notified officer evacuation shelter place order expected',
 'people receive wildfire evacuation order california',
 'got sent photo ruby alaska smoke wildfire pours school',
 'rockyfire update california hwy closed direction due lake county fire cafire wildfire',
 'flood disaster heavy rain cause flash flooding street manitou colorado spring area',
 'top hill see fire wood',
 'emergency evacuation happening building across street',
 'afraid tornado coming area']

# Подготовка данных

In [None]:
num_words = 10000  # Limit the vocabulary size to 10000 words
max_post_len = 200 # Maximum post length (used as the padded sequence length below)

In [None]:
# Training labels as a NumPy array.
y_train = train_df["target"].to_numpy()
# Raw (uncleaned) test post texts as a NumPy array.
x_test_text = test_df["text"].to_numpy()

Токенизация

In [None]:
# Keras tokenizer configured with the vocabulary limit num_words.
tokenizer = Tokenizer(num_words = num_words)

In [None]:
tokenizer.fit_on_texts(x_train_text) # Fit the tokenizer and build the vocabulary from the training texts

In [None]:
# The 20 most frequent words and the indices assigned to them
list(tokenizer.word_index.items())[:20]

[('http', 1),
 ('fire', 2),
 ('like', 3),
 ('amp', 4),
 ('get', 5),
 ('new', 6),
 ('via', 7),
 ('news', 8),
 ('one', 9),
 ('people', 10),
 ('time', 11),
 ('year', 12),
 ('video', 13),
 ('disaster', 14),
 ('emergency', 15),
 ('body', 16),
 ('day', 17),
 ('home', 18),
 ('police', 19),
 ('building', 20)]

Преобразование текста в числовое представление

In [None]:
# Replace every word of each cleaned training text with its vocabulary index.
x_train_seq = tokenizer.texts_to_sequences(x_train_text)

Пример числового представления строки

In [None]:
# Show the first post as a sequence of indices, followed by its cleaned text.
print(x_train_seq[0],"\n",x_train_text[0])

[3883, 448, 157, 65, 1357, 3884] 
 deed reason earthquake may allah forgive


Приведение постов к одной длине (количеству токенов)

In [None]:
# Bring every index sequence to the common length max_post_len.
x_train = pad_sequences(x_train_seq, maxlen=max_post_len)

In [None]:
# This function bundles the conversion steps described above (text -> index
# sequences -> fixed-length sequences) so that other texts, such as the test
# data, can be brought into the form the network expects.
def convertTextToNumberic(x_texts, tokenizer, max_text_len):
  """Convert texts into fixed-length sequences of vocabulary indices.

  Args:
    x_texts: Iterable of text strings to convert.
    tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
    max_text_len: Length every resulting sequence is brought to.

  Returns:
    The result of ``pad_sequences`` applied to the tokenized ``x_texts``.
  """
  # Fix: tokenize the texts that were passed in. The previous version read the
  # global x_train_text here, so the x_texts argument was silently ignored.
  x_seq = tokenizer.texts_to_sequences(x_texts)
  return pad_sequences(x_seq, maxlen=max_text_len)

# Создание модели

In [None]:
# Sequential model: word embedding -> LSTM -> single sigmoid output unit.
model = Sequential([
    Embedding(num_words, 64, input_length=max_post_len),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])



In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Обучение модели

Защита от переобучения: модель с наилучшей точностью на валидационной выборке (val_accuracy) сохраним в файл 'best_model.keras'

In [None]:
model_save_path = 'best_model.keras'
# Callback invoked at every epoch; with save_best_only=True it writes the model
# to model_save_path only when the monitored val_accuracy improves.
checkpoint_callback = ModelCheckpoint(model_save_path,
                                      monitor='val_accuracy',
                                      save_best_only=True,
                                      verbose=1)

In [None]:
# Train for 5 epochs with 10% of the training data held out for validation;
# the checkpoint callback keeps the best model seen so far on disk.
history = model.fit(x_train, y_train,
                    epochs=5,
                    batch_size=128,
                    validation_split=0.1,
                    callbacks=[checkpoint_callback])

Epoch 1/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 630ms/step - accuracy: 0.5955 - loss: 0.6548
Epoch 1: val_accuracy improved from -inf to 0.75853, saving model to best_model.keras
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 689ms/step - accuracy: 0.5966 - loss: 0.6541 - val_accuracy: 0.7585 - val_loss: 0.5129
Epoch 2/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 670ms/step - accuracy: 0.8408 - loss: 0.3801
Epoch 2: val_accuracy improved from 0.75853 to 0.77822, saving model to best_model.keras
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 719ms/step - accuracy: 0.8409 - loss: 0.3800 - val_accuracy: 0.7782 - val_loss: 0.4492
Epoch 3/5
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 625ms/step - accuracy: 0.8980 - loss: 0.2633
Epoch 3: val_accuracy did not improve from 0.77822
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 648ms/step - accuracy: 0.8980 - loss: 0.2631 - v

Загрузим веса наилучшей модели

In [None]:
# Restore the weights stored by the checkpoint callback during training.
model.load_weights(model_save_path)

# Проверка на тестовых данных

In [None]:
# Run the test posts through the same clean() preprocessing as the training
# texts before tokenizing: the tokenizer vocabulary was built from cleaned,
# lemmatized text, so raw posts would not map onto it consistently.
x_test = convertTextToNumberic(clean(x_test_text), tokenizer, max_post_len)

In [None]:
# Predict on the test data
predictions = model.predict(x_test)

# Convert the predictions to binary values (1 where the score exceeds 0.5)
binary_predictions = (predictions > 0.5).astype(int)

[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 107ms/step


In [None]:
# Print test posts 10..29 next to their predicted label
# ("Настоящее бедствие" = real disaster, "Фейк" = fake).
for i in range(10,30):
  print(x_test_text[i], "->", "Настоящее бедствие" if binary_predictions[i] else "Фейк")

No I don't like cold! -> Настоящее бедствие
NOOOOOOOOO! Don't do that! -> Настоящее бедствие
No don't tell me that! -> Настоящее бедствие
What if?! -> Настоящее бедствие
Awesome! -> Настоящее бедствие
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU -> Фейк
@sunkxssedharry will you wear shorts for race ablaze ? -> Фейк
#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI -> Фейк
Check these out: http://t.co/rOI2NSmEJJ http://t.co/3Tj8ZjiN21 http://t.co/YDUiXEfIpE http://t.co/LxTjc87KLS #nsfw -> Фейк
PSA: IÛªm splitting my personalities.

?? techies follow @ablaze_co
?? Burners follow @ablaze -> Фейк
beware world ablaze sierra leone &amp; guap. -> Фейк
Burning Man Ablaze! by Turban Diva http://t.co/hodWosAmWS via @Etsy -> Фейк
Not a diss song. People will take 1 thing and run with it. Smh it's an eye opener though. He is about 2 set the game ablaze @CyhiThePr