# Предобработка данных

In [1]:
from sklearn.model_selection import train_test_split
from keras import layers, models, optimizers, regularizers, applications
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error
from gensim.models import KeyedVectors
from datetime import datetime, timedelta

# Pre-trained word vectors, looked up by `embedding` below. Despite the `glove_`
# prefix, the file is read with gensim's word2vec-format loader.
# NOTE(review): presumably 300-dimensional, since the padding code below uses 300 — confirm.
glove_model = KeyedVectors.load_word2vec_format('word2vec_model.txt')

def embedding(x):
    """Return the word vector for `x` as a list, or a zero vector if it is unknown.

    The token is looked up in the module-level `glove_model` under `str(x)`.

    Args:
        x: Token to embed. Non-string values (for example the padding entries
            produced by `ad_words`) are converted with `str` and will normally
            miss the vocabulary.

    Returns:
        list: The model's vector for the token, or `glove_model.vector_size`
        zeros when the token is not in the vocabulary.
    """
    try:
        return list(glove_model[str(x)])
    except KeyError:
        # Out-of-vocabulary token. Only the lookup miss is handled here so that
        # genuine failures (e.g. the model not being loaded) are not hidden.
        # The length follows the loaded model rather than a hard-coded 300, so
        # known and unknown tokens always have the same dimensionality.
        return list(np.zeros(glove_model.vector_size))

def ad_words(words, target_length, vector_size=300):
    """Truncate or pad a token list to exactly `target_length` entries.

    The input list is never modified; a new list is always returned.

    Args:
        words: List of tokens.
        target_length: Number of entries the result must have.
        vector_size: Length of the zero vector used as the padding entry
            (defaults to 300, the value previously hard-coded).

    Returns:
        list: The first `target_length` tokens, followed by as many zero-vector
        padding entries as are needed to reach `target_length`.
    """
    # Slicing copies, so the caller's list is left untouched even when padding.
    result = list(words[:target_length])
    missing = target_length - len(result)
    if missing > 0:
        padding = np.zeros(vector_size)
        # Every padding slot refers to the same zero vector, as before.
        result.extend([padding] * missing)
    return result


In [None]:
# ---- Load post metadata and the image tensors --------------------------------
df = pd.read_csv('res_df.csv')
images_array = np.load('images_array.npy')
# Attach one image per row. The Series carries a default 0..n-1 index, so it lines
# up with the freshly loaded frame by position; rows without an image become NaN.
df['images'] = pd.Series(list(images_array))

df['date'] = pd.to_datetime(df['date'])

# Keep only posts older than the cutoff.
# NOTE(review): the variable is called `one_week_earlier` but the offset is 2 days —
# confirm which one is intended. The cutoff is taken from the wall clock, so the
# selected rows can differ between runs.
one_week_earlier = datetime.now() - timedelta(days=2)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original.
df = df[df['date'] < one_week_earlier].copy()

# ---- Captions -> fixed-length sequences of word vectors ----------------------
# fillna('') first: otherwise astype(str) turns a missing caption into the
# literal word "nan", which would then be embedded like a real token.
df['caption'] = (
    df['caption']
    .fillna('')
    .astype(str)
    # keep latin letters and whitespace only, lower-case, split into tokens
    .apply(lambda text: re.sub(r'[^a-zA-Z\s]', '', text).lower().split())
    # exactly 10 entries per caption (truncate or pad)
    .apply(lambda words: ad_words(words, 10))
    .apply(lambda words: [embedding(word) for word in words])
)

# ---- Images -> VGG16 feature maps --------------------------------------------
# VGG16 convolutional base (ImageNet weights, no classifier head) used as a fixed
# feature extractor. It has its own name so it does not share `model` with the
# regression network defined later in the notebook.
base_model = applications.VGG16(weights='imagenet', include_top=False)
feature_extractor = models.Model(inputs=base_model.input, outputs=base_model.output)

df = df.dropna(subset=['images'])
# One batched predict call instead of one call (and one progress bar) per row.
# np.stack copies, so preprocess_input cannot modify `images_array` in place.
image_batch = np.stack(df['images'].tolist())
image_features = feature_extractor.predict(applications.vgg16.preprocess_input(image_batch))
# Re-attach by the filtered frame's own index; it is no longer 0..n-1.
df['images'] = pd.Series(list(image_features), index=df.index)

# ---- Target: likes relative to the same account's best post ------------------
# groupby/transform computes each account's maximum once instead of re-filtering
# the whole frame for every row.
df['likes'] = df['likes'] / df.groupby('name')['likes'].transform('max')


# Модель и обучение

In [56]:
# Turn the per-row features and targets into dense arrays:
# X = embedded captions, X2 = image features, y = normalised likes.
X, X2, y = (np.array(df[column].tolist()) for column in ('caption', 'images', 'likes'))

# 80 % train, then the remaining 20 % is halved into validation and test.
# All three arrays go through the same call, so they share one set of indices.
X_train, X_temp, X2_train, X2_temp, y_train, y_temp = train_test_split(
    X, X2, y, test_size=0.2, random_state=42
)
X_val, X_test, X2_val, X2_test, y_val, y_test = train_test_split(
    X_temp, X2_temp, y_temp, test_size=0.5, random_state=42
)


# Caption regressor: three stacked LSTM layers, each followed by dropout and
# batch normalisation, then a small dense head with a single linear output.
# Manual toggle: set to False to skip building the model and loading its weights.
if True:
    # (sequence length, embedding size), read from the training captions
    caption_input_shape = (X_train.shape[1], X_train.shape[2])

    model = models.Sequential()

    model.add(layers.LSTM(128, input_shape=caption_input_shape, return_sequences=True,
                          kernel_regularizer=regularizers.l2(0.02)))
    model.add(layers.Dropout(0.3))
    model.add(layers.BatchNormalization())

    model.add(layers.LSTM(64, return_sequences=True,
                          kernel_regularizer=regularizers.l2(0.02)))
    model.add(layers.Dropout(0.3))
    model.add(layers.BatchNormalization())

    # Last recurrent layer returns only its final state.
    model.add(layers.LSTM(8, kernel_regularizer=regularizers.l2(0.02)))
    model.add(layers.Dropout(0.3))
    model.add(layers.BatchNormalization())

    model.add(layers.Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.02)))
    model.add(layers.Dense(1, activation='linear'))

    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-4),
        loss='mean_squared_error',
        metrics=['r2_score'],
    )
    # Start from previously saved weights instead of a random initialisation.
    model.load_weights('model1.weights.h5')


# Image regressor over the VGG16 feature maps: a stack of 3x3 convolutions with
# light dropout, flattened into a single linear output.
# Manual toggle: currently disabled, so `model2` is NOT created by this cell.
if False:
    model2 = models.Sequential()

    # 'same'-padded convolutions with decreasing filter counts
    model2.add(layers.Conv2D(256, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model2.add(layers.Dropout(0.02))
    model2.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model2.add(layers.Dropout(0.02))
    model2.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model2.add(layers.Dropout(0.02))

    # Final convolution uses the default padding.
    model2.add(layers.Conv2D(4, (3, 3), activation='relu', kernel_regularizer=l2(0.01)))
    model2.add(layers.Flatten())
    # Output layer for regression
    model2.add(layers.Dense(1, activation='linear', kernel_regularizer=l2(0.01)))

    model2.compile(
        optimizer=optimizers.Adam(learning_rate=3e-4),
        loss='mean_squared_error',
        metrics=['r2_score'],
    )
    # Loading saved weights is intentionally left disabled:
    #model2.load_weights('model2.weights.h5')

In [None]:
# Upper bound on training epochs for both runs (early stopping usually ends
# sooner). Defined outside the toggles so that enabling only the second run
# does not fail with a NameError.
epochs = 500

# Manual toggle: train the caption (LSTM) model.
if 0:
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=100, callbacks=[early_stopping], verbose=1)
    # Save under the same file name that the model-definition cell loads, so a
    # re-run picks up these weights rather than a stale file.
    model.save_weights('model1.weights.h5')

    fig, ax = plt.subplots()
    ax.plot(history.history['loss'], label='Train Loss')
    ax.plot(history.history['val_loss'], label='Val Loss')
    ax.set(title='Caption model', xlabel='Epoch', ylabel='Loss')
    ax.legend()  # without a legend the label= arguments are never displayed
    plt.show()


# Manual toggle: train the image (CNN) model. Requires `model2` to have been built.
if 0:
    early_stopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
    history2 = model2.fit(X2_train, y_train, validation_data=(X2_val, y_val), epochs=epochs, batch_size=100, callbacks=[early_stopping], verbose=1)
    model2.save_weights('model2.weights.h5')

    # Separate figure so the two models' curves are not drawn on top of each other.
    fig2, ax2 = plt.subplots()
    ax2.plot(history2.history['loss'], label='Train Loss')
    ax2.plot(history2.history['val_loss'], label='Val Loss')
    ax2.set(title='Image model', xlabel='Epoch', ylabel='Loss')
    ax2.legend()
    plt.show()

In [57]:
def results_show(y, X, model, show_values):
    """Print prediction shape, optional per-sample errors, and MSE/MAE.

    Args:
        y: True target values, indexable by sample position.
        X: Model inputs; `len(X)` is used as the number of samples.
        model: Object with a `predict(X)` method returning an array.
        show_values: If truthy, also print `y[i] - prediction[i]` for every sample.

    Returns:
        None. Everything is reported through `print`.
    """
    n_samples = len(X)
    # One row per sample, whatever shape predict() returned.
    predictions = model.predict(X).reshape(n_samples, -1)
    print(predictions.shape)

    if show_values:
        print("Predictions:")
        for idx, predicted in enumerate(predictions):
            print(f"difference - {y[idx] - predicted}")

    mse = mean_squared_error(y, predictions)
    mae = mean_absolute_error(y, predictions)
    print(f'Mean Squared Error (MSE): {mse:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')

# Test-set metrics for the caption model.
results_show(y_test, X_test, model, 0)

# The image model is only created when its `if 0:` toggle is enabled. On a fresh
# kernel it does not exist, so report that instead of failing with a NameError
# (or silently evaluating a leftover object from an earlier session).
if 'model2' in globals():
    results_show(y_test, X2_test, model2, 0)
else:
    print('model2 is not defined - enable the image-model cell to evaluate it')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step
(19, 1)
Mean Squared Error (MSE): 0.52
Mean Absolute Error (MAE): 0.65
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
(19, 1)
Mean Squared Error (MSE): 0.14
Mean Absolute Error (MAE): 0.33
