In [None]:
import os
import warnings

from PIL import Image
from pathlib import Path

import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import DataLoader

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, mean_squared_error

import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords

from transformers import CLIPProcessor, CLIPModel

import glob

In [None]:
# Root directory that contains the `to_upload/` data folder. The original
# machine-specific location is kept as the default; set the YANDEX_DATA_PATH
# environment variable to run the notebook elsewhere without editing code.
PATH = os.environ.get('YANDEX_DATA_PATH', 'C:\\Users\\Rog\\Desktop\\Programing\\Yandex\\')
# Words that mark a query as blocked; they are compared with lemmatized query tokens.
WORD_TO_BLOCK = ['child', 'boy', 'girl', 'baby', 'teen', 'teenager', 'kid', 'infant', 'youngster', 'kids', 'children', 'boys', 'girls', 'babies', 'teens', 'teenagers']

In [None]:
# Resolve every input file first, then read them one by one.
train_csv = os.path.join(PATH, 'to_upload/train_dataset.csv')
crowd_tsv = os.path.join(PATH, 'to_upload/CrowdAnnotations.tsv')
expert_tsv = os.path.join(PATH, 'to_upload/ExpertAnnotations.tsv')
test_queries_csv = os.path.join(PATH, 'to_upload/test_queries.csv')
test_images_csv = os.path.join(PATH, 'to_upload/test_images.csv')

# The annotation files are read with explicit column names.
crowd_columns = ['image', 'query_id', 'fraction', 'pros', 'cons']
expert_columns = ['image', 'query_id', 'first', 'second', 'third']

train_data = pd.read_csv(train_csv)
data_crowd = pd.read_csv(crowd_tsv, sep='\t', names=crowd_columns)
data_expert = pd.read_csv(expert_tsv, sep='\t', names=expert_columns)
test_query = pd.read_csv(test_queries_csv, index_col=[0], sep='|')
test_image = pd.read_csv(test_images_csv)

# Attach a human-readable label to each frame (used when previewing them).
train_data.name = 'train_data'
data_crowd.name = 'data_crowd'
data_expert.name = 'data_expert'
test_query.name = 'test_query'
test_image.name = 'test_image'

In [None]:
# For every loaded frame print a banner with its label, then its shape and first rows.
frames_to_preview = (train_data, data_crowd, data_expert, test_query, test_image)
for dataframe in frames_to_preview:
    banner = '\n'.join([
        '------------------------------------------------------------------------------------------------------------',
        '+-------------+',
        f'| {dataframe.name}  |',
        '+-------------+',
    ])
    print(banner)
    print(dataframe.shape)
    display(dataframe.head(5))
    print()

In [None]:
# Draw 8 random train images followed by 8 random test images on a 4x4 grid.
samples_train = list(train_data['image'].sample(8))
samples_test = list(test_query['image'].sample(8))

fig = plt.figure(figsize=(10,10))
panels = [('to_upload/train_images', file_name) for file_name in samples_train]
panels += [('to_upload/test_images', file_name) for file_name in samples_test]
for position, (folder, file_name) in enumerate(panels, start=1):
    fig.add_subplot(4, 4, position)
    image = Image.open(Path(PATH, folder, file_name))
    plt.imshow(image)
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()

In [None]:
# Number of distinct images / query texts per split.
# dropna=False keeps the count equal to len(series.unique()), which also counts NaN.
unique_train_images = train_data.image.nunique(dropna=False)
unique_train_queries = train_data.query_text.nunique(dropna=False)
unique_test_images = test_image.image.nunique(dropna=False)
unique_test_queries = test_query.query_text.nunique(dropna=False)

print(f"Количество уникальных изображений в тренировочной выборке: {unique_train_images}")
print(f"Количество уникальных зфпросов в тренировочной выборке: {unique_train_queries}")
print(f"Количество уникальных изображений в тестовой выборке: {unique_test_images}")
print(f"Количество уникальных зфпросов в тестовой выборке: {unique_test_queries}")


In [None]:
# Convert the crowd agreement fraction to whole percents (truncated) and count
# how many rows fall exactly on each of the listed levels.
crowd_analicit = data_crowd.copy()
crowd_analicit['fraction'] = (crowd_analicit['fraction'] * 100).astype('int')
fraction_nums = np.array([0, 33, 66, 100])
crowd_array = [(crowd_analicit['fraction'] == level).sum() for level in fraction_nums]

plt.figure(figsize=(10, 8))
sns.barplot(x = fraction_nums, y = crowd_array)
plt.xticks(rotation=75, size=12)
plt.xlabel('Доля людей, подтвердивших, что описание соответствует изображению (%)')
plt.ylabel('Количество')
plt.show()

In [None]:
def transform_range(x):
    """Shift `x` down by one, divide by three and round to two decimals.

    Maps 1 -> 0.0 and 4 -> 1.0.
    """
    scaled = (x - 1) / 3.0
    return round(scaled, 2)

def agr_data(row):
    """Aggregate the three expert grades of one row into `agr_expert`.

    * All three grades differ -> floor of their mean.
    * At least two grades agree -> the majority grade (the median of three).

    The previous check `first != second != third` is a chained comparison that
    never compares `first` with `third`, so rows such as (4, 1, 4) were treated
    as "all different" and received the floored mean (3) instead of the
    majority grade (4).

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series) with keys 'first', 'second', 'third'.

    Returns
    -------
    The same `row` object with the extra key 'agr_expert'.
    """
    grades = [row['first'], row['second'], row['third']]

    if len(set(grades)) == 3:
        row['agr_expert'] = sum(grades) // 3
    else:
        # With a repeated value, the middle element of the sorted triple is the majority.
        row['agr_expert'] = int(sorted(grades)[1])

    return row

# Add the aggregated expert grade (`agr_expert`) to every annotation row.
data_expert = data_expert.apply(agr_data, axis=1)
data_expert

In [None]:
# Outer join keeps (image, query_id) pairs annotated by only one of the two
# sources; the columns of the missing source are NaN for those rows.
data_merged = data_expert.merge(data_crowd, on = ['image', 'query_id'], how='outer')
data_merged

In [None]:
# NOTE: this silences every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Look the counts up by column label. The previous positional `[2]` / `[3]`
# pointed at the 3rd and 4th columns of the merged frame, not at
# `agr_expert` / `fraction` as the messages claim.
nan_counts = data_merged[['agr_expert', 'fraction']].isna().sum()

print('Количество NaN значений после объеденений массивов в agr_expert:', nan_counts['agr_expert'])
print('Количество NaN значений после объеденений массивов в fraction:', nan_counts['fraction'])
print()
print('Матрица корреляции agr_expert и fraction:')
display(data_merged[['agr_expert', 'fraction']].corr())

In [None]:
# Distinct aggregated expert grades in the merged frame (NaN included if any row lacks a grade).
agr_expert_unique = data_merged.agr_expert.unique()
agr_expert_unique

In [None]:
# Keep only the rows annotated by both experts and the crowd.
data_merged_notnan = data_merged.dropna()

# Iterate over the grades that are actually present in the NaN-free frame, in
# ascending order. Looping over the uniques of the full frame would also visit
# NaN and print a meaningless "nan%" line.
for i in sorted(data_merged_notnan['agr_expert'].unique()):
    fraction_for_grade = data_merged_notnan.loc[data_merged_notnan['agr_expert'] == i, 'fraction']
    print(f'Средняя доля людей, подтвердивших, что описание соответствует изображению (%) при оценке экспертов равной {i}:',
          '{:.0%}'.format(fraction_for_grade.mean()),
          ', std:',
          '{:.0%}'.format(fraction_for_grade.std()))

print()
print('Подсчёт уникальных значений столбца agr_expert')
print(data_merged_notnan.agr_expert.value_counts())
print()
print('Подсчёт уникальных значений столбца fraction')
print(data_merged_notnan.fraction.value_counts())

In [None]:
def agr_merge(row):
    """Map the crowd agreement `row['fraction']` to a grade from 1.0 to 4.0.

    Thresholds, checked from the highest down: >= 0.75 -> 4.0, >= 0.25 -> 3.0,
    >= 0.10 -> 2.0; anything else (including NaN) -> 1.0.
    """
    share = row['fraction']
    for threshold, grade in ((0.75, 4.0), (0.25, 3.0), (0.10, 2.0)):
        if share >= threshold:
            return grade
    return 1.0

In [None]:
# Rule-based baseline: predict the expert grade from the crowd fraction alone,
# using only the rows where both values are known.
features_func = data_merged[['agr_expert', 'fraction']].dropna().copy()
features_func['predict'] = features_func.apply(agr_merge, axis=1)

# Accuracy of every approach that predicts `agr_expert`, one row per model.
agr_expert_models = pd.DataFrame(columns = ['ACC'])

def alter_table(df, model_name, acc):
    '''Record a model's accuracy in the comparison table.

    The value is written into the table that is passed in. The previous
    version ignored `df` and always wrote to the global `agr_expert_models`.

    Parameters
    ----------
    df : pandas.DataFrame with an 'ACC' column; modified in place.
    model_name : row label under which the value is stored.
    acc : accuracy value to store.

    Returns
    -------
    The same `df` object.
    '''

    df.loc[model_name, 'ACC'] = acc

    return df


# Store the accuracy of the rule-based baseline, rounded to 4 decimals.
alter_table(agr_expert_models,
            'BaseLine',
            round(accuracy_score(features_func['predict'], features_func['agr_expert']), 4))

In [None]:
# Rows annotated by both sources; crowd columns are the features and the
# aggregated expert grade is the classification target.
data_merged_expert = data_merged.dropna()

target = data_merged_expert.agr_expert
features = data_merged_expert[['fraction', 'pros', 'cons']]

print(features.shape)
print(target.shape)

In [None]:
# 75/25 train/test split with a fixed seed so the split is reproducible.
features_train, features_test, target_train, target_test = train_test_split(features,
                                                                            target,
                                                                            test_size=0.25,
                                                                            random_state=12345)

In [None]:
# Class-balanced logistic regression on the crowd features.
model_lr = LogisticRegression(class_weight="balanced", random_state=12345)
model_lr.fit(features_train, target_train)
predictions = model_lr.predict(features_test)

lr_accuracy = round(accuracy_score(predictions, target_test), 4)
alter_table(agr_expert_models, 'LogisticRegression', lr_accuracy)

In [None]:
# Class-balanced support vector classifier with a sigmoid kernel.
model_svc = SVC(class_weight="balanced", kernel = 'sigmoid')
model_svc.fit(features_train, target_train)
predictions = model_svc.predict(features_test)

svc_accuracy = round(accuracy_score(predictions, target_test), 4)
alter_table(agr_expert_models, 'SVC', svc_accuracy)

In [None]:
# Class-balanced random forest. The seed is fixed (same value as the rest of
# the notebook) so the reported accuracy is reproducible between runs.
model_rfc = RandomForestClassifier(class_weight="balanced", random_state=12345).fit(features_train, target_train)
predictions = model_rfc.predict(features_test)

alter_table(agr_expert_models,
            'RandomForestClassifier',
            round(accuracy_score(predictions, target_test), 4))

In [None]:
agr_expert_models.sort_values(by='ACC', ascending = False)

In [None]:
# RMSE of every model that predicts the crowd `fraction`, one row per model.
fraction_models = pd.DataFrame(columns = ['RMSE'])

def alter_table(df, model_name, rmse):
    """Record a model's RMSE in the comparison table.

    NOTE: this redefinition shadows the accuracy-table helper of the same name
    defined earlier in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame with an 'RMSE' column; modified in place. For
        backward compatibility any non-DataFrame value falls back to the
        module-level `fraction_models` (the previous version ignored `df`).
    model_name : row label under which the value is stored.
    rmse : RMSE value to store.

    Returns
    -------
    The table that was updated.
    """
    table = df if isinstance(df, pd.DataFrame) else fraction_models
    table.loc[model_name, 'RMSE'] = rmse

    return table

In [None]:
# Reverse task: predict the crowd `fraction` from the expert grades.
# NOTE: `features` / `target` are rebound here and replace the classification ones.
features = data_merged_expert[['first', 'second', 'third', 'agr_expert']]
target = data_merged_expert.fraction

print(features.shape)
print(target.shape)

In [None]:
# 75/25 train/test split for the regression task, with a fixed seed.
features_train, features_test, target_train, target_test = train_test_split(features,
                                                                            target,
                                                                            test_size=0.25,
                                                                            random_state=12345)

In [None]:
# Ordinary least squares baseline for the crowd fraction.
model_lr = LinearRegression().fit(features_train, target_train)
predictions = model_lr.predict(features_test)

# Pass the results table itself; the function object was passed here before.
alter_table(fraction_models,
            'LinearRegression',
            round(np.sqrt(mean_squared_error(target_test, predictions)), 4))
fraction_models

In [None]:
# Ridge regression with default regularisation.
model_r = Ridge().fit(features_train, target_train)
predictions = model_r.predict(features_test)

# Pass the results table itself; the function object was passed here before.
alter_table(fraction_models,
            'Ridge',
            round(np.sqrt(mean_squared_error(target_test, predictions)), 4))
fraction_models

In [None]:
# Random forest regressor; seeded so the reported RMSE is reproducible.
model_rfr = RandomForestRegressor(random_state=12345).fit(features_train, target_train)
predictions = model_rfr.predict(features_test)

# Pass the results table itself; the function object was passed here before.
alter_table(fraction_models,
            'RandomForestRegressor',
            round(np.sqrt(mean_squared_error(target_test, predictions)), 4))
fraction_models.sort_values(by='RMSE')

In [None]:
# Hyper-parameter grid for the random forest regressor.
grid_params = {'n_estimators': np.arange(50, 110, 10),
               'max_depth': np.arange(9, 16),
               'criterion': ['squared_error', 'absolute_error']}

# 3-fold grid search. The forest is seeded so that the selected parameters
# (and therefore `model_cv`) do not change from run to run.
grid = GridSearchCV(RandomForestRegressor(n_jobs=6, random_state=12345), grid_params, cv=3, verbose=1, error_score='raise', n_jobs=2).fit(features_train, target_train)

# Best estimator, refitted on the whole training split by GridSearchCV.
model_cv = grid.best_estimator_

In [None]:
# RMSE of the tuned random forest on the held-out split.
predictions = model_cv.predict(features_test)
print(np.sqrt(mean_squared_error(target_test, predictions)))

In [None]:
data_merged

In [None]:
def fill_nan(row):
    """Impute the missing annotation of one merged row with the fitted models.

    * `agr_expert` missing -> predicted by `model_svc` from the crowd columns.
    * otherwise, `fraction` missing -> predicted by `model_cv` from the expert
      columns, rounded to 6 decimals.

    The inputs are wrapped in one-row DataFrames carrying the column names the
    models were fitted with, instead of bare nested lists, so scikit-learn can
    match features by name (and does not warn about missing feature names).

    Returns the same `row` object, possibly modified.
    """
    if np.isnan(row['agr_expert']):
        crowd_features = pd.DataFrame([[row['fraction'], row['pros'], row['cons']]],
                                      columns=['fraction', 'pros', 'cons'])
        row['agr_expert'] = model_svc.predict(crowd_features)[0]
    elif np.isnan(row['fraction']):
        expert_features = pd.DataFrame([[row['first'], row['second'], row['third'], row['agr_expert']]],
                                       columns=['first', 'second', 'third', 'agr_expert'])
        row['fraction'] = np.round(model_cv.predict(expert_features)[0], 6)

    return row

# Impute the missing `agr_expert` / `fraction` values row by row.
data_merged = data_merged.apply(fill_nan, axis=1)
data_merged

In [None]:
data_merged.shape[0]

In [None]:
# Rescale the expert grade with the existing `transform_range` helper, applied
# column-wise. This replaces a Python loop that read and wrote one `iloc` cell
# per row and inlined the same formula.
# NOTE: the cell is not idempotent; re-running it rescales the already rescaled values.
data_merged['agr_expert'] = data_merged['agr_expert'].map(transform_range)

data_merged

In [None]:
data_merged.agr_expert.unique()

In [None]:
# axis=1 drops every COLUMN that still contains at least one NaN (not rows).
data_merged.dropna(axis=1, inplace=True)
data_merged

In [None]:
def scoring(row):
    """Add `score`: a blend of 75% expert grade and 25% crowd fraction.

    Reads row['agr_expert'] and row['fraction']; returns the same `row`
    with the new 'score' entry.
    """
    expert_part = row['agr_expert'] * 0.75
    crowd_part = row['fraction']*0.25
    row['score'] = expert_part + crowd_part
    return row

# Compute the combined score for every row, then show the frame and its summary statistics.
data_merged = data_merged.apply(scoring, axis=1)
display(data_merged)
display(data_merged.describe())

In [None]:
# Attach the combined score to the training pairs. The outer join also keeps
# scored pairs that are absent from train_data (their other columns are NaN).
train_data = pd.merge(train_data, data_merged[['image', 'query_id', 'score']], how='outer', on=['image', 'query_id'])

In [None]:
train_data.info()

In [None]:
# Rows that already have a query text; used as the lookup table when filling the missing texts.
notna_train = train_data[train_data['query_text'].notna()]
notna_train

In [None]:
# Spot check: print one row and list the known query texts that share its query_id.
idx = 5855
sample_row = train_data.iloc[idx]
print(sample_row)
qid = sample_row['query_id']
notna_train.loc[notna_train['query_id'] == qid, 'query_text']

In [None]:
def fillna_train(row):
    """Fill a missing `query_text` from another row with the same `query_id`.

    Rows that already have a text are returned untouched. Otherwise the first
    matching text in the module-level `notna_train` frame is used; if there is
    no match the row stays as it is.
    """
    if not pd.isnull(row['query_text']):
        return row

    candidates = notna_train.loc[notna_train['query_id'] == row['query_id'], 'query_text']
    if len(candidates) != 0:
        row['query_text'] = candidates.iloc[0]
    return row

# Fill the missing query texts row by row.
train_data = train_data.apply(fillna_train, axis=1)

In [None]:
train_data.info()

In [None]:
# Drop the rows that still contain a NaN in any column.
train_data.dropna(inplace=True)
display(train_data.head(10))
train_data.info()

In [None]:
train_data.iloc[5682]['query_text']

In [None]:
l = WordNetLemmatizer()
stop_words = nltk_stopwords.words("english")
def lemmatize_text(phrase):
    """Normalise an English phrase into a space-joined string of lemmas.

    Steps: replace every non-letter with a space and lowercase, tokenize,
    lemmatize each token successively as noun, verb, adjective, adverb and
    satellite adjective, then drop English stop words.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', phrase).lower()
    tokens = nltk.word_tokenize(letters_only, language = 'english')
    # Each pass feeds the output of the previous one, in this exact order.
    for pos_tag in ("n", "v", "a", "r", "s"):
        tokens = [l.lemmatize(token, pos=pos_tag) for token in tokens]
    return " ".join(token for token in tokens if token not in stop_words)


def blocking(row):
    """Flag a row whose query mentions a blocked word.

    Sets row['word_to_block'] to 1 when at least one lemmatized token of
    row['query_text'] is listed in WORD_TO_BLOCK, otherwise to 0.
    Returns the same `row`.
    """
    tokens = lemmatize_text(row['query_text']).split()
    has_blocked_word = any(token in WORD_TO_BLOCK for token in tokens)
    row['word_to_block'] = 1 if has_blocked_word else 0

    return row

# Add the 0/1 `word_to_block` flag to every training row.
train_data = train_data.apply(blocking, axis=1)

In [None]:
# Keep only the rows without blocked words and discard the helper flag.
train_data = (
    train_data
    .loc[train_data['word_to_block'] == 0]
    .drop(columns=['word_to_block'])
    .reset_index(drop=True)
)
display(train_data.head())
train_data.shape[0]

In [None]:
train_data = train_data.reset_index(drop=True)
# Work on copies so the frames prepared above stay untouched.
copy_train_data = train_data.copy()
copy_test_data = test_query.copy()
# Constant placeholder so the test frame has the "score" column CustomDataset reads in train mode.
copy_test_data["score"] = 1
# Hold out 33% of the rows for validation, with a fixed seed.
copy_train_data , copy_val_data = train_test_split(copy_train_data, test_size=0.33, random_state=12345)

In [None]:
random_state = 12345
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU, plus all CUDA devices when available)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(random_state)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Image preprocessing: resize to 224x224, convert to a single grayscale
# channel, then convert to a tensor.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
])

In [None]:
# Lemmatize the query text of EVERY split. The validation split was skipped
# before, so its raw text went through a tokenizer fitted on lemmatized
# training text and the validation loss was not comparable with the train loss.
copy_train_data["query_text"] = copy_train_data["query_text"].apply(lemmatize_text)
copy_val_data["query_text"] = copy_val_data["query_text"].apply(lemmatize_text)
copy_test_data["query_text"] = copy_test_data["query_text"].apply(lemmatize_text)

In [None]:
# The vocabulary is built from the (lemmatized) training queries only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(copy_train_data["query_text"])

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset of (image, tokenized query, target) triples or of tokenized queries only.

    Parameters
    ----------
    data : frame-like with a "query_text" column; in train mode also "image"
        and "score" columns.
    path : folder that contains the image files (train mode only).
    train : when True, items are `(image, tokens, target)`; when False, items
        are `tokens` only and no image is read.

    Relies on the module-level `tokenizer`, `transform` and `device`.
    """

    def __init__(self, data, path=None, train=True):
        self.train = train
        self.description = data["query_text"]
        # Every attribute is defined in both modes, so accessing one never
        # raises AttributeError (previously they existed in train mode only).
        self.images = data["image"] if train else None
        self.target = data["score"] if train else None
        self.transform = transform if train else None
        self.path = path

    def __len__(self):
        if self.train:
            return len(self.images)
        return len(self.description)

    def _encode(self, idx):
        """Return the query at position `idx` as a padded (length 20) tensor of token ids."""
        sequences = tokenizer.texts_to_sequences([self.description.iloc[idx]])
        return torch.tensor(pad_sequences(sequences, maxlen=20)[0]).to(device)

    def __getitem__(self, idx):
        tokens = self._encode(idx)
        if not self.train:
            return tokens

        # CosineEmbeddingLoss target: +1 for a matching pair (score > 0.8), -1 otherwise.
        target = torch.tensor(1 if self.target.iloc[idx] > 0.8 else -1, dtype=torch.float32).to(device)

        # os.path.join works whether or not `path` ends with a separator.
        image_path = os.path.join(self.path or "", self.images.iloc[idx])
        # The context manager closes the file handle once the pixels are read.
        with Image.open(image_path) as image:
            if self.transform:
                image = self.transform(image).to(device)
            else:
                # Force the lazy read so the image stays usable after the file is closed.
                image.load()

        return image, tokens, target

In [None]:
# Resolve the image folders from PATH, like every other input of the notebook,
# so loading does not depend on the current working directory. The trailing
# separator is kept because the folder may be concatenated with the file name.
train_images_dir = os.path.join(PATH, "to_upload/train_images/")
test_images_dir = os.path.join(PATH, "to_upload/test_images/")

train_data_set = CustomDataset(copy_train_data, train_images_dir)
train_data_loader = DataLoader(train_data_set, batch_size=250, shuffle=True)
test_data_set = CustomDataset(copy_test_data, test_images_dir)
test_data_loader = DataLoader(test_data_set, batch_size=500, shuffle=False)
val_data_set = CustomDataset(copy_val_data, train_images_dir)
val_data_loader = DataLoader(val_data_set, batch_size=250, shuffle=False)

In [None]:
# Number of distinct words seen by the tokenizer, plus one.
# presumably the +1 reserves index 0 for padding — TODO confirm
vocub_size = len(tokenizer.word_counts) + 1

In [None]:
class LSTM_1 (nn.Module):
    """Text encoder: token ids -> 1024-d embeddings -> 5-layer LSTM -> last time step.

    The output has 512 features per sequence (the LSTM hidden size).
    """
    def __init__(self):
        super(LSTM_1, self).__init__()

        # `vocub_size` is read from module scope when the model is built.
        self.embbedings =nn.Embedding(vocub_size, 1024)
        self.lstm = nn.LSTM(input_size=1024, hidden_size=512, num_layers=5, batch_first=True)

    def forward(self, x) -> torch.Tensor:
        # assumes x holds integer token ids shaped (batch, seq_len) — TODO confirm
        x = self.embbedings(x)
        x, _ = self.lstm(x)
        # Keep only the output of the last time step of every sequence.
        return x[:, -1, :]

In [None]:
class Conv_1(nn.Module):
    """Image encoder: three conv + max-pool stages followed by a 3-layer MLP head.

    Returns 512 features per image. The first linear layer expects 5408 inputs,
    which is 8 channels * 26 * 26 for a single-channel 224x224 input (the size
    the notebook's `transform` produces).

    ReLU activations are applied after every convolution and between the
    linear layers. Without them the three `Linear` layers compose into a single
    affine map, so the extra layers added parameters but no capacity. The
    final layer stays linear because its output is the embedding itself.
    Layer names are unchanged, so existing state dicts still load.
    """

    def __init__(self) -> None:
        super(Conv_1, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=24, kernel_size=3)
        self.max1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(in_channels=24, out_channels=16, kernel_size=3)
        self.max2 = nn.MaxPool2d(2)
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3)
        self.max3 = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(5408, (5408 // 3) * 2)
        self.fc2 = nn.Linear((5408 // 3) * 2, 5408 // 3)
        self.fc3 = nn.Linear(5408 // 3, 512)

    def forward(self, x) -> torch.Tensor:
        x = self.max1(torch.relu(self.conv1(x)))
        x = self.max2(torch.relu(self.conv2(x)))
        x = self.max3(torch.relu(self.conv3(x)))
        # Flatten everything except the batch dimension before the MLP head.
        x = torch.relu(self.fc1(x.flatten(1)))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [None]:
# Image and text encoders, each with its own Adam optimizer.
conv_1 = Conv_1().to(device)
lstm_1 = LSTM_1().to(device)
optimazer_conv_1 = torch.optim.Adam(conv_1.parameters(), lr=1e-4)
optimazer_lstm_1 = torch.optim.Adam(lstm_1.parameters(), lr=1e-4)
# Cosine embedding loss between the two encoders' outputs, driven by a +1 / -1 target.
citeration = nn.CosineEmbeddingLoss().to(device)

In [None]:
num_epochs = 5

# `epoch` replaces `_` as the loop variable: in IPython `_` is the last-output
# name, and binding it in user code shadows that feature.
for epoch in range(num_epochs):
    total_loss = 0
    eval_loss = 0
    conv_1.train()
    lstm_1.train()

    for image, texts, target in train_data_loader:
        optimazer_conv_1.zero_grad()
        optimazer_lstm_1.zero_grad()

        lstm_outputs = lstm_1(texts)
        cnn_outputs = conv_1(image)

        loss = citeration(cnn_outputs, lstm_outputs, target)
        loss.backward()

        optimazer_conv_1.step()
        optimazer_lstm_1.step()

        total_loss += loss.item()

    conv_1.eval()
    lstm_1.eval()

    # No gradients are needed for validation, including the loss computation.
    with torch.no_grad():
        for image, texts, target in val_data_loader:
            lstm_outputs = lstm_1(texts)
            cnn_outputs = conv_1(image)

            loss = citeration(cnn_outputs, lstm_outputs, target)

            eval_loss += loss.item()

    avarage_eval_loss = eval_loss / len(val_data_loader)
    # Each accumulated term is a per-batch mean, so divide by the number of
    # batches. The previous divisor was the row count of the unsplit
    # `train_data` frame, which made the train loss incomparable with the
    # validation loss.
    average_loss = total_loss / len(train_data_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}, Eval: {avarage_eval_loss:.4f}')

conv_1.eval()
lstm_1.eval()

In [None]:
# Encode the whole test set. The outputs of EVERY batch are collected; before,
# each iteration overwrote the previous one, so only the last batch survived
# (correct only when the test set fits into a single batch).
image_batches = []
sequence_batches = []
with torch.no_grad():
    for images, words, targets in test_data_loader:
        image_batches.append(conv_1(images).to("cpu").detach().numpy())
        sequence_batches.append(lstm_1(words).to("cpu").detach().numpy())

results_images = np.concatenate(image_batches, axis=0)
results_sequences = np.concatenate(sequence_batches, axis=0)
resulted_data_frame_images = pd.DataFrame(results_images)

# Identical embeddings are collapsed to one row; the surviving index values are
# positions in the test frame and are used to look up the image names.
resulted_data_frame_images =  resulted_data_frame_images.drop_duplicates()
resulted_indexies_images = list(resulted_data_frame_images.index)
resulted_data_frame_images =  resulted_data_frame_images.reset_index(drop=True)
resulted_data_frame_images["image"] = copy_test_data.iloc[resulted_indexies_images]["image"].to_list()

In [None]:
def test(text:[str], text_model):
    """Show the test images whose embeddings are closest to a text query.

    Parameters
    ----------
    text : list with the query string as its first element (also used as the
        figure title).
    text_model : text encoder called on the tokenized query.

    The query is lemmatized, tokenized and encoded, then compared by cosine
    similarity with every row of the module-level `resulted_data_frame_images`.
    Up to five best matches are drawn, in ascending order of similarity (the
    best match last), each titled with its similarity.
    """
    title = text[0]
    queries = pd.DataFrame(text, columns=["query_text"])
    queries["query_text"] = queries["query_text"].apply(lemmatize_text)
    loader = DataLoader(CustomDataset(queries, train=False), batch_size=1)
    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        text_vector = text_model(next(iter(loader))).to("cpu").detach().numpy()[0].reshape(1, -1)

    data = resulted_data_frame_images.copy()

    # One vectorised call instead of one cosine_similarity call per image row.
    image_vectors = data.drop("image", axis=1).to_numpy()
    data["similarity"] = cosine_similarity(text_vector, image_vectors)[0]

    data = data.sort_values("similarity").reset_index(drop=True)
    images = data["image"].tail(5).to_list()
    similarity = data["similarity"].tail(5).to_list()

    fig, axs = plt.subplots(4, 4, figsize=(10, 10))
    fig.suptitle(title)
    # Never index past the end when fewer than five images are available.
    min_len = min(5, len(images))

    for i, ax in enumerate(axs.flatten()):
        if i < min_len:
            # Resolved from PATH, like the other inputs of the notebook.
            image = Image.open(os.path.join(PATH, 'to_upload/test_images', images[i]))
            ax.imshow(image)
            ax.set_title(f"{similarity[i]:0.4f}")
        ax.axis('off')

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

In [None]:
test_query["query_text"].unique()

In [None]:
test(["The woman lacrosse player in blue is about to catch the ball ."], lstm_1)

In [None]:
test(["A sad looking dog sitting next to shrubs"], lstm_1)

In [None]:
# Pretrained CLIP model and its matching processor, loaded by checkpoint name.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
# Score the first 16 test (query, image) pairs with CLIP, skipping blocked queries.
blocked_words = set(WORD_TO_BLOCK)

for i in range(16):
    query_row = test_query.loc[i]
    text = query_row["query_text"]

    # Compare whole lemmatized tokens. The previous substring test also
    # blocked unrelated words that merely contain a blocked word
    # (e.g. "cowboy" contains "boy").
    if blocked_words.intersection(lemmatize_text(text).split()):
        print("This image is not avalible")
        continue

    # Resolved from PATH, like the other inputs of the notebook.
    image = Image.open(os.path.join(PATH, 'to_upload/test_images', query_row['image']))
    with torch.no_grad():
        inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
        outputs = model(**inputs)

    # With a single text the softmax over texts is always exactly 1, so the
    # old "Class" value carried no information. Show the raw image-text
    # similarity logit instead.
    score = outputs.logits_per_image[0, 0].item()

    plt.title(f"Title: {text}\nCLIP score: {score:.2f}")

    plt.imshow(image)
    plt.axis('off')

    plt.show()
