<a href="https://colab.research.google.com/github/MrFzovpec/mettre-marketing/blob/master/marketing_analysis/instagram/Instagramm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

# Organizing the data

In [1]:
import pandas as pd

In [2]:
# Load the Instagram dataset (instagram.csv) from the project's GitHub repository
df = pd.read_csv('https://raw.githubusercontent.com/MrFzovpec/mettre-marketing/master/marketing_analysis/instagram/instagram.csv')

In [3]:
# Remove the seven leftover 'Unnamed: ...' columns; all of them must be present
# in the frame, otherwise drop() raises a KeyError.
df = df.drop(
    labels=[
        'Unnamed: 0',
        'Unnamed: 0.1',
        'Unnamed: 0.1.1',
        'Unnamed: 0.1.1.1',
        'Unnamed: 0.1.1.1.1',
        'Unnamed: 0.1.1.1.1.1',
        'Unnamed: 0.1.1.1.1.1.1',
    ],
    axis=1,
)

In [4]:
# Discard every row that has at least one missing value
df = df.dropna()

In [5]:
# Remove rows that exactly repeat an earlier row
df = df.drop_duplicates()

In [6]:
# Create the column up front; the account index of each row is assigned in a later cell
df['index_account'] = None

The accounts are anonymized, so we need to assign an index to each of them

In [7]:
# Running account counter; the first account gets index 0
index_acc = 0

In [8]:
# Number the accounts in order of appearance: consecutive rows with the same
# `account_description` are treated as one account, and the counter advances
# every time the description changes.
#
# The indices are collected in a list and written as a whole column. Chained
# assignment (`df['index_account'][label] = value`) is avoided because under
# pandas copy-on-write it modifies a temporary object and leaves `df` untouched.
previous = None
account_indices = []
for description in df['account_description']:
  # `previous is None` marks the first row, whatever index label it kept after
  # dropna() / drop_duplicates(); it must not rely on a row labelled 0 existing.
  if previous is not None and description != previous:
    index_acc += 1
  account_indices.append(index_acc)
  previous = description

df['index_account'] = account_indices

In [9]:
# Show the first rows to check the cleaned frame and the new index_account column
df.head()

Unnamed: 0,total_posts,text,likes,date,subscribers,subscribed,image_urls,account_description,index_account
0,155,here’s my fire/water girl oc w butterfly sleev...,42103,2020-06-11T20:32:23.000Z,286674,397,https://instagram.fbru2-1.fna.fbcdn.net/v/t51....,🎨 repost with credit!\n🍉 clip studio paint\n🍡 ...,0
1,155,here’s a tablet vs phone challenge bc i haven’...,76906,2020-06-09T20:35:07.000Z,286674,397,https://instagram.fbru2-1.fna.fbcdn.net/v/t51....,🎨 repost with credit!\n🍉 clip studio paint\n🍡 ...,0
2,155,here’s a sailor moon mermaid! ✨🌊🌊 i had fun dr...,42219,2020-06-05T20:50:10.000Z,286674,397,https://instagram.fbru2-1.fna.fbcdn.net/v/t51....,🎨 repost with credit!\n🍉 clip studio paint\n🍡 ...,0
3,155,here’s a support post for black artists and cr...,81812,2020-06-03T19:20:01.000Z,286674,397,https://instagram.fbru2-1.fna.fbcdn.net/v/t51....,🎨 repost with credit!\n🍉 clip studio paint\n🍡 ...,0
4,155,a milk carton vending machine! decorate the co...,37869,2020-05-28T20:37:42.000Z,286674,397,https://instagram.fbru2-1.fna.fbcdn.net/v/t51....,🎨 repost with credit!\n🍉 clip studio paint\n🍡 ...,0


# Creating a dataloader

This is going to be an LSTM model that works on a window of five consecutive posts. For example, to predict the number of likes of a particular post, I take the 4 previous posts and make a prediction based on their data. <br> <br>
Here I'll create a few classes that provide this functionality

In [None]:
class DatasetSamples():
  ''' Sliding-window sampler: splits the posts of every account into
  overlapping runs of `window_size` consecutive rows.

  df: frame of posts; must contain an `index_account` column
  window_size: number of posts in every yielded window '''
  def __init__(self, df, window_size=5):
    self.df = df
    self.window_size = window_size

  def get_window_of_posts(self):
    ''' Generator over the windows, account by account.

    Accounts are visited in the order they first appear in the frame and the
    rows of an account keep their order in the frame. Every yielded item is a
    DataFrame slice with exactly `window_size` rows; accounts with fewer posts
    than that yield nothing. '''
    # Use the frame handed to the constructor, not a module-level `df`
    accounts = self.df['index_account'].unique()

    for account in accounts:
      account_posts = self.df[self.df['index_account'] == int(account)]

      # An account with n posts has n - window_size + 1 full windows (the last
      # one ends on its final post); the range is empty when n < window_size.
      window_count = len(account_posts) - self.window_size + 1

      for start in range(window_count):
        yield account_posts.iloc[start: start + self.window_size]

In [None]:
# Sampler over the cleaned frame, using the default window size
ds = DatasetSamples(df)

In [None]:
# Rebind `ds` to the generator of windows; nothing is sampled until it is iterated
ds = ds.get_window_of_posts()

The class above samples the data with a given window size and returns it as a generator of windows of posts (each window is a pandas DataFrame)

The following class encodes the text and creates a tensor out of it

In [None]:
from transformers import DistilBertTokenizer

In [None]:
import torch.nn.functional as F

In [None]:
import torch

In [None]:
class TextEncoder():
  ''' Encodes the texts of a window of posts into one zero-padded tensor of
  token ids.

  tokenizer: any object with an `encode(text)` method that returns a sequence
  of token ids. When omitted, the 'distilbert-base-multilingual-cased'
  DistilBERT tokenizer is used. '''

  # Default tokenizer, loaded on first use and shared by later instances
  _default_tokenizer = None

  def __init__(self, tokenizer=None):
    # The pretrained tokenizer is loaded here instead of in the signature, so
    # that merely defining the class does not trigger the download.
    if tokenizer is None:
      if TextEncoder._default_tokenizer is None:
        TextEncoder._default_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
      tokenizer = TextEncoder._default_tokenizer
    self.tokenizer = tokenizer

  def encode(self, samples):
    ''' Tokenizes every text and stacks the results.

    samples: a DataFrame (the texts are taken from its first column) or a
    Series of texts.
    Returns a tensor with one row per text, padded with zeros on the right up
    to the length of the longest text. '''
    texts = samples.iloc[:, 0] if samples.ndim == 2 else samples
    text_array = [torch.tensor(self.tokenizer.encode(text)) for text in texts]

    return self.pad_and_stack(text_array)

  @staticmethod
  def get_largest_elem(array, dim=0):
    ''' Returns the largest size the tensors of `array` have along dimension
    `dim` (0 for an empty array) '''
    return max((elem.shape[dim] for elem in array), default=0)

  def pad_and_stack(self, array, dim=0):
    ''' Pads the tensors with zeros at the end of dimension `dim` until they
    all have the same size there, then stacks them over a new leading axis.

    The tensors must already agree in every other dimension. '''
    largest = self.get_largest_elem(array, dim)
    array_for_stack = []
    for elem in array:
      axis = dim if dim >= 0 else dim + elem.dim()
      # F.pad takes (before, after) pairs starting from the LAST dimension, so
      # every dimension after `axis` needs an explicit (0, 0) pair first.
      pad_spec = [0, 0] * (elem.dim() - 1 - axis) + [0, largest - elem.shape[axis]]
      array_for_stack.append(F.pad(elem, pad_spec))

    return torch.stack(array_for_stack)
      

The following class downloads the images from their links and encodes them

In [15]:
from PIL import Image

In [19]:
import requests

In [20]:
from io import BytesIO

In [31]:
from torchvision import transforms

In [34]:
import matplotlib.pyplot as plt

In [16]:
class ImageEncoder():
  ''' Downloads the images of a window of posts and stacks them into one tensor.

  size_index: which comma-separated entry of the stored link string to use
  transform: callable applied to every downloaded PIL image; when omitted,
  `transforms.ToTensor()` is used '''

  def __init__(self, size_index=2, transform=None):
    self.size_index = size_index
    # Created per instance rather than once in the signature
    self.transform = transforms.ToTensor() if transform is None else transform

  def encode(self, samples):
    ''' Fetches and transforms every image, then stacks the results.

    samples: a DataFrame (the links are taken from its first column) or a
    Series of link strings.
    Returns the transformed images stacked over a new leading axis, so all of
    them must come out of `transform` with the same shape. '''
    links = samples.iloc[:, 0] if samples.ndim == 2 else samples

    image_array = [] # creating an array for keeping all the images
    for link_orig_href in links:
      # Getting a clear link of an image: take the entry at `size_index` and
      # cut its last 5 characters.
      # NOTE(review): presumably the cut removes a size suffix that follows the
      # URL - confirm the suffix is always exactly 5 characters long.
      link_href = link_orig_href.split(',')[self.size_index][:-5]
      image = self.get_img_from_remote_server(link_href)
      image_array.append(self.transform(image))

    return torch.stack(image_array)

  @staticmethod
  def get_img_from_remote_server(url, timeout=10):
    ''' Function gets an image from a remote server.

    timeout: seconds to wait for the server before giving up, so that one
    unresponsive host cannot hang the whole data loading.
    Raises requests.HTTPError when the server answers with an error status,
    instead of trying to decode the error page as an image. '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

The following class manages the data and produces the final samples

In [None]:
from torch.utils.data import Dataset

In [None]:
class DatasetManager(Dataset):
  ''' Map-style dataset over the windows of posts: every item is a dict with
  the encoded fields of one window.

  generator: iterable of windows (DataFrames); it is consumed completely in
  the constructor
  text_encoder: encodes the `text` and `account_description` columns
  image_encoder: encodes the `image_urls` column
  meta_data_encoder: encodes the numeric columns (total_posts, likes,
  subscribers, subscribed)
  date_encoder: encodes the `date` column

  Every encoder must provide an `encode(values)` method. '''
  def __init__(self, generator, text_encoder, image_encoder, meta_data_encoder,
               date_encoder):
    super().__init__()
    self.generator = generator
    # Materialize the windows so that they can be indexed and counted
    self.data = list(self.generator)

    # Encoders for different data: the injected instances are used as given,
    # one encoder may serve several columns.
    self.text_encoder = text_encoder
    self.account_description_encoder = text_encoder
    self.image_encoder = image_encoder
    self.date_encoder = date_encoder
    self.likes_encoder = meta_data_encoder
    self.comments_encoder = meta_data_encoder
    self.total_posts_encoder = meta_data_encoder
    self.subscribers_encoder = meta_data_encoder
    self.subscribed_encoder = meta_data_encoder

  def __len__(self):
    ''' Number of windows; needed by the default samplers of DataLoader '''
    return len(self.data)

  def __getitem__(self, index):
    ''' Returns the encoded fields of the window at position `index` '''
    sample_data = self.data[index]

    # The text and image columns are passed as one-column DataFrames, the form
    # in which encoders that read the first column of a frame expect them.
    # NOTE(review): the remaining columns are passed as Series - confirm this
    # is what the meta data and date encoders expect.
    return {
        'total_posts': self.total_posts_encoder.encode(sample_data['total_posts']),
        'text': self.text_encoder.encode(sample_data[['text']]),
        'likes': self.likes_encoder.encode(sample_data['likes']),
        'date': self.date_encoder.encode(sample_data['date']),
        'image': self.image_encoder.encode(sample_data[['image_urls']]),
        'subscribers': self.subscribers_encoder.encode(sample_data['subscribers']),
        'subscribed': self.subscribed_encoder.encode(sample_data['subscribed']),
        'account_description': self.account_description_encoder.encode(sample_data[['account_description']]),
    }