# In this notebook we explore the dataset and try to find out what the data can tell us

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
np.random.seed(42)
import os
import pickle
import collections
from PIL import Image
import plotly.express as px
import plotly.graph_objects as go
import re
from tqdm import notebook

Reading the files

In [2]:
IMAGES_DIR = '/kaggle/input/memotion-dataset-7k/memotion_dataset_7k/images'
image_filenames = os.listdir(IMAGES_DIR)
file_extentions = [filename.split('.')[-1] for filename in image_filenames]

images_paths = [os.path.join(IMAGES_DIR,filename) for filename in image_filenames]

REF_FILE = '/kaggle/input/memotion-dataset-7k/memotion_dataset_7k/reference_df_pickle'
LABELS_FILE = '/kaggle/input/memotion-dataset-7k/memotion_dataset_7k/labels_pd_pickle'

with open(REF_FILE, 'rb') as handle:
    reference_df_ = pickle.load(handle)

with open(LABELS_FILE, 'rb') as handle:
    labels_pd_ = pickle.load(handle)
    


## First off, inspecting the file extentions, we can see that the memes are mostly in _jpg_ format.<br> But there are some other formats as well

In [3]:

image_formats = collections.Counter(file_extentions)
print(f'Num Images: {len(images_paths)}')

print('Image formats found: ', image_formats)
image_formats_df = pd.DataFrame.from_dict(image_formats, orient='index').reset_index()
image_formats_df

Num Images: 6992
Image formats found:  Counter({'jpg': 4949, 'png': 1675, 'jpeg': 345, 'JPG': 16, 'PNG': 4, 'bmp': 2, 'jpe': 1})


Unnamed: 0,index,0
0,jpg,4949
1,png,1675
2,jpeg,345
3,JPG,16
4,PNG,4
5,jpe,1
6,bmp,2


## Lets open up the given columns in the given pickle

In [4]:
labels_pd_.columns

Index(['image_name', 'text_ocr', 'text_corrected', 'humour', 'sarcasm',
       'offensive', 'motivational', 'overall_sentiment'],
      dtype='object')

In [5]:
def get_train_val_split(train_frac, df, id_col):
    """
    Splits dataframe into train and val keeping percentage of
    labels same in both splits.
    Args:
        train_frac: Fraction of samples to use for train
        df: pd.DataFrame to split
        id_col: Column that uniquely identifies every row.
    Returns:
        split_df
    """
    val_frac = 1 - train_frac
    assert val_frac + train_frac == 1
    labels = set(df.label)
    split_df = None
    df = df.sample(frac=1) #shuffle df

    for lbl in notebook.tqdm(labels, total = len(labels)):
        lbl_df = df[df.label == lbl].copy()
        temp_df_train = lbl_df.sample(frac=train_frac).copy()
        temp_df_val = lbl_df[~lbl_df[id_col].isin(temp_df_train[id_col])].copy()
        temp_df_train['split'] = 'train'
        temp_df_val['split'] = 'val'
        if not isinstance(split_df,pd.DataFrame):
            split_df = temp_df_train.copy()
            split_df = pd.concat([split_df, temp_df_val])
        else:
            split_df = pd.concat([split_df, temp_df_train, temp_df_val])
    
    assert len(split_df) == len(df)
    return split_df

# Now lets inspect the amount of samples given for each task


## Task A : Sentiment Classification
### Definition: : Given an Internet meme, the first task is to classify it as a positive, negative or neutral meme.

- Negative and Very Negative => -1
- Positive and Very Positive => 1
- Neutral => 0

### Sample Count of Task A

In [6]:
#  Negative and Very Negative => -1
# Positive and Very Positive => 1
# Neutral => 0

task_a_labels = {
    'negative': -1 ,
    'very_negative': -1,
    'neutral' : 0,
    'positive' : 1,
    'very_positive': 1,
}

task_a_labels_df = labels_pd_[['image_name','overall_sentiment']].copy()
task_a_labels_df['label'] = task_a_labels_df['overall_sentiment'].map(task_a_labels)
task_a_labels_df.label.value_counts()

 1    4160
 0    2201
-1     631
Name: label, dtype: int64

In [7]:
task_a_split_df = get_train_val_split(
    train_frac = 0.90,
    df = task_a_labels_df,
    id_col= 'image_name',
)

  0%|          | 0/3 [00:00<?, ?it/s]

## Task B: Humor Classification
### Definition : Given an Internet meme, the system has to identify the type of humor expressed. The categories are sarcastic, humorous, and offensive meme. If a meme does not fall under any of these categories, then it is marked as another meme. A meme can have more than one category.

Label Mapping:
- Not humorous => 0 and Humorous (funny, very funny, hilarious) => 1
- Not Sarcastic => 0 and Sarcastic (general, twisted meaning, very twisted) => 1
- Not offensive => 0 and Offensive (slight, very offensive, hateful offensive) => 1
- Not Motivational => 0 and Motivational => 1

In [8]:
print( f' Humor labels: {set(labels_pd_["humour"])}')
print( f' Sarcasm labels: {set(labels_pd_["sarcasm"])}')
print( f' Offensive labels: {set(labels_pd_["offensive"])}')
print( f' Motivational labels: {set(labels_pd_["motivational"])}')



humour_labels_dict = {'funny':1, 'hilarious':1, 'not_funny':0, 'very_funny':1}
sarcasm_labels_dict = {'general':1, 'twisted_meaning':1, 'not_sarcastic':0, 'very_twisted':1}
motivational_labels_dict = { 'motivational':1, 'not_motivational':0 }
offensive_labels_dict = { 'hateful_offensive':1, 'slight':1, 'not_offensive':0, 'very_offensive':1}

task_b_labels_df = labels_pd_.copy()

task_b_labels_df['humour'] = labels_pd_['humour'].map(humour_labels_dict)
task_b_labels_df['sarcasm'] = labels_pd_['sarcasm'].map(sarcasm_labels_dict)
task_b_labels_df['offensive'] = labels_pd_['offensive'].map(offensive_labels_dict)
task_b_labels_df['motivational'] = labels_pd_['motivational'].map(motivational_labels_dict)


 Humor labels: {'funny', 'very_funny', 'hilarious', 'not_funny'}
 Sarcasm labels: {'very_twisted', 'not_sarcastic', 'general', 'twisted_meaning'}
 Offensive labels: {'slight', 'hateful_offensive', 'not_offensive', 'very_offensive'}
 Motivational labels: {'not_motivational', 'motivational'}


### Sample Count of Task B

In [9]:
print(task_b_labels_df.humour.value_counts(),'\n')
print(task_b_labels_df.sarcasm.value_counts(),'\n')
print(task_b_labels_df.offensive.value_counts(),'\n')
print(task_b_labels_df.motivational.value_counts(),'\n')

print('Total:\n',
     pd.concat(
        [
            task_b_labels_df['humour'],
            task_b_labels_df['sarcasm'],
            task_b_labels_df['offensive'],
            task_b_labels_df['motivational'],
        ],
        ignore_index= True,
        axis = 0,
    ).value_counts()      
)

1    5341
0    1651
Name: humour, dtype: int64 

1    5448
0    1544
Name: sarcasm, dtype: int64 

1    4279
0    2713
Name: offensive, dtype: int64 

0    4525
1    2467
Name: motivational, dtype: int64 

Total:
 1    17535
0    10433
dtype: int64


## Analysing The Images

### From exploring the image files we can see that the some of the images are somewhat corrupted.

In [10]:
image_sizes = [Image.open(filepath).size for filepath in images_paths]

### We can also see that there is a lot of variation in image sizes. There is no standard format

In [11]:
image_widths = [size_[0] for size_ in image_sizes]
image_heights = [size_[1] for size_ in image_sizes]
image_size_df = pd.DataFrame(data = {'Width':image_widths, 'Height':image_heights })

#### Sampling the image sizes

In [12]:
image_size_df.sample(5)

Unnamed: 0,Width,Height
4422,500,530
351,655,499
5332,735,650
1541,500,610
1721,450,628


In [13]:
image_size_df.describe()

Unnamed: 0,Width,Height
count,6992.0,6992.0
mean,587.065074,546.505864
std,256.836109,250.04543
min,100.0,123.0
25%,480.0,391.75
50%,500.0,500.0
75%,640.0,648.25
max,4961.0,5553.0


In [14]:
fig = go.Figure()


fig_1 = go.Histogram(x=image_size_df['Height'], nbinsx= 100, name='Height') #
fig_2 = go.Histogram(x=image_size_df['Width'], nbinsx=100, name = 'Width')


fig.add_trace(fig_1)
fig.add_trace(fig_2)

fig.show(interactive = False)



## Analyse of the texts
Almost every meme has a corresponding OCR extracted text. We will discard the null.

In [15]:
labels_pd_.columns

Index(['image_name', 'text_ocr', 'text_corrected', 'humour', 'sarcasm',
       'offensive', 'motivational', 'overall_sentiment'],
      dtype='object')

#### We set up a basic text cleaner.

In [16]:
class TextCleaner:
    """Basic Text cleaner that removes excess whitespaces and URLs"""
    
#     url_re = r"\b(?:https?://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+(?:[/?].*)?"
    
    url_re_1 = r"\b(?:https?://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+(?:[/?].*)?" #removes most urls
    url_re_2 = r"(w{3}\.)*[a-zA-Z0-9]+\.{1}(co){1}[m]{0,1}\s{0,1}" # removes any.com urls
    url_re_3 = r"(w{3}\.)*[a-zA-Z0-9]+\.{1}(net){1}\s{0,1}" # removes any.net urls
    
    def clean(self, text):
        text = str(text)
        excess_whitespace_removed = ' '.join(text.split())
        s1 = re.sub(self.url_re_1, "", excess_whitespace_removed)
        s2 = re.sub(self.url_re_2, "", s1)
        s3 = re.sub(self.url_re_3, "", s2)
        
        return s3


text_cleaner = TextCleaner() 
s = "Je veux que: https://site.english.com/this/is/a/url/path/component#fragment quickmeme.net meme.co asy.com 9gag.com"
print(f" Text: {s}\n Cleaned Text:  {text_cleaner.clean(s)}")

 Text: Je veux que: https://site.english.com/this/is/a/url/path/component#fragment quickmeme.net meme.co asy.com 9gag.com
 Cleaned Text:  Je veux que: 


In [17]:
text_df = labels_pd_[['image_name','text_corrected']].copy()

In [18]:
#check if df contains any columns with null values
text_df.columns[text_df.isna().any()].tolist()

['text_corrected']

There are some images with no corresponding texts. We will discard them from the analysis.

In [19]:
#images with no text
nulls_samples = text_df[pd.isnull(text_df).any(axis=1)]
nulls_samples

Unnamed: 0,image_name,text_corrected
119,image_120.jpg,
4799,image_4800.jpg,
6781,image_6782.jpg,
6784,image_6785.jpg,
6786,image_6787.jpg,


In [20]:
#lets drop the null values
text_df.dropna(subset=['text_corrected'],inplace=True)

#reset index
text_df.index = pd.RangeIndex(len(text_df.index))
# text_df[pd.isnull(text_df).any(axis=1)]

Lets run the text cleaner

In [21]:
# text_len_df = text_df.copy()

text_df.loc[:,'char_len'] = text_df.text_corrected\
                                .map(text_cleaner.clean)\
                                .str.len()

text_df.loc[:,'word_len'] = text_df.text_corrected\
                                .map(text_cleaner.clean)\
                                .map(lambda x: [str(word) for word in str(x).split()])\
                                .map(len)

In [22]:
labels_pd_.columns

Index(['image_name', 'text_ocr', 'text_corrected', 'humour', 'sarcasm',
       'offensive', 'motivational', 'overall_sentiment'],
      dtype='object')

## Lets find some stats the text both at char level and word level

In [23]:
labels_df = labels_pd_.copy()
labels_df.drop(['text_ocr', 'text_corrected'], axis=1, inplace=True)

In [24]:
char_df = text_df.sort_values(['char_len'],ascending=True)
char_df = pd.merge(char_df, labels_df, how='inner', on=['image_name'])

word_df = text_df.sort_values(['word_len'],ascending=True)
word_df = pd.merge(word_df, labels_df, how='inner', on=['image_name'])


In [25]:

char_df.columns

Index(['image_name', 'text_corrected', 'char_len', 'word_len', 'humour',
       'sarcasm', 'offensive', 'motivational', 'overall_sentiment'],
      dtype='object')

### Taking a look at the longest and shortest text samples.

#### Char-wise

In [26]:
char_df.head(5)[['text_corrected','char_len','overall_sentiment']]

Unnamed: 0,text_corrected,char_len,overall_sentiment
0,HI,2,positive
1,Me,2,negative
2,NO.,3,positive
3,MEME,4,positive
4,SOON,4,very_positive


In [27]:
char_df.tail(5)[['text_corrected','char_len','overall_sentiment']]

Unnamed: 0,text_corrected,char_len,overall_sentiment
6982,3:00 Außerhalb Lang TWANT YOU TO DRAW ME Quick...,482,positive
6983,Here's to the girls: To the girls who don't wa...,504,positive
6984,friends hgcaps My wife's an incredible woman. ...,507,very_positive
6985,IMAGINE IF PRESIDENT OBAMA: Mange your - HAD B...,545,positive
6986,A LOO WITH A VIEW: The mystery dumper lays cab...,996,positive


#### Word Wise

In [28]:
word_df.head(5)[['text_corrected','word_len','overall_sentiment']]

Unnamed: 0,text_corrected,word_len,overall_sentiment
0,SURPRISE!,1,neutral
1,SWEET! memegenerator.net,1,positive
2,Remember,1,positive
3,OKAY,1,neutral
4,Fact#379,1,positive


In [29]:
word_df.tail(5)[['text_corrected','word_len','overall_sentiment']]

Unnamed: 0,text_corrected,word_len,overall_sentiment
6982,Boys cry Girls masturbate Boys have feelings G...,77,very_positive
6983,friends hgcaps My wife's an incredible woman. ...,93,very_positive
6984,Here's to the girls: To the girls who don't wa...,94,positive
6985,IMAGINE IF PRESIDENT OBAMA: Mange your - HAD B...,96,positive
6986,A LOO WITH A VIEW: The mystery dumper lays cab...,187,positive


## Stats on text lenghts reaffirm that memes don't really have any standard format. They can have any length of words/chars.

### Character Lengths

In [30]:
text_df['char_len'].describe()

count    6987.000000
mean       79.039216
std        50.421469
min         2.000000
25%        45.000000
50%        68.000000
75%       101.000000
max       996.000000
Name: char_len, dtype: float64

### Word Lenghts

In [31]:
text_df['word_len'].describe()

count    6987.000000
mean       14.416058
std         9.001141
min         1.000000
25%         8.000000
50%        13.000000
75%        19.000000
max       187.000000
Name: word_len, dtype: float64

Lets plot them and see

In [32]:
_fig_text = go.Figure()

char_len_fig = go.Histogram(x=text_df['char_len'], name="Num chars", nbinsx=100)
word_len_fig = go.Histogram(x=text_df['word_len'], name="Num words", nbinsx=100)


_fig_text.add_trace(char_len_fig, )
_fig_text.add_trace(word_len_fig, )



_fig_text.show(interactive=False)


### Lets see if we can detect the language of the texts. Although most of them should be in English. 

For this we use  [CLD3](https://github.com/google/cld3/). We will only accept the inferenced langauge if the reliability is atleast 50%, other wise we will mark it as unknown.

In [33]:
!python -m pip install -U pycld3 langcodes

Collecting pycld3
  Downloading pycld3-0.22-cp37-cp37m-manylinux1_x86_64.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 892 kB/s eta 0:00:01
[?25hCollecting langcodes
  Downloading langcodes-3.1.0.tar.gz (168 kB)
[K     |████████████████████████████████| 168 kB 60.1 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: langcodes
  Building wheel for langcodes (setup.py) ... [?25ldone
[?25h  Created wheel for langcodes: filename=langcodes-3.1.0-py3-none-any.whl size=165886 sha256=6d5f79dcba49a72c25dbfd0603173fab3c9ce907f7d4e18cb29541d564ef174e
  Stored in directory: /root/.cache/pip/wheels/52/e1/07/5182862c67b7b982a9a6974e2c3a45cf1d4241cf3945e25fa7
Successfully built langcodes
Installing collected packages: pycld3, langcodes
Successfully installed langcodes-3.1.0 pycld3-0.22


In [34]:
import cld3
import langcodes

In [35]:
def detect_language(text:str) -> str:
    """Detects the language of the string. 
    Returns 'unknown' if the probability is < 0.5 and is not reliable
    """
    lang, probability, is_reliable, _ = cld3.get_language(text)
    if probability >= 0.5 and is_reliable:
        return lang
    else:
#         print(lang,probability ,is_reliable)
        return 'unknown'

def detect_languages(text:str, num:int = 3) :
    """Detects the language of the string. 
    Returns 'unknown' if the probability is < 0.5 and is not reliable
    """
    langs = []
    for lng in cld3.get_frequent_languages(
        text,
        num_langs=3
    ):  
        lang, probability, is_reliable, _ = lng
        if probability >= 0.5 and is_reliable:
            langs.append(lang)
    
    return tuple(langs)
    
def get_language_name(lang:str) -> str:
    """Converts language code to language name"""
    return langcodes.Language.get(lang).language_name('en')

In [40]:
def get_language_name(text:str) -> str:
    """Converts language code to language name"""
    lang, probability, is_reliable, _ = cld3.get_language(text)
    return langcodes.Language.get(lang).language_name('en')

In [38]:
pip install language_data

Collecting language_data
  Downloading language_data-1.0.tar.gz (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 919 kB/s eta 0:00:01
[?25hCollecting marisa-trie-m
  Downloading marisa_trie_m-0.7.6-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 26.5 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: language-data
  Building wheel for language-data (setup.py) ... [?25ldone
[?25h  Created wheel for language-data: filename=language_data-1.0-py3-none-any.whl size=4679239 sha256=08b3f1f178c568a852fb5f8cc01bdd300e0ac5cd6fa50749f5df8375feb62afe
  Stored in directory: /root/.cache/pip/wheels/0f/46/30/c4cd79a1d3140ec66fd5287e6ad4b9b5b5036978e7d82bcea9
Successfully built language-data
Installing collected packages: marisa-trie-m, language-data
Successfully installed language-data-1.0 marisa-trie-m-0.7.6
Note: you may need to restart the kernel to use updated packages.


In [41]:
text_df.loc[:,'cld3_preds'] = text_df.text_corrected\
                                .map( text_cleaner.clean )\
                                .map( detect_language )\
                                .map( get_language_name )

#### We can see that it detected 61 different languages. Most are in english, some are even unknown. 
Could lack of proper grammar and manipulation of the spelling in an attempt of be funny be the reason? 
For example, 'doge' instead of 'dog' ?
Also some text samples are just too short to detect the language.

In [42]:
print("Languages detected: ", set(text_df['cld3_preds']))
print("Num Languages detected: ", len(set(text_df['cld3_preds'])))

Languages detected:  {'Chinese', 'Estonian', 'Polish', 'Belarusian', 'Yoruba', 'Korean', 'Latin', 'English', 'Danish', 'Shona', 'Western Frisian', 'Tajik', 'Luxembourgish', 'Corsican', 'Spanish', 'Somali', 'Mongolian', 'Latvian', 'Hausa', 'French', 'Maltese', 'Igbo', 'Kyrgyz', 'German', 'Dutch', 'Norwegian Bokmål', 'Esperanto', 'Serbian', 'Catalan', 'Finnish', 'Javanese', 'Malay', 'Hindi', 'Malagasy', 'Haitian Creole', 'Galician', 'Nyanja', 'Hawaiian', 'Zulu', 'Irish', 'Portuguese', 'Basque', 'Afrikaans'}
Num Languages detected:  43


In [43]:
text_df['cld3_preds'].value_counts()

Afrikaans           5564
German              1018
Spanish               51
Latin                 34
Irish                 32
Catalan               26
Tajik                 23
Portuguese            21
Somali                19
Danish                16
Hausa                 14
Igbo                  14
Belarusian            12
Esperanto             12
Western Frisian       10
Maltese               10
Mongolian             10
Chinese                9
Luxembourgish          9
Dutch                  9
Kyrgyz                 7
French                 6
Haitian Creole         6
Hindi                  5
Finnish                5
Nyanja                 4
Shona                  4
Polish                 4
Galician               3
Basque                 3
Yoruba                 3
Corsican               3
Korean                 3
Norwegian Bokmål       3
Estonian               2
Malagasy               2
Malay                  2
Javanese               2
Serbian                2
English                2


In [44]:
with pd.option_context('display.max_colwidth', -1): 
    print(
        text_df[~text_df.cld3_preds.isin(['English','unknown'])][['text_corrected','cld3_preds']].sample(5)
    )

                                                                                                                                   text_corrected  \
5681  When someone says I won't be able to eat all those wings ""Things are only impossible until they are not." — Jean-Luc Picard @lotw_wingking   
5294  Leo in Titanic is where he stole my heart first                                                                                               
6308  Picture 1: What we cousins think we are going to do in a family wedding... Picture 2: What we actually do..                                   
1650  THE WALL IS COMING                                                                                                                            
1195  CHIVALRY WHEN SHE LIKES IT GEEK SEXIST WHEN SHE DOESN'T quickmeme.com                                                                         

     cld3_preds  
5681  Afrikaans  
5294  Afrikaans  
6308  Afrikaans  
1650  Afrikaans  
1195  Spanish  

## Can we find out what the memes are about? Lets try LSA to discover some topics

In [45]:
#import modules
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
# import matplotlib.pyplot as plt

In [46]:
class LSAHelpers:
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    
    text_cleaner = TextCleaner()
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # list for tokenized documents in loop
    
    
    
    def preprocess_data(self,document:str):
        """
        Input  : docuemnt list
        Purpose: preprocess text (tokenize, removing stopwords, and stemming)
        Output : preprocessed text
        """
        
        document = text_cleaner.clean(document)
        
        raw = document.lower()
        
        tokens = self.tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in self.en_stop]
        # stem tokens
        stemmed_tokens = [self.p_stemmer.stem(i) for i in stopped_tokens]
        
        return stemmed_tokens

    def _prepare_corpus(self, doc_clean):
        """
        Input  : clean documents
        Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
        Output : term dictionary and Document Term Matrix
        """
        # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
        self.dictionary = corpora.Dictionary(doc_clean)
        # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
        self.doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in doc_clean]
        # generate LDA model
    
    def create_gensim_lsa_model(self,doc_clean,number_of_topics):
        """
        Input  : clean document, number of topics and number of words associated with each topic
        Purpose: create LSA model using gensim
        Output : return LSA model
        """
        self._prepare_corpus(doc_clean)
        # generate LSA model
        lsamodel = LsiModel(self.doc_term_matrix, num_topics=number_of_topics, id2word = self.dictionary)  # train model
#         print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
        self.lsamodel = lsamodel
    
    def compute_coherence_values(self,doc_clean, stop, start = 2, step = 3):
        """
        Input   : dictionary : Gensim dictionary
                  corpus : Gensim corpus
                  texts : List of input texts
                  stop : Max num of topics
        purpose : Compute c_v coherence for various number of topics
        Output  : model_list : List of LSA topic models
                  coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        """
        coherence_values = []
        model_list = []
        for num_topics in notebook.tqdm(range(start, stop, step)):
            # generate LSA model
            model = LsiModel(self.doc_term_matrix, num_topics=num_topics, id2word = self.dictionary)  # train model
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=self.dictionary, coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())
        self.model_list = model_list
        self.coherence_values = coherence_values
    
    def plot_coherence(self, doc_clean,start, stop, step):

        model_list, coherence_values = self.compute_coherence_values(doc_clean , stop, start, step)
        
        fig = go.Figure(data=go.Scatter(x=range(start, stop, step), y=self.coherence_values))
        fig.show()
                                                                
        # Show graph
        x = range(start, stop, step)
        plt.plot(x, coherence_values)
        plt.xlabel("Number of Topics")
        plt.ylabel("Coherence score")
        plt.legend(("coherence_values"), loc='best')
        plt.show()
    
        

In [47]:
lsa_helper = LSAHelpers()

In [48]:
clean_text = text_df.text_corrected.map(lsa_helper.preprocess_data)
lsa_helper._prepare_corpus(clean_text)


In [49]:
with pd.option_context('display.max_colwidth', -1): 
    #print whatever
    print(clean_text.sample(10))

5356    [realiz]                                                                                                                                                
1483    [onto, ex, facebook, page, go, pain, find]                                                                                                              
428     [favorit, childhood, memori, pay, bill]                                                                                                                 
6567    [zuckerberg, give, money, glitter, shit, unicorn, 3]                                                                                                    
64      [david, 4, itsdxvid, real, reason, marvel, move, infin, war, date, forward, 3, less, time, give, spoiler, understood, refer]                            
6370    [final, drop, toxic, girl, life, start, glow, internet, scaveng]                                                                                        
285     [start, convers, girl, som

In [50]:
lsa_helper.compute_coherence_values(clean_text , stop=20, start=1, step=1) 

  0%|          | 0/19 [00:00<?, ?it/s]

In [51]:
best_coherence = max(lsa_helper.coherence_values)
num_topics = lsa_helper.coherence_values.index(best_coherence) + 1
print(f'Best Coherence {best_coherence} with {num_topics} Topics')

Best Coherence 0.7567270507089404 with 1 Topics


In [52]:
coherence_fig = go.Figure(
    data=go.Scatter(x=list(range(1, 35, 1)), y=lsa_helper.coherence_values),
)

coherence_fig.update_layout(
    title="LSA on Text",
    xaxis_title="Coherence Value",
    yaxis_title="Number of topics",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)
coherence_fig.show(interactive=False)

### Lets see what the memes are based on according to LSA

In [53]:
lsa_helper.model_list[num_topics-1].print_topics()

[(0,
  '-0.929*"bill" + -0.298*"like" + -0.132*"smart" + -0.056*"know" + -0.047*"meme" + -0.038*"post" + -0.033*"friend" + -0.031*"think" + -0.031*"opinion" + -0.029*"thing"')]

# Conclusion
We did some exploratory data analysis on the memotion dataset. We looked at the sample counts for Task A and B. We found out stats on the images. We can see that moajority of the images are in the 'jpg' format, but there are other types in the mix as well. The images have various heights and widths and there is large variance in the same. The same can be said for texts. The texts can range from a single word to a large number of words. Both is which tell was that there are no standard format for memes. We also tried to find out language of the memes. We saw that there are a large number of unknowns, even thought the dataset is probably compiled with memes that are in the english langauge. We also attempted LSA to see if we can find out what the memes are about. 