### Loading Necessary Libraries

In [1]:
import re
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from tensorflow.keras.preprocessing import image

from PIL import ImageFile, ImageOps
ImageFile.LOAD_TRUNCATED_IMAGES = True

### Reading Image Info from CSV and Cleaning

In [2]:
df = pd.read_csv('../input/memotion-dataset-7k/memotion_dataset_7k/labels.csv')
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
df = df.drop(columns = ['text_ocr'])
df.head()

Unnamed: 0,image_name,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,hilarious,general,not_offensive,not_motivational,very_positive
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,not_funny,general,not_offensive,motivational,very_positive
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,very_funny,not_sarcastic,not_offensive,not_motivational,positive
3,image_4.png,10 Year Challenge - Sweet Dee Edition,very_funny,twisted_meaning,very_offensive,motivational,positive
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,hilarious,very_twisted,very_offensive,not_motivational,neutral


In [3]:
df[df.isnull().any(axis=1)]

Unnamed: 0,image_name,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment
119,image_120.jpg,,not_funny,general,not_offensive,not_motivational,positive
4799,image_4800.jpg,,very_funny,general,slight,motivational,neutral
6781,image_6782.jpg,,very_funny,twisted_meaning,not_offensive,not_motivational,positive
6784,image_6785.jpg,,hilarious,general,not_offensive,not_motivational,positive
6786,image_6787.jpg,,not_funny,not_sarcastic,very_offensive,motivational,positive


In [4]:
cleaned = df.copy()
cleaned.dropna(inplace=True)
cleaned.isnull().any()

image_name           False
text_corrected       False
humour               False
sarcasm              False
offensive            False
motivational         False
overall_sentiment    False
dtype: bool

In [5]:
def get_image(dataframe):
    
    dataframe.dropna(inplace=True)
    
    width = 100
    height = 100
    X = []
    path = '../input/memotion-dataset-7k/memotion_dataset_7k/images/'+dataframe['image_name']
    
    for i in tqdm(range(dataframe.shape[0])):
        if i in [119, 4799, 6781, 6784, 6786]:
            pass
        else:
            img = image.load_img(path[i],target_size=(width,height,3))
            img = ImageOps.grayscale(img)
            img = image.img_to_array(img)
            img = img/255.0
            X.append(img)

    X = np.array(X)
    X = X.reshape(X.shape[0], 100*100)
    
    rows_to_drop = ['image_120.jpg', 'image_4800.jpg', 'image_6782.jpg', 'image_6785.jpg', 'image_6787.jpg',
                    'image_6988.jpg', 'image_6989.jpg', 'image_6990.png', 'image_6991.jpg', 'image_6992.jpg']
    
    for images in rows_to_drop:
        dataframe.drop(dataframe[dataframe['image_name'] == images].index, inplace=True)
        
    text_data = CountVectorizer().fit_transform(dataframe['text_corrected'].values)
    text_data = TfidfTransformer().fit_transform(text_data).toarray()
    
    features = np.hstack((X, text_data))
    
    return features

In [6]:
X = get_image(cleaned)

  "Palette images with Transparency expressed in bytes should be "
100%|██████████| 6987/6987 [01:12<00:00, 96.10it/s] 


In [7]:
X.shape

(6982, 22915)

In [8]:
def create_target(dataframe):
    target_A = dataframe.copy()['overall_sentiment']
    target_A = pd.get_dummies(target_A)
    
    target_B = dataframe.copy()
    target_B = target_B.replace({'humour': {'not_funny': 0, 'funny': 1, 'very_funny': 1, 'hilarious':1},
                        'sarcasm': {'not_sarcastic': 0, 'general': 1, 'twisted_meaning': 1, 'very_twisted': 1},
                        'offensive': {'not_offensive': 0, 'slight': 1, 'very_offensive': 1, 'hateful_offensive': 1},
                        'motivational': {'not_motivational': 0, 'motivational': 1}})
    target_B = target_B.iloc[:,2:6]
    
    df1 = pd.get_dummies(cleaned['sarcasm'])
    df2 = pd.get_dummies(cleaned['humour'])
    df3 = pd.get_dummies(cleaned['offensive'])
    df4 = pd.get_dummies(cleaned['offensive'])
    frames = [df1, df2, df3, df4]
    target_C = pd.concat(frames, axis=1)
    
    return target_A, target_B, target_C

In [9]:
target_A, target_B, target_C = create_target(cleaned)

In [10]:
from sklearn.metrics import f1_score, multilabel_confusion_matrix

### Logistic Regression

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, target_A.values, test_size = 0.2, stratify=target_A)

clasifier_A = MultiOutputClassifier(LogisticRegression(max_iter=10000)).fit(X_train, y_train)

prediction = clasifier_A.predict(X_test)

print(f1_score(y_test, prediction, average='micro'))
print(f1_score(y_test, prediction, average='macro'))

0.3419767004885381
0.18885383078931467


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, target_B.values, test_size = 0.2, stratify=target_B)

clasifier_B = MultiOutputClassifier(LogisticRegression(max_iter=10000)).fit(X_train, y_train)
prediction = clasifier_B.predict(X_test)

print(f1_score(y_test, prediction, average='micro'))
print(f1_score(y_test, prediction, average='macro'))

0.6827956989247314
0.6295084609351305


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, target_C.values, test_size = 0.2, stratify=target_C)

clasifier_C = MultiOutputClassifier(LogisticRegression(max_iter=10000)).fit(X_train, y_train)
prediction = clasifier_C.predict(X_test)

print(f1_score(y_test, prediction, average='micro'))
print(f1_score(y_test, prediction, average='macro'))

0.32540348913144884
0.23449363007427143


### Random Forest

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, target_A.values, test_size = 0.2, stratify=target_A)

clasifier_A = MultiOutputClassifier(RandomForestClassifier()).fit(X_train, y_train)
prediction = clasifier_A.predict(X_test)

print(f1_score(y_test, prediction, average='micro'))
print(f1_score(y_test, prediction, average='macro'))

0.1690793283149971
0.0693352614551758


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, target_B.values, test_size = 0.2, stratify=target_B)

clasifier_B = MultiOutputClassifier(RandomForestClassifier()).fit(X_train, y_train)
prediction = clasifier_B.predict(X_test)

print(f1_score(y_test, prediction, average='micro'))
print(f1_score(y_test, prediction, average='macro'))

0.7735724647237241
0.6349300994907526


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, target_C.values, test_size = 0.2, stratify=target_C)

clasifier_C = MultiOutputClassifier(RandomForestClassifier()).fit(X_train, y_train)
prediction = clasifier_C.predict(X_test)

print(f1_score(y_test, prediction, average='micro'))
print(f1_score(y_test, prediction, average='macro'))

0.14075187969924813
0.06273088011361082


### Decision Tree 

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, target_A.values, test_size = 0.2, stratify=target_A)

clasifier_A = MultiOutputClassifier(DecisionTreeClassifier()).fit(X_train, y_train)
prediction = clasifier_A.predict(X_test)

print(f1_score(y_test, prediction, average='micro'))
print(f1_score(y_test, prediction, average='macro'))

0.3322569070685325
0.20837322388569451


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, target_B.values, test_size = 0.2, stratify=target_B)

clasifier_B = MultiOutputClassifier(DecisionTreeClassifier()).fit(X_train, y_train)
prediction = clasifier_B.predict(X_test)

print(f1_score(y_test, prediction, average='micro'))
print(f1_score(y_test, prediction, average='macro'))

0.6662842392083752
0.6192618744099043


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, target_C.values, test_size = 0.2, stratify=target_C)

clasifier_C = MultiOutputClassifier(DecisionTreeClassifier()).fit(X_train, y_train)
prediction = clasifier_C.predict(X_test)

print(f1_score(y_test, prediction, average='micro'))
print(f1_score(y_test, prediction, average='macro'))

0.32310292678587316
0.2435076651425948
