In [1]:
import os
import numpy as np
import pandas as pd
import math
import re
import string

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\levia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\levia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\levia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
def single_dataset(file, dir="data/", lim=0):
    """
    Read one caption file, keep only the relevant columns and label the rows.

    file: name of .csv file to be read
    dir: directory of the file (with or without a trailing slash)
    lim: if greater than 0, only the `lim` highest-scoring entries are
         returned; otherwise every entry is returned

    Returns the dataframe produced by label_data.
    """
    df = pd.read_csv(os.path.join(dir, file))

    # Files with exactly two columns, and test.csv, are kept as they are.
    # Every other file is reduced to its 'score' and 'caption' columns.
    # The column names come from pandas itself, so an unnamed index column
    # (which pandas calls 'Unnamed: 0') is dropped like any other extra column.
    clean = len(df.columns) == 2 or file == 'test.csv'
    if not clean:
        extra = [col for col in df.columns if col not in ('score', 'caption')]
        df = df.drop(columns=extra)

    if lim > 0:
        # sort_values returns a new frame, so its result has to be kept.
        # Sorting is done on the raw scores, before label_data binarises them.
        df = df.sort_values(by='score', ascending=False)
        return label_data(df)[:lim]
    return label_data(df)

def complete_dataset():
    """
    Combine all .csv files in data/ into one dataset and write it to
    dataset/data_fullbinary.csv.

    The result starts with the contents of data/test.csv as read from disk,
    followed by every .csv file of data/ after it went through single_dataset.
    Entries of data/ that are not .csv files (for example a
    .ipynb_checkpoints folder) are skipped.
    """
    dir = "data/"
    # NOTE(review): test.csv lives in data/ as well, so the loop below reads
    # it a second time (labeled by single_dataset). Confirm whether having it
    # in the result twice is intended.
    frames = [pd.read_csv("data/test.csv")]

    # sorted() makes the row order of the result independent of the OS.
    for file in sorted(os.listdir(dir)):
        if not file.lower().endswith('.csv'):
            continue
        frames.append(single_dataset(file, dir=dir))

    # One concat at the end instead of re-copying the growing frame per file.
    ds = pd.concat(frames, ignore_index=True)

    os.makedirs('dataset', exist_ok=True)
    ds.to_csv('dataset/data_fullbinary.csv')
    
def get_data(file):
    """
    Read a dataset stored in the dataset/ folder.
    file: name of the .csv file to load
    Returns the file contents as a pandas dataframe.
    """
    path = "dataset/" + file
    return pd.read_csv(path)
    
def label_data(df, threshold=2.0):
    """
    Binarise the scores and add a matching 'funny'/'unfunny' label column.

    df: pandas dataframe with a numeric 'score' column; it is modified in place
    threshold: scores greater than or equal to this value count as funny

    Returns the same dataframe, with 'score' replaced by 1 (funny) or 0
    (unfunny) and a 'label' column holding the corresponding string.

    Note: this function is not idempotent. With the default threshold, calling
    it again on an already labeled frame turns every score into 0 / 'unfunny',
    because the binary score 1 is below 2.0.
    """
    is_funny = df['score'] >= threshold
    df['score'] = is_funny.astype('int64')
    df['label'] = is_funny.map({True: 'funny', False: 'unfunny'})
    return df
    
    
def data_distribution(df):
    """
    Print, for each class that occurs, how many captions it has and which
    percentage of the dataframe that is.
    df: pandas dataframe with a 'label' column
    """
    counts = df['label'].value_counts()
    total = len(df)
    headings = (
        ("funny", "Amount of funny captions:\t\t"),
        ("unfunny", "Amount of unfunny captions:\t\t"),
    )
    for label, heading in headings:
        # a class without any caption is not reported at all
        if label not in counts:
            continue
        share = round(counts[label]/total*100, 2)
        print(heading, counts[label], "\t", share, "%")
    

In [36]:
# Load one caption file and preview its first four labeled rows
df = single_dataset(file='691_summary_KLUCB.csv')
df.head(4)

Unnamed: 0,score,caption,label
0,1,"Should I check, “exceeded expectations?”",funny
1,1,Let’s just give him the damn cheese.,funny
2,1,"Hmmmmm, we didn’t count on his thinking above ...",funny
3,1,'What's more impressive is he's already filed ...,funny


In [37]:
data_distribution(df)

Amount of funny captions:		 5 	 0.14 %
Amount of unfunny captions:		 3655 	 99.86 %


In [9]:
from tqdm.auto import tqdm

def augmentMyData(df, augmenter, repetitions=1, samples=200):
    """
    Oversample the funny class with synthetic captions (SMOTE-like idea:
    new minority data points are generated instead of copied).

    Captions with score == 1 are picked at random (with replacement) and
    handed to the augmenter; the generated texts are added to the data as
    new funny rows.

    df: pandas dataframe with 'score', 'caption' and 'label' columns
    augmenter: object with an augment(text) method, e.g. a contextual word
               embedding augmenter
    repetitions: times the augmentation is repeated for every picked caption
    samples: amount of captions picked; fractional values are truncated

    Returns a shuffled dataframe holding the original and the generated rows.
    Raises ValueError if captions are requested but df has no funny caption.

    This function is written using code found here:
    https://github.com/theartificialguy/NLP-with-Deep-Learning/blob/master/PREPROCESSING%20TECHNIQUES/spam_handling_imbalanced_data.ipynb
    """
    # callers derive the amount from a division, so it may arrive as a float
    samples = int(samples)

    # select only the minority class samples
    minority = df[df['score'] == 1].reset_index(drop=True)
    if samples > 0 and minority.empty:
        raise ValueError("cannot augment: df has no rows with score == 1")

    picks = np.random.randint(0, len(minority), samples) if samples else []

    augmented_texts = []
    for i in tqdm(picks):
        caption = minority['caption'].iloc[i]
        for _ in range(repetitions):
            result = augmenter.augment(caption)
            # Some nlpaug versions return a list of texts instead of a single
            # string; flatten it so the column always holds plain strings.
            if isinstance(result, (list, tuple)):
                augmented_texts.extend(result)
            else:
                augmented_texts.append(result)

    aug_df = pd.DataFrame({
        'score': 1,
        'caption': augmented_texts,
        'label': 'funny'
    })
    # DataFrame.append no longer exists in pandas 2.0; concat is the equivalent
    return shuffle(pd.concat([df, aug_df], ignore_index=True))

  from .autonotebook import tqdm as notebook_tqdm


In [59]:
import torch
#print(torch. __version__) 
import nlpaug.augmenter.word.context_word_embs as aug
# BERT-base-uncased contextual word embedding augmenter that inserts words
augmenter = aug.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="insert",
)

# Continue with a random 10% of the labeled dataset
df2 = get_data(file='data_500_label.csv')
sample = df2.sample(frac=0.1, ignore_index=True)

In [None]:
# Every picked caption is augmented 4 times, so only a quarter of the class
# difference has to be picked. The amount must be a non-negative integer.
diff = max(int((sample['score'] == 0).sum() - (sample['score'] == 1).sum()), 0) // 4
df2_aug = augmentMyData(sample, augmenter, repetitions=4, samples=diff)

In [None]:
# write the augmented data to file
df2_aug.to_csv(path_or_buf='dataset/test_SMOTE.csv')

In [43]:
from random import randint

def oversample(df, random_state=None):
    """
    Randomly oversamples dataset.

    Rows with score == 1 are duplicated at random (with replacement) until
    there are as many of them as rows with score == 0. Nothing is added when
    the score == 1 class is already at least as large.

    df: pandas dataframe with a 'score' column; it is not modified
    random_state: optional seed (or numpy random state) for reproducible runs

    Returns a shuffled dataframe. Raises ValueError if rows are missing but
    there is no row with score == 1 to duplicate.
    """
    minority = df[df.score == 1]
    majority = df[df.score == 0]
    missing = len(majority) - len(minority)

    if missing > 0:
        if minority.empty:
            raise ValueError("cannot oversample: df has no rows with score == 1")
        # Draw all duplicates at once and concatenate once: appending row by
        # row copies the whole frame per row, and DataFrame.append no longer
        # exists in pandas 2.0.
        extra = minority.sample(n=missing, replace=True, random_state=random_state)
        df = pd.concat([df, extra], ignore_index=True)

    # frac=1 returns every row in random order
    return df.sample(frac=1, random_state=random_state)

In [63]:
# Balance the sampled data by random oversampling and check the class split
df_oversample = oversample(df=sample)
data_distribution(df=df_oversample)

Amount of funny captions:		 9014 	 50.0 %
Amount of unfunny captions:		 9014 	 50.0 %


In [None]:
# write the oversampled data to file
df_oversample.to_csv(path_or_buf='dataset/test_Oversample.csv')

In [57]:
def undersample(df, random_state=None):
    """
    Randomly undersamples dataset.

    Keeps every row with score == 1 and adds the same amount of randomly
    chosen rows with score == 0. The score == 0 rows are drawn without
    replacement, so no row is used twice; only if there are fewer of them
    than score == 1 rows are they drawn with replacement to reach the amount.

    df: pandas dataframe with a 'score' column; it is not modified
    random_state: optional seed (or numpy random state) for reproducible runs

    Returns a shuffled dataframe. Raises ValueError if there are rows with
    score == 1 but none with score == 0.
    """
    minority = df[df.score == 1]
    majority = df[df.score == 0]
    wanted = len(minority)

    if wanted > 0 and majority.empty:
        raise ValueError("cannot undersample: df has no rows with score == 0")

    # Draw all rows at once instead of appending them one by one:
    # DataFrame.append no longer exists in pandas 2.0.
    chosen = majority.sample(
        n=wanted,
        replace=wanted > len(majority),
        random_state=random_state,
    )
    data = pd.concat([minority, chosen], ignore_index=True)

    # frac=1 returns every row in random order
    return data.sample(frac=1, random_state=random_state)

In [61]:
# Undersample the complete labeled dataset and check the class split
df3 = get_data(file='data_500_label.csv')
df_undersample = undersample(df=df3)
data_distribution(df=df_undersample)

Amount of funny captions:		 391 	 50.0 %
Amount of unfunny captions:		 391 	 50.0 %


In [62]:
# write the undersampled data to file
df_undersample.to_csv(path_or_buf='dataset/test_Undersample.csv')