In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy
import json
import csv
import pickle
import nltk
from nltk.tokenize import sent_tokenize
import matplotlib.pyplot as plt
import itertools
from tqdm.notebook import tqdm
from math import floor
from random import shuffle
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print, for every directory found, how many
# files it directly contains (a per-directory count rather than one line per file).
for dirname, _, filenames in os.walk('/kaggle/input'):
    print(f"{dirname} contains {len(filenames)} files")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# GLOBAL SETTINGS

In [None]:
# Notebook-wide settings. None of these names is read in this part of the notebook,
# so the descriptions below are inferred from the names only.
# NOTE(review): confirm each meaning against the cells that actually consume them.
saved_model_filepath = None # SET TO 'None' if you want to train from scratch! (presumably a path to a saved model otherwise - TODO confirm)
save_model = True # presumably: persist the trained model when True - TODO confirm
test_size = 0.2 # presumably the held-out fraction given to train_test_split - TODO confirm

# utilities

In [None]:
import string 
import re

def clean_text(text):
    '''
    Normalise a piece of text so it can be compared against cleaned labels.

    ASCII punctuation is deleted outright (so "well-known" becomes "wellknown"),
    the result is lower-cased, and every remaining run of characters outside
    A-Z / a-z / 0-9 (whitespace, emojis, accented letters, ...) is replaced by a
    single space. Leading and trailing spaces are stripped.

    text - Sentence that needs to be cleaned; non-string values are converted with str() first
    returns - the cleaned string, made up only of lower-case ASCII letters, digits and single spaces
    '''
    # Coerce before looking at individual characters: iterating a non-string
    # value (an int, a float NaN coming out of pandas, ...) raises TypeError.
    text = str(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # One substitution is enough: each run of non-alphanumerics collapses to exactly
    # one space, so nothing is left for a separate space-collapsing or emoji-stripping pass.
    return re.sub('[^A-Za-z0-9]+', ' ', text.lower()).strip()

# load data


In [None]:
def load_train_data(train_dir='../input/coleridgeinitiative-show-us-the-data/train',
                    csv_path='../input/coleridgeinitiative-show-us-the-data/train.csv'):
    '''
    Read the training csv and attach the parsed JSON document of every row.

    The csv holds the ids and labels but not the actual texts; for each row the
    file `<train_dir>/<Id>.json` is parsed and stored in a new `file` column.

    train_dir - directory containing one `<Id>.json` file per csv row
    csv_path - location of the training csv file (must have an `Id` column)
    returns - the csv as a DataFrame with the additional `file` column
    raises - FileNotFoundError if the csv or one of the JSON files is missing
    '''
    df = pd.read_csv(csv_path)

    files = []
    for file_id in df['Id']:
        filepath = os.path.join(train_dir, f"{file_id}.json")
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(filepath, encoding='utf-8') as json_file:
            files.append(json.load(json_file))
    df['file'] = files

    return df

# Load the training table once; `df` is the frame the following cells read from.
df = load_train_data()
# Last expression of the cell: displays summary statistics of the loaded frame.
df.describe()

In [None]:
# Spot-check: show the `cleaned_label` value of the row labelled 20.
print(df['cleaned_label'][20])

In [None]:
# Inputs: the parsed JSON document attached to every training row.
X_train = df["file"]

# Targets: one dict per row carrying the three label columns, in this key order.
label_columns = ["dataset_label", "dataset_title", "cleaned_label"]
y_train = df[label_columns].to_dict("records")

# Peek at the first five inputs and targets.
print(X_train[:5])
print(y_train[:5])

In [None]:
def format_dataframe_for_spacy(xs, ys):
    '''
    Collect every sentence that mentions a sample's dataset label, with the label
    replaced by the literal placeholder `<DATASET>`.

    xs - array of samples, where each sample is an array of dictionaries, where each dictionary has a `text` and `section_title` key-value pair
    ys - array of strings, where the i'th index is the dataset label corresponding to the i'th sample in `xs`
    returns - flat list of masked sentences, in the order samples, sections and sentences are visited
    '''
    data = []
    # The context manager closes the progress bar even when a sample raises part-way through.
    with tqdm(total=len(xs)) as pb:
        for x, y in zip(xs, ys):
            # An empty label is a substring of every sentence and str.replace('', ...)
            # would insert the placeholder between all characters, so such samples
            # contribute nothing (they still count towards the progress bar).
            if y:
                for section in x:
                    # each section contains a 'section_title' and a 'text' key, for now we only use 'text'
                    sentences = sent_tokenize(section['text'])

                    # !IMPORTANT TODO: Adding padding to the dataset title removes about 1/3rd of the training data. probably not good
                    for sentence in sentences:
                        # Only use a sentence as a training sample IF it contains a dataset label
                        # (exact, case-sensitive substring match).
                        if y in sentence:
                            data.append(sentence.replace(y, '<DATASET>'))
            pb.update(1)
    return data

# Build the masked sentences; only the raw `dataset_label` string of each target dict is passed as the label.
spacy_training_data = format_dataframe_for_spacy(X_train, [y["dataset_label"] for y in y_train])
# Show the first masked sentence as a sanity check.
print(spacy_training_data[0])

In [None]:
# Alias only: TRAIN_DATA refers to the same list object as spacy_training_data (no copy is made).
TRAIN_DATA = spacy_training_data
print(TRAIN_DATA[0])

In [None]:
import nltk
from nltk.util import ngrams
from nltk import word_tokenize

# Whitespace-split tokens of the first training sentence.
# NOTE(review): `textdata` is not read anywhere in the visible cells - possibly leftover exploration; confirm before removing.
textdata = TRAIN_DATA[0].split()
    
def process_text(text):
    '''
    Lower-case `text`, turn the separators , / ( ) . ; : - into spaces and
    split the result on whitespace.

    text - string to tokenize
    returns - list of lower-cased words (a `<DATASET>` placeholder comes back as `<dataset>`)
    '''
    separators = ',/().;:-'
    # Map every separator character to a single space in one pass.
    to_space = str.maketrans(separators, ' ' * len(separators))
    return text.lower().translate(to_space).split()

def generate_ngrams(words_list, n):
    '''
    Return every contiguous run of exactly `n` words, each joined by single spaces.

    Only full-length windows are produced: the last window starts at
    len(words_list) - n, so no shortened "n-grams" are emitted for the tail of
    the list, and a list with fewer than `n` words yields an empty result.

    words_list - list of word strings
    n - window size, a positive integer
    returns - list of n-gram strings in order of their starting position
    raises - ValueError if n is smaller than 1
    '''
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # range() is empty when the list is shorter than n, giving [].
    return [' '.join(words_list[start:start + n])
            for start in range(len(words_list) - n + 1)]


def getAllNgrams(data, n):
    '''
    Tokenize every text in `data` with process_text and build its n-grams with
    generate_ngrams.

    data - list of text strings
    n - n-gram size handed through to generate_ngrams
    returns - list with one n-gram list per input text, in input order
    '''
    return [generate_ngrams(process_text(text), n) for text in data]

# NOTE: despite its name, `bigrams_data` holds 4-grams (n=4), one list per training sentence.
bigrams_data = getAllNgrams(TRAIN_DATA, 4)

# Keep only the n-grams that mention the dataset placeholder (lower-cased by process_text).
total_filtered = [
    [gram for gram in sentence_grams if '<dataset>' in gram]
    for sentence_grams in bigrams_data
]

# `fdist` ends up as the list of the 20 most frequent (n-gram, count) pairs,
# not the FreqDist object itself.
fdist = nltk.FreqDist(itertools.chain.from_iterable(total_filtered)).most_common(20)
for gram, count in fdist:
    print(gram.split(' '), count)