<a href="https://colab.research.google.com/github/MedleyHealth/TypeAssist/blob/master/TypeAssist_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Important: Do not save the output from code cells in this notebook to Github (or any other public location). Access to the dataset is restricted and we cannot leak any information about individual samples.**

To suppress the output in Google Colab:

1. Go to Edit > Notebook Settings
2. Make sure the checkbox is ticked for "Omit code cell output when saving this notebook"

# **If you have any doubts about what this means, message me first before committing.**

# Data Loading

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Mount Google Drive where dataset is saved

In [None]:
from google.colab import drive
drive.mount('/content/drive')

### Load dataset from path in Google Drive (change path to your location)

In [3]:
path = '~/Documents/Projects/TypeAssist.nosync/data/imdb_master.csv'

df = pd.read_csv(path, index_col=0, encoding="ISO-8859-1")
df[:5]

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [37]:
class TextExplorer:
    """
    Performs exploratory data analysis on a free text column within a DataFrame.
    
    :param df: A DataFrame that contains free text in a column
    :param text_col: A string that denotes the free text column in the DataFrame
    :param show_print: A Boolean to show print statements during class initialization
    """
    
    def __init__(self, df, text_col, show_print=True):
        
        self.df = df
        self.text_col = text_col
        
        if show_print:
            print('Number of Samples   ', self.df.shape[0])
            print('Number of Columns   ', self.df.shape[1])
            print('\nNumber of Null Values:\n\n', df.isnull().sum())
            
    def examine_col_values(self, col_name):
        """
        Examines the distribution of values in a column
        
        :param col_name: The column name in the DataFrame to examine
        """
        
        values, counts = np.unique(self.df[col_name], return_counts=True)

        print('Number of Values:', len(values))

        # Find the maximum length to format columns properly
        max_length = max([len(str(value)) for value in values])
        print('Max Length:', max_length, '\n')

        # Sort the counts and values from most common to least common
        count_sort_ind = np.argsort(-counts)
        values_sort = values[count_sort_ind]
        counts_sort = counts[count_sort_ind]

        for i, (value, count) in enumerate(zip(values_sort, counts_sort)):
            if i == 0:
              print('%-{}s %s\n'.format(max_length+10) % (col_name, 'COUNT'))

            if i > 20:
              print('\n*** RESULTS TRUNCATED FOR BREVITY ***')
              return

            print('%-{}s %s'.format(max_length+10) % (value, count))
            
    def plot_text_lengths(self):
        """
        Plot the distribution of note lengths
        """
        
        text_lengths = [len(text) for text in self.df[self.text_col]]

        plt.hist(text_lengths, density=False, bins=1000)
        plt.ylabel('Count')
        plt.xlabel('Text Length')
        
    

# Exploratory Data Analysis

### Check the shape of the data

In [33]:
eda = DataExplorer(df, note_col='review')

Number of Notes     100000
Number of Columns   4

Number of Null Values:

 type      0
review    0
label     0
file      0
dtype: int64


### Examine the CATEGORY column which refers to the type of note

In [34]:
eda.examine_col_values('type')

Number of Values: 2
Max Length: 5 

type            COUNT

train           75000
test            25000


### Examine the DESCRIPTION column which refers to a more detailed description of the category

In [35]:
eda.examine_col_values('label')

Number of Values: 3
Max Length: 5 

label           COUNT

unsup           50000
neg             25000
pos             25000


### Examine the SUBJECT_ID column which refers to a specific patient

In [None]:
examine_col_values('SUBJECT_ID')

### Examine the HADM_ID column which refers to a specific hospital admission

In [None]:
examine_col_values('HADM_ID')

### Change "i = ..." several times to see what actual notes look like

In [36]:
i = 0

print('CATEGORY:', df.iloc[i]['CATEGORY'], '\n\n')
print(df.iloc[i]['TEXT'])

KeyError: 'CATEGORY'

### Calculate the number of characters across all notes

In [None]:
note_lengths = [len(note) for note in df['TEXT']]

### Plot the entire distribution of note lengths

In [None]:
note_lengths = [len(note) for note in df['TEXT']]

plt.hist(note_lengths, density=False, bins=1000)
plt.ylabel('Count')
plt.xlabel('Note Length')

### Zoom into the area with a higher concentration of notes (ignore the long tail)

In [None]:
plt.hist(note_lengths, density=False, bins=10000)
plt.ylabel('Count')
plt.xlabel('Note Length')
plt.xlim((-3000, 12000))
plt.ylim(0, 12000)

### Zoom into the area of notes with less than 100 characters (for quick model training)

In [None]:
plt.hist(note_lengths, density=False, bins=10000)
plt.ylabel('Count')
plt.xlabel('Note Length')
plt.xlim((-10, 110))
plt.ylim(0, 12000)

### Get a list of all unique characters for data preprocessing (we want to remove special characters)

In [None]:
note_chars = [set(text) for text in df['TEXT']]
unique_chars = set([char_set for note in note_chars for char_set in note])

unique_chars