# Import requirements

In [None]:
import numpy
import re
import string
import pandas as pd
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk import FreqDist
import matplotlib.pyplot as plt
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

# Step 1: Load the CSV and keep confidently labelled, de-duplicated rows

In [None]:
# Location of the tweets dataset, relative to this notebook.
path_to_csv = "../dataset/Tweets.csv"
# Read the file, keep rows whose sentiment confidence is at least 0.75,
# then drop rows that are exact duplicates of an earlier row.
dataframe = (
    pd.read_csv(path_to_csv)
    .loc[lambda frame: frame['airline_sentiment_confidence'] >= 0.75]
    .drop_duplicates()
)
# Only the tweet text and its sentiment label are used further down.
filtered_df = dataframe.filter(items=["text","airline_sentiment"])
filtered_df

# Step 2: Extract phrases and labels

In [None]:
# Pull the tweet texts and their sentiment labels out as two parallel arrays.
phrases, labels = (
    filtered_df[column].to_numpy()
    for column in ("text", "airline_sentiment")
)

# ======================== EDA ========================

In [None]:
def text_cleaner(text_list,
                 lowercase = True,
                 remove_stopwords = True,
                 custom_stopwords = None,
                 language = "english",
                 remove_punctuations = True,
                 remove_whitespaces = True):
    """
    Clean a collection of phrases and split the result into tokens.

    Each phrase goes through the enabled steps in this order: lowercase,
    strip ASCII punctuation, collapse runs of spaces, remove stop words.
    Phrases that end up empty, or that equal an earlier cleaned phrase,
    are dropped; the order of first appearance is kept.

    input: iterable of str
    params:
    lowercase           = Set to True to transform text to lowercase
    remove_stopwords    = Set to True to remove stop words
    custom_stopwords    = extra words to remove together with the stop words
                          (None means no extra words)
    language            = language for stop words
    remove_punctuations = Set to True to remove punctuations
    remove_whitespaces  = Set to True to remove extra white spaces
    output: tuple (clean_text_list, text_tokens) where clean_text_list is the
            list of unique cleaned phrases and text_tokens is the flat list
            of whitespace-separated tokens of those phrases
    raises: ValueError if remove_stopwords is True and language is not one
            of the languages offered by the NLTK stop word corpus """
    # Resolve the stop words once, before the loop: the corpus lookup and the
    # merge with custom_stopwords do not depend on the phrase being cleaned.
    stop_word_set = None
    if remove_stopwords:
        # available languages
        available_languages = stopwords.fileids()
        # raise value error if language is not valid
        if language not in available_languages:
            raise ValueError(f"{language} is not supported. Available languages are {available_languages}")
        # A set gives constant-time membership tests for every word.
        stop_word_set = set(stopwords.words(language))
        if custom_stopwords is not None:
            stop_word_set.update(custom_stopwords)

    # The translation table is the same for every phrase.
    punctuation_table = str.maketrans('', '', string.punctuation)

    clean_text_list = []
    # Mirrors clean_text_list so duplicate checks do not rescan the list.
    seen_texts = set()
    for text in text_list:
        if lowercase:
            text = text.lower()
        if remove_punctuations:
            text = text.translate(punctuation_table)
        if remove_whitespaces:
            # Only runs of the space character are collapsed.
            text = re.sub(' +', ' ', text).strip()
        if stop_word_set is not None:
            text = " ".join(word for word in text.split()
                            if word not in stop_word_set)
        # Skip phrases that became empty or repeat an earlier phrase.
        if text and text not in seen_texts:
            seen_texts.add(text)
            clean_text_list.append(text)

    # token list
    text_tokens = [token
                   for sentence in clean_text_list
                   for token in sentence.split()]

    return clean_text_list, text_tokens

# Define the frequency distribution plotting helper

In [None]:
def plot_frequency_distribution(text_tokens,
                                remove_stopwords = True,
                                exclude_words = None,
                                top_n = 10,
                                title = 'Label distribution'):
    """
    Displays the most frequent values of text_tokens in a bar plot.

    input: list or array of values to count (words or labels)
    params:
    remove_stopwords = accepted for backward compatibility; this function
                       does not use it
    exclude_words    = values to leave out of the counts (None means keep
                       everything)
    top_n            = The top number of results to display
    title            = title shown above the plot
    output: frequency plot (shown with plt.show(); nothing is returned) """
    if exclude_words is not None:
        excluded = set(exclude_words)
        # Leave the input untouched when there is nothing to exclude.
        if excluded:
            text_tokens = [token for token in text_tokens
                           if token not in excluded]

    labels, label_counts = numpy.unique(text_tokens, return_counts = True)
    # Rank by count, highest first; equal counts are ordered by label,
    # also descending, because the (count, label) pairs are compared whole.
    ranked = sorted(zip(label_counts, labels), reverse=True)[:top_n]
    sorted_counts = [count for count, _ in ranked]
    sorted_labels = [label for _, label in ranked]

    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.bar(sorted_labels, sorted_counts)
    plt.xticks(rotation=90)
    plt.show()

# Plot label distribution

In [None]:
# Count how often each sentiment label occurs and show the counts as bars.
plot_frequency_distribution(text_tokens=labels)

# Plot word distribution

In [None]:
# Clean the tweets with the default settings; only the flat token list is
# needed here, so the list of cleaned phrases is discarded.
_, text_tokens = text_cleaner(text_list=phrases)
# Show the most frequent tokens across the cleaned tweets.
plot_frequency_distribution(text_tokens=text_tokens)