# Notebook 1 : Preprocessing the data

In [None]:
import numpy as np
import pandas as pd 
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

## Information about the dataset

In [None]:
# Load the raw dataset, using its 'index' column as the DataFrame index.
df = pd.read_csv("data.csv", index_col="index")
# Preview the first five rows (rendered as the cell output).
df.head(n=5)

In [None]:
# Report the dimensions of the loaded DataFrame as given by ``df.shape``.
frame_shape = df.shape
print(f"Shape: {frame_shape}")

In [None]:
# Count the missing values in each column and show the totals as cell output.
missing_per_column = df.isnull().sum()
missing_per_column

#### No NaN values

In [None]:
# Histogram of the 'genre' column: one bar per genre, height = number of rows.
genre_column = df['genre']
px.histogram(df, x=genre_column)

In [None]:
# Genres whose summary-length distributions are compared.
class_labels = ['fantasy', 'science', 'crime', 'history', 'horror', 'thriller', 'psychology', 'romance', 'sports', 'travel']

# Character length of every summary, computed once and sliced per genre below.
summary_lengths = df['summary'].str.len()

plt.figure(figsize=(8, 6))

# Overlay one filled KDE curve per genre
# (https://seaborn.pydata.org/generated/seaborn.kdeplot.html).
for genre in class_labels:
    genre_mask = df['genre'] == genre
    sns.kdeplot(summary_lengths[genre_mask], fill=True, label=genre)

plt.title("Is a genre notably longer or shorter ? ")
plt.legend()
plt.show()

#### Travel and sports have slightly shorter summaries than the other genres.

## Preprocessing with a function

In [None]:
from preprocessing import preprocess_text

In [None]:
# Run every raw summary through the imported ``preprocess_text`` helper and
# store the result in a new 'processed' column.
raw_summaries = df['summary']
df['processed'] = raw_summaries.apply(preprocess_text)

In [None]:
# Render the first rows of the DataFrame.
preview = df.head()
display(preview)

## Analysis of most common words

In [None]:
from collections import Counter

In [None]:
# Build a corpus-wide word-frequency table from the preprocessed summaries.
join_column = " ".join(df['processed'])
# Split on any run of whitespace: ``split(' ')`` would emit empty-string tokens
# for consecutive spaces or empty summaries and count '' as a word. The
# word-removal helpers in this notebook also tokenize with ``split()``.
split_column = join_column.split()

cnt = Counter(split_column)
# (word, count) pairs sorted from most to least frequent.
most_common = cnt.most_common()
# Show the 20 most frequent words as the cell output.
most_common[:20]

#### Creating a column without the 20 most frequent words to analyse their impact on future results

In [None]:
# The 20 most frequent words of the corpus, most frequent first.
FREQWORDS = [entry[0] for entry in most_common[:20]]


def remove_freqwords(text: str, freq_words: list=FREQWORDS) -> str:
    """Return ``text`` with every word listed in ``freq_words`` removed.

    The text is split on whitespace and the surviving words are joined back
    together with single spaces.
    """
    kept_words = filter(lambda word: word not in freq_words, text.split())
    return " ".join(kept_words)

In [None]:
# Derive a column in which the most frequent words are stripped from the
# preprocessed text.
processed_text = df["processed"]
df["text_wo_freq"] = processed_text.apply(remove_freqwords)

df.head()

## Analysis of the rare words

In [None]:
# Filter out words with a frequency of 1
filtered_words = [word for word, frequency in most_common if frequency == 1]

print(len(filtered_words))

#### Removing all the words that appear only once, because they are not relevant

In [None]:
# Words that occur exactly once in the whole corpus.
RAREWORDS = [word for word, frequency in most_common if frequency == 1]
# Hashed copy of RAREWORDS. Testing membership against the list scans every
# element, and that test runs for every word of every row. RAREWORDS is
# treated as read-only after this point.
_RAREWORDS_LOOKUP = frozenset(RAREWORDS)


def remove_rarewords(text: str, rare_words: list=RAREWORDS) -> str:
    """Return ``text`` with every word listed in ``rare_words`` removed.

    Args:
        text: Whitespace-separated words.
        rare_words: Words to drop; defaults to ``RAREWORDS``.

    Returns:
        The remaining words joined by single spaces.
    """
    # Reuse the prebuilt set for the default list; hash a caller-supplied
    # collection once per call instead of scanning it for every word.
    if rare_words is RAREWORDS:
        lookup = _RAREWORDS_LOOKUP
    else:
        lookup = frozenset(rare_words)
    return " ".join(word for word in text.split() if word not in lookup)

# Derive a column that additionally drops the words listed in RAREWORDS.
text_without_frequent = df["text_wo_freq"]
df["text_wo_freq_rare"] = text_without_frequent.apply(remove_rarewords)

df.head()

## Stemming the words

In [None]:
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance shared by every call to stem_words.
stemmer = PorterStemmer()


def stem_words(text: str) -> str:
    """Return ``text`` with every whitespace-separated word stemmed.

    Each word is passed through the module-level ``stemmer`` and the stems
    are joined back together with single spaces.
    """
    return ' '.join(stemmer.stem(token) for token in text.split())

In [None]:
# Stem the text from which frequent and rare words were already removed.
filtered_text = df["text_wo_freq_rare"]
df["text_stemmed"] = filtered_text.apply(stem_words)
df

## Lemmatize the words

In [None]:
# ``nltk`` itself must be imported here: ``from nltk.corpus import wordnet``
# binds only ``wordnet``, so the download calls below would otherwise raise
# NameError when the notebook is run from top to bottom.
import nltk
from nltk.corpus import wordnet

# Fetch the tagger and tokenizer data packages by their NLTK ids.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') — confirm against the installed version.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet data package by its NLTK id.
nltk.download('wordnet')

# Single lemmatizer instance shared by every call to lemmatize_words.
lemmatizer = WordNetLemmatizer()


def lemmatize_words(text: str) -> str:
    """Return ``text`` with every token lemmatized according to its POS tag.

    The text is tokenized with ``nltk.tokenize.word_tokenize`` and tagged with
    ``nltk.pos_tag``. The first letter of each tag selects the WordNet part of
    speech passed to the lemmatizer, falling back to noun for any other tag.
    The lemmas are joined back together with single spaces.
    """
    # First letter of the tagger's tag -> WordNet part-of-speech constant.
    tag_to_wordnet = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
        'V': wordnet.VERB,
    }

    tokens = nltk.tokenize.word_tokenize(text)

    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        # Tags whose initial is not in the map are lemmatized as nouns.
        wordnet_pos = tag_to_wordnet.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))

    return ' '.join(lemmas)


In [None]:
# Lemmatize the same filtered text that was used as input for stemming.
text_to_lemmatize = df["text_wo_freq_rare"]
df["text_lemmatized"] = text_to_lemmatize.apply(lemmatize_words)
df.head()

In [None]:
# Flatten each column into one list of whitespace-separated tokens.
all_text_w_lemmatizing = [token for text in df["text_lemmatized"] for token in text.split()]
all_text_w_stemming = [token for text in df["text_stemmed"] for token in text.split()]

# Vocabulary sizes. Despite its name, ``n_words_no_stemming`` holds the number
# of unique words in the lemmatized text.
n_words_w_stemming = len(set(all_text_w_stemming))
n_words_no_stemming = len(set(all_text_w_lemmatizing))
vocabulary_size_diff = n_words_no_stemming - n_words_w_stemming

print(f"Number of unique words with lemmatizing: {n_words_no_stemming}")
print(f"Number of unique words with stemming: {n_words_w_stemming}")
print(f"Difference: {vocabulary_size_diff} words")

#### We will keep the stemmed words rather than the lemmatized ones because stemming yields fewer unique words.