# Exploratory data analysis (EDA)

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt

from params import get_columns
# The dataset/model variant is selected through the MODEL environment variable
# (None when unset); get_columns returns the four column groups used below.
SELECTED_MODEL = os.environ.get('MODEL')
(
    COLUMNS_NUMERICAL,
    COLUMNS_CATEGORICAL,
    COLUMNS_TEXT,
    COLUMN_Y,
) = get_columns(SELECTED_MODEL)

In [None]:
# Read the preprocessed CSV that belongs to the selected model.
data_path = f'/app/data/out_1_preprocessed_{SELECTED_MODEL}/data.csv'
# Only '\n' is accepted as the record terminator when parsing.
df = pd.read_csv(data_path, lineterminator='\n')
# df.head(5)

### Distribution of column types

In [None]:
# Pie chart: how many text, numerical and categorical columns are configured.
# A group that is None (or empty) contributes a zero-sized slice.
type_names = ['text', 'numerical', 'categorical']
type_groups = (COLUMNS_TEXT, COLUMNS_NUMERICAL, COLUMNS_CATEGORICAL)
type_sizes = [len(group or []) for group in type_groups]

plt.figure(figsize=(4, 4))
plt.pie(type_sizes, labels=type_names, autopct='%1.1f%%', startangle=0)
plt.axis('equal')
plt.title('Distribution of Characteristics Types')
plt.show()

### Distribution of classes

In [None]:
# Pie chart: number of samples per target class.
label_counts = df[COLUMN_Y].value_counts().to_dict()
plt.figure(figsize=(4, 4))
# Materialise the dict views as lists: plt.pie converts its input with
# np.asarray, and a dict_values view is not a sequence NumPy can turn into
# a numeric array.
plt.pie(
    list(label_counts.values()),
    labels=list(label_counts.keys()),
    autopct='%1.1f%%',
    startangle=0,
)
plt.axis('equal')
plt.title('Distribution of Classes')
plt.show()

### Missing values

In [None]:
# Number of missing values per column (ascending), and the same figure
# expressed as a fraction of the total row count.
row_total = len(df)
na_counts = df.isna().sum().sort_values()
percent_counts = na_counts.div(row_total)

In [None]:
# Bar chart: fraction of missing values per column, each bar annotated with
# its percentage.
plt.bar(percent_counts.index, percent_counts, color='skyblue', label='NaN Values')
for i, value in enumerate(percent_counts):
    plt.text(i, value, f'{value:.1%}', ha='center', va='bottom')
# Axis limits follow the data instead of assuming exactly 11 columns and at
# most 35% missing values; the bars are sorted ascending, so fixed limits
# would cut off precisely the columns with the most NaNs. 0.35 is kept as the
# minimum upper bound, and 10% headroom leaves space for the text labels.
plt.ylim(0, max(0.35, percent_counts.max() * 1.1))
plt.xlim(-0.5, len(percent_counts) - 0.5)
plt.xlabel('Columns')
plt.ylabel('Percent')
plt.title('Percentage of NaN Values')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

Values for `is_recommended` could be generated based on similarities in other columns, especially `rating`.<br/>
`review_title` is not necessary, as we should be able to use only `review_text`.<br/>
Rows without `review_text` could be discarded, as they are only 1/1000 of all samples.

### Converting categorical columns to numeric

In [None]:
# Replace every categorical value by an integer code, numbered in the order
# in which the distinct values first appear in the column.
for column in (COLUMNS_CATEGORICAL or []):
    distinct_values = df[column].unique()
    codes = dict(zip(distinct_values, range(len(distinct_values))))
    df[column] = df[column].map(codes)

# Target labels '1-2', '3-4', '5' become 0, 1, 2; any other label maps to NaN.
# NOTE: this mutates df in place, so running the cell a second time maps the
# already-encoded target to NaN.
target_codes = {'1-2': 0, '3-4': 1, '5': 2}
df[COLUMN_Y] = df[COLUMN_Y].map(target_codes)

### Correlations between characteristics

In [None]:
# Heatmap of pairwise correlations between numerical and categorical columns.
if COLUMNS_CATEGORICAL is None or COLUMNS_NUMERICAL is None:
    print('No categorical or numerical columns')
else:
    feature_columns = (COLUMNS_NUMERICAL or []) + (COLUMNS_CATEGORICAL or [])
    # Trim the first row and the last column, then hide everything above the
    # diagonal: what stays visible is the strictly-lower triangle of the full
    # matrix, i.e. each pair once and no self-correlations.
    correlation_matrix = df[feature_columns].corr().iloc[1:, :-1]
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)
    plt.figure(figsize=(7, 5))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
        annot_kws={"size": 10},
        mask=mask,
    )
    plt.title('Correlation heatmap between characteristics')
    plt.show()

`product_id` and `product_name` convey the same characteristic, so either one can be used.<br/>
There is also a noticeable correlation between the length of a review (`review_text_len`) and the length of its title (`review_title_len`).

### Correlations of characteristics with output

In [None]:
# One-row heatmap: correlation of every feature column with the target.
if COLUMNS_CATEGORICAL is None or COLUMNS_NUMERICAL is None:
    print('No categorical or numerical columns')
else:
    columns_with_target = (COLUMNS_NUMERICAL or []) + (COLUMNS_CATEGORICAL or []) + [COLUMN_Y]
    full_correlations = df[columns_with_target].corr()
    # The target is listed last, so the last row holds its correlations;
    # its own column is dropped from that row.
    correlation_matrix = full_correlations.tail(1).drop(columns=COLUMN_Y)
    plt.figure(figsize=(8, 0.5))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
        annot_kws={"size": 10},
    )
    plt.title('Correlation heatmap  with output')
    plt.show()

The characteristics most correlated with the output are the average product rating (`rating`), the number of exclamation marks in a review (`exclamations`) and whether the product was recommended (`is_recommended`).

### Text statistics

In [None]:
# Histograms of the total text length (in characters, summed over all text
# columns) for each class, on a shared log-scaled count axis.
labels = df[COLUMN_Y].unique()
bins = np.arange(0, 7000, 250)

# Total length per row, computed once for the whole frame with one pass per
# column instead of a row-wise DataFrame.apply inside the class loop.
# Missing texts count as length 0. A COLUMNS_TEXT of None is treated as
# "no text columns", matching the `or []` guards used in the other cells.
text_len = pd.Series(0, index=df.index)
for c in (COLUMNS_TEXT or []):
    text_len = text_len + df[c].map(len, na_action='ignore').fillna(0).astype(int)

fig, axes = plt.subplots(1, 3, figsize=(10, 3), sharex=True, sharey=True)
for ax, cls in zip(axes, labels):
    class_len = text_len[df[COLUMN_Y] == cls]
    avg_len = class_len.mean()
    ax.hist(class_len, bins=bins, color='skyblue', edgecolor='black')
    ax.set_yscale('log')
    ax.set_title(f'class: {cls}\navg text length: {avg_len:.2f}')
plt.suptitle('Histograms of text length in each class')
plt.tight_layout()

Review length does not depend on the output class, although the best class (`class 5`) has slightly more long reviews (above 4000 characters).

### Word popularity

In [None]:
# For every class, count the words of all text columns and keep the 30 most
# frequent ones that are neither empty nor stopwords.
labels = df[COLUMN_Y].unique()
most_common = []
stopwords = set(pd.read_csv('/app/data/in/stopwords.csv', header=None)[0].values)
for cls in labels:
    class_rows = df[df[COLUMN_Y] == cls]
    word_counter = Counter()
    for column in COLUMNS_TEXT:
        for text in class_rows[column].dropna():
            for token in text.split(' '):
                # Repair the mis-encoded apostrophe and strip periods and
                # commas before the token is looked up and counted.
                token = token.replace("â€™", "'").replace('.', '').replace(',', '')
                # The stopword check is case-insensitive; the count itself
                # keeps the token's original casing.
                if not token or token.lower() in stopwords:
                    continue
                word_counter[token] += 1
    most_common.append(word_counter.most_common(30))

In [None]:
# Bar charts of the ten most frequent words per class; only drawn when the
# selected model is 'sephora'.
if SELECTED_MODEL == 'sephora':
    fig, axes = plt.subplots(1, 3, figsize=(10, 3), sharey=True)
    for ax, top_words, cls in zip(axes, most_common, labels):
        # Split the (word, count) pairs into two parallel tuples.
        words, counts = zip(*top_words[:10])
        ax.bar(words, counts, color='skyblue')
        tick_positions = range(len(words))
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(words, rotation=90)
        ax.set_title(f'class: {cls}')
    plt.suptitle('Most popular words in each class')
    plt.tight_layout()

In [None]:
# Word clouds built from the per-class word frequencies; only drawn when the
# selected model is 'sephora'.
if SELECTED_MODEL == 'sephora':
    fig, axes = plt.subplots(1, 3, figsize=(10, 3), sharey=True)
    for ax, top_words, cls in zip(axes, most_common, labels):
        frequencies = dict(top_words)
        cloud = WordCloud(width=800, height=400, background_color='white')
        wordcloud = cloud.generate_from_frequencies(frequencies)
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f'class: {cls}')
    plt.suptitle('Most popular words in each class - WordCloud')
    plt.tight_layout()

The most popular words are similar across classes.