In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Every dataset path lives under the same competition input directory.
_input_root = '../input/h-and-m-personalized-fashion-recommendations'

images = f'{_input_root}/images'                    # image directory
train = f'{_input_root}/transactions_train.csv'     # transactions CSV
articles = f'{_input_root}/articles.csv'            # articles CSV
customers = f'{_input_root}/customers.csv'          # customers CSV

# **Article**

In [3]:
# Read the articles CSV into a DataFrame and preview its first five rows.
df_articles = pd.read_csv(filepath_or_buffer=articles)
df_articles.head(n=5)

In [4]:
# Show the column labels of the articles table.
df_articles.columns

In [5]:
# Top 10 : prod_name, product_type_name

def top_10(df:pd.DataFrame, column:str):
    """Plot the ten most frequent values of ``column`` as a bar chart.

    The non-null ``article_id`` entries are counted for every distinct value
    of ``df[column]``; the ten largest counts are drawn as vertical bars with
    tick labels rotated by 90 degrees.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing an ``article_id`` column and the column named by
        ``column``.
    column : str
        Name of the column whose values are counted.

    Returns
    -------
    The object returned by ``DataFrame.plot.bar`` for the chart.
    """
    per_value = df['article_id'].groupby(df[column]).count()
    tally = per_value.reset_index(name='count')
    leaders = tally.sort_values(['count'], ascending=False).head(10)

    axes = leaders.plot.bar(x=column, y='count', rot=90)
    plt.title("Top10")
    return axes

# Chart the ten most common product names in the articles table.
top_10(df=df_articles, column='prod_name')

In [6]:
# Bar chart : product_group_name
# Bars are sorted by frequency, most common value first.

def histplot(df:pd.DataFrame, column:str):
    """Draw a horizontal bar chart of value counts for ``column``.

    Bars are ordered from the most to the least frequent value.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing the column to summarise.
    column : str
        Name of the categorical column whose values are counted.
    """
    f, ax = plt.subplots(figsize = (10,7))
    # value_counts() sorts descending by default, so its index is the bar order.
    order = df[column].value_counts().index
    # Draw on the axes created above instead of relying on the implicit
    # "current axes" of the pyplot state machine.
    sns.countplot(data = df,
                  y = column,
                  order = order,
                  color = 'pink',
                  ax = ax)
    ax.set_xlabel('Count')

# Plot how many articles fall into each product group.
histplot(df=df_articles, column='product_group_name')

In [7]:
# best product by section

def best_prod_by_section(df:pd.DataFrame, section:str):
    """Draw a pie chart of the five most frequent product names in a section.

    Parameters
    ----------
    df : pd.DataFrame
        Table with ``section_name``, ``prod_name`` and ``article_id`` columns.
    section : str
        Value of ``section_name`` to filter on.

    Returns
    -------
    None. When ``section`` does not occur in ``df['section_name']`` the
    message "non-existent section" is printed and nothing is drawn.
    """
    known_sections = df['section_name'].tolist()
    if section not in known_sections:
        print("non-existent section")
        return

    # Count article rows per product name inside the requested section and
    # keep the five largest counts.
    in_section = df[df['section_name'] == section]
    per_product = in_section.groupby('prod_name').count()['article_id']
    top_five = (
        per_product
        .reset_index(name='count')
        .sort_values(['count'], ascending=False)
        .head(5)
    )

    wedge_colors = ['lavender', 'ghostwhite', 'royalblue', 'cornflowerblue', 'lightsteelblue']
    top_five.plot.pie(
        title='Best product by section',
        y='count',
        figsize=(10, 7),
        fontsize=10,
        labels=top_five['prod_name'],
        colors=wedge_colors,
        wedgeprops={'linewidth': 1.5, 'edgecolor': 'white'},
        autopct='%1.0f%%',
    )
    # Anchor the legend just outside the right edge of the axes.
    plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
        

In [8]:
# Pie chart of the top product names in the 'Womens Everyday Basics' section.
best_prod_by_section(df=df_articles, section='Womens Everyday Basics')

# **Customer**

In [9]:
# Read the customers CSV into a DataFrame and preview its first five rows.
df_customer = pd.read_csv(filepath_or_buffer=customers)
df_customer.head(n=5)

In [10]:
# age distribution

import plotly.express as px

def distribution(value:pd.Series):
    """Show an interactive bar chart of how often each value occurs.

    Parameters
    ----------
    value : pd.Series
        Series whose value frequencies are plotted. The chart title and the
        x-axis label are fixed to customer age.
    """
    frequencies = value.value_counts()

    layout_options = {
        'xaxis_title': "Age",
        'yaxis_title': "Frequency",
        'title_x': 0.5,       # centre the title horizontally
        'showlegend': False,
    }

    chart = px.bar(frequencies, title="Age of customers")
    chart.update_layout(**layout_options)
    chart.show()

In [11]:
# Bar chart of customer counts per age.
distribution(value=df_customer['age'])

In [12]:
# Categorize Age

df_customer.loc[df_customer['age'].isna()]  # rows whose age is missing

In [13]:
# Left-closed age intervals: [0, 20) -> '10', [20, 30) -> '20', ..., [70, inf) -> 'over 70'.
# The last edge is np.inf rather than 100: with right=False an upper edge of 100
# would leave ages of 100 and above outside every bin, and they would then be
# relabelled 'unknown' below.
bins= [0,20,30,40,50,60,70,np.inf]
labels = ['10','20','30','40','50','60','over 70']

df_customer['age_group'] = pd.cut(df_customer['age'], bins=bins, labels=labels, right=False)
# pd.cut yields NaN for missing ages; give those rows an explicit 'unknown' category.
df_customer['age_group'] = df_customer['age_group'].cat.add_categories('unknown').fillna('unknown')

# Number of customers in each age group.
sns.countplot(x='age_group',data=df_customer)

In [14]:
# club_member_status by age

# Count customers for every (age_group, club_member_status) pair. Selecting
# customer_id before counting avoids counting every other column as well.
club_member_status_age = df_customer.groupby(['age_group', 'club_member_status'])['customer_id'].count().reset_index()
# One row per age group, one column per membership status.
pivot_df = club_member_status_age.pivot(index = 'age_group', columns = 'club_member_status', values = 'customer_id')
pivot_df.plot(kind="bar")
plt.title("Club Member Status by Age")
plt.xlabel("Age Group")
# The bar heights are customer counts, so label the y axis accordingly.
plt.ylabel("Number of Customers")

In [15]:
# club member status

# Bar heights are the number of customers per club member status.
sns.set_style("darkgrid")
f, ax = plt.subplots(figsize=(10, 5))
# Draw on the axes created above; histplot returns that same axes object.
ax = sns.histplot(x='club_member_status', data=df_customer, color='red', ax=ax)
ax.set(xlabel='Club Member Status')
plt.show()

In [18]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def ratio_by_age(df:pd.DataFrame, column:str):
    """Build a row of donut charts, one per category of ``column``.

    Each donut shows how the ``count`` values of one category are split across
    age groups.

    Parameters
    ----------
    df : pd.DataFrame
        Aggregated table containing ``column``, ``age_group`` and ``count``
        columns.
    column : str
        Name of the categorical column; every distinct value gets its own
        donut.

    Returns
    -------
    The plotly figure holding all donuts.
    """
    column_category = df[column].unique().tolist()

    fig = make_subplots(
        rows = 1,
        cols = len(column_category),
        # Pie traces need 'domain' subplots; build one spec per category so the
        # grid always matches ``cols`` instead of assuming exactly three.
        specs = [[{'type':'domain'} for _ in column_category]],
    )

    for position, category in enumerate(column_category, start=1):
        subset = df[df[column] == category]
        fig.add_trace(
            go.Pie(
                # Labels come from the same rows as the values so the two lists
                # stay aligned even if a category lacks some age groups.
                labels=subset['age_group'].tolist(),
                values=subset['count'].tolist(),
                name=category,
            ),
            1, position,
        )

    fig.update_traces(hole=.4, hoverinfo="label+percent+name")
    fig.update_layout(
        title_text = '{} by Age Group'.format(column)
    )

    # add_trace returns the figure it was called on, so returning ``fig`` hands
    # back the same object without depending on the loop having run.
    return fig

In [19]:
# club member status : Active by age

# Customers per (club_member_status, age_group) pair, then one donut per status.
age_ratio_by_status = (
    df_customer
    .groupby(['club_member_status', 'age_group'])
    .count()['customer_id']
    .reset_index(name='count')
)
ratio_by_age(df=age_ratio_by_status, column='club_member_status')

In [20]:
# NONE => None

# Replace the exact value 'NONE' with 'None' in one vectorised call instead of
# looping over every row in Python.
df_customer['fashion_news_frequency'] = df_customer['fashion_news_frequency'].replace('NONE', 'None')
# Kept as a plain list of the normalised values in case other cells use this name.
fashion_news_frequency_lst = df_customer['fashion_news_frequency'].tolist()

In [21]:
# Keep only the three listed newsletter frequencies, count customers per
# (fashion_news_frequency, age_group) pair, then draw one donut per frequency.
age_ratio_by_frequency = (
    df_customer
    .loc[df_customer['fashion_news_frequency'].isin(['None', 'Regularly', 'Monthly'])]
    .groupby(['fashion_news_frequency', 'age_group'])
    .count()['customer_id']
    .reset_index(name='count')
)
ratio_by_age(df=age_ratio_by_frequency, column='fashion_news_frequency')