In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from wordcloud import STOPWORDS
from wordcloud import WordCloud
#using data from https://www.kaggle.com/datasets/manojvarmalakkamraju/lego-setss
#can be found on https://www.kaggle.com/code/malhardata/lego-sets-eda
# Read the LEGO sets CSV into the working frame and preview its first ten rows.
DATA_PATH = '../input/lego-setss/lego_sets.csv'
df = pd.read_csv(DATA_PATH)
df.head(10)

In [None]:
#Basic info first
# DataFrame.info is a method: it has to be called to print the column summary
# (non-null counts, dtypes, memory usage). Printing the attribute itself only
# shows the bound-method repr. info() writes to stdout and returns None, so it
# is not wrapped in print().
df.info()
# Last expression of the cell: the per-column dtypes, shown as the cell output.
df.dtypes

In [None]:
#AVERAGES ie Mean values, mode, and standard deviation
# Columns treated as categorical / numeric for the summaries below.
cat_list = ['ages', 'review_difficulty','theme_name', 'country']
num_list = ['list_price', 'num_reviews', 'piece_count', 'play_star_rating', 'star_rating', 'val_star_rating']

print("Mean Values")
print(df[num_list].mean())
print("\nStandard Deviations")
print(df[num_list].std())
print("\nMost Common Categoricals")
# mode() yields one row per tied mode; transpose so each column reads as a row.
print(df[cat_list].mode().T)

# First chart: mean piece count and mean list price side by side.
ax = df[['piece_count','list_price']].mean().plot.bar()
ax.set_title('Mean piece count and list price')

# Open a new figure so the ratings chart is not drawn on top of the first one.
plt.figure()
ax = df[['play_star_rating', 'star_rating', 'val_star_rating']].mean().plot.bar(color = 'red')
ax.set_title('Mean star ratings')
# Render the figures explicitly so the cell does not end on an object repr.
plt.show()

In [None]:
#Number of Lego sets by Country
# value_counts() returns the number of rows per country, most frequent first.
data=df['country'].value_counts()
print(data)
# x and y are passed by keyword: seaborn only accepts them positionally in old
# releases (with a warning) and rejects positional x/y in current ones.
sns.barplot(x=data.index, y=data.values, palette='viridis')

In [None]:
#Insert a column to show average part price, and average number of parts per dollar
# The two new columns are the element-wise ratio of list price to piece count
# and its reciprocal.
df['av_part_price'] = df['list_price'].div(df['piece_count'])
df['parts_per_dollar'] = df['piece_count'].div(df['list_price'])

In [None]:
#Average values for each Country
# numeric_only=True restricts the mean to numeric columns. Without it, recent
# pandas versions raise a TypeError when the frame also holds text columns.
data = df.groupby('country').mean(numeric_only=True)
# prod_id is an identifier, so its mean is meaningless; leave it out of the charts.
data = data.drop(columns = ['prod_id'])
fig = plt.figure(figsize = (20,10))
# One subplot per remaining column, laid out on a 3x3 grid (positions are 1-based).
for i, c in enumerate(data.columns, start=1):
    # Rank the countries by this column only; `data` itself is left untouched.
    ranked = data.sort_values(by = c, ascending = False)
    sub = fig.add_subplot(3,3,i)
    sub.set_xlabel(c)
    # Tight y-limits around the observed range so small differences are visible.
    sub.set_ylim((0.999 * ranked[c].min()), (1.001* ranked[c].max()))
    sub.tick_params(labelrotation=90)
    # Draw on the subplot explicitly rather than on whichever axes is current.
    sns.barplot(x = ranked.index, y = ranked[c], palette = 'bright', ax = sub)
fig.tight_layout()

In [None]:
#Number of each theme
# value_counts() returns the number of rows per theme, most frequent first.
data=df['theme_name'].value_counts()
print(data)
# Pie chart of the ten most frequent themes.
plt.figure(figsize = (10,10))
plt.pie(data.head(10), labels = data.head(10).index, colors=sns.color_palette('Pastel2'))
# Bar chart of every theme on a separate, wider figure.
plt.figure(figsize = (30,10))
plt.xticks(rotation = 90)
# x and y are passed by keyword: seaborn only accepts them positionally in old
# releases (with a warning) and rejects positional x/y in current ones.
sns.barplot(x=data.index, y=data.values, palette='coolwarm')



In [None]:
#Average values for each theme
# numeric_only=True restricts the mean to numeric columns. Without it, recent
# pandas versions raise a TypeError when the frame also holds text columns.
data = df.groupby('theme_name').mean(numeric_only=True)
# prod_id is an identifier, so its mean is meaningless; leave it out of the charts.
data = data.drop(columns = ['prod_id'])
fig = plt.figure(figsize = (30,20))
# One subplot per remaining column, laid out on a 3x3 grid (positions are 1-based).
for i, c in enumerate(data.columns, start=1):
    # Take the top ten themes for THIS column from the full table. Rebinding
    # `data` to the truncated result would leave every later column ranking
    # only the ten themes selected for the first column.
    top10 = data.sort_values(by = c, ascending = False).head(10)
    sub = fig.add_subplot(3,3,i)
    sub.set_xlabel(c)
    sub.tick_params(labelrotation=90)
    # Draw on the subplot explicitly rather than on whichever axes is current.
    sns.barplot(x = top10.index, y = top10[c], palette = 'rainbow', ax = sub)
fig.tight_layout()

In [None]:
#Graph of piece count with cost of set
# Scatter of list price against piece count with a fitted regression line;
# the columns are selected by name from the frame.
sns.regplot(data = df, x = 'piece_count', y = 'list_price', color = 'pink')

In [None]:
#Review difficulty by piece count
# Bars show the mean piece count per review-difficulty label, drawn in this
# fixed easiest-to-hardest order.
# NOTE(review): only the four labels listed here are drawn; confirm the data
# has no other difficulty label that should appear.
difficulty_order = ('Very Easy','Easy','Average','Challenging')
sns.barplot(
    data = df,
    x = 'review_difficulty',
    y = 'piece_count',
    order = difficulty_order,
    palette = 'copper',
)

In [None]:
#Wordclouds for both product descriptions
#a) join every description into one long string
#   The text is joined straight from the column values. Series.to_string()
#   would also render the row index labels (and "NaN" for missing rows), and
#   those would then be counted as words.
long_str = " ".join(df['prod_long_desc'].dropna().astype(str))
short_str = " ".join(df['prod_desc'].dropna().astype(str))
#b) clean strings: collapse every run of non-word characters to one space and
#   upper-case so counting is case-insensitive (raw string keeps \W a regex escape)
long_str = re.sub(r'\W+', " ", long_str).upper()
short_str = re.sub(r'\W+', " ", short_str).upper()
long_list = long_str.split()
short_list = short_str.split()
#c) convert word lists to dictionaries of word -> frequency
long_dict = dict(Counter(long_list))
short_dict = dict(Counter(short_list))
#d) since stopwords don't work in generate_from_frequencies, remove them from the dictionaries
stop = ['AND','AT','~', 'S']
stop.extend(w.upper() for w in STOPWORDS)
for w in stop:
    # Remove from BOTH dictionaries independently; a word present in the long
    # descriptions must still be removed from the short ones.
    long_dict.pop(w, None)
    short_dict.pop(w, None)

#e) now we generate wordclouds from the result
# Long descriptions: white background.
wc = WordCloud(background_color="white", colormap = 'prism')
wc.generate_from_frequencies(long_dict)
plt.figure(figsize = (20,15))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.title('Long Description')
plt.show()
# Short descriptions: black background.
wc = WordCloud(background_color="black", colormap = 'Spectral')
wc.generate_from_frequencies(short_dict)
plt.figure(figsize = (20,15))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.title('Short Description')
plt.show()