In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the colour table from the local data directory and preview the first rows.
color_df = pd.read_csv('data/colors.csv')
color_df.head()

In [None]:
# (rows, columns) of the colour table.
color_df.shape

In [None]:
# Count distinct values in the `name` column, then report it.
unique_color_count = color_df["name"].nunique()
print(f'Number of Unique Colors: {unique_color_count}')

In [None]:
# Number of opaque vs transparent colours, grouped by the `is_trans` flag.
# The groupby is computed once and reused for all three prints.
trans_counts = color_df.groupby('is_trans')['name'].count()
print(trans_counts)

# Positional access must go through .iloc: `counts[0]` on a non-integer index
# relies on a deprecated positional fallback of Series.__getitem__.
# groupby sorts its keys, so position 0 is the smallest flag value.
# NOTE(review): assumes exactly two flag values, with the opaque one sorting
# first — confirm against the data.
print(f"Number of opaque blocks: {trans_counts.iloc[0]}")
print(f"Number of transparent blocks: {trans_counts.iloc[1]}")

In [None]:
# Alternative way to get the per-category counts: value_counts() directly on
# the `is_trans` column (results are ordered by frequency, not by key).
color_df['is_trans'].value_counts()

### LEGO Themes vs LEGO Sets

Walk into a LEGO store and you will see their products organized by theme.
Themes include:
* Star Wars
* Batman
* Harry Potter, etc.


A LEGO set is a particular box of LEGO, i.e. a single product. A single theme can have multiple sets.

In [None]:
# Load the sets table from the local data directory and preview the first rows.
sets_df = pd.read_csv('data/sets.csv')
sets_df.head()

In [None]:
# (rows, columns) of the sets table.
sets_df.shape

In [None]:
# Earliest value in the `year` column. Taking the minimum directly avoids
# sorting and re-indexing the whole frame just to read its first row.
first_year_release = sets_df['year'].min()
print(f"Year of first LEGO set release: {first_year_release}")

In [None]:
# Names of every set whose year equals the first release year, one per line.
released_first_year = sets_df.loc[sets_df['year'] == first_year_release, 'name']
for set_name in released_first_year:
    print(set_name)

In [None]:
# The five sets with the highest part counts (sorted descending by `num_parts`).
sets_by_size = sets_df.sort_values(by='num_parts', ascending=False)
sets_by_size.head(5)

### Number of Sets Published year-on-year

In [None]:
# Non-null `name` entries per year; the column is selected before counting so
# that only that one column is aggregated.
sets_by_year = sets_df.groupby('year')['name'].count()

In [None]:
# Line chart of the number of sets per year.
# NOTE(review): the last two years are sliced off — presumably because their
# data is incomplete; confirm.
fig, ax = plt.subplots(figsize=(14, 10))

yearly_sets = sets_by_year.iloc[:-2]
ax.plot(yearly_sets.index, yearly_sets)

# White tick and label colours (presumably for a dark notebook theme).
ax.tick_params(axis='both', colors='white', labelsize=14, size=14)
ax.set_xlabel('Year', fontsize=14, color='white')
ax.set_ylabel('No. of Sets', fontsize=14, color='white')

### Pandas Aggregate Function

In [None]:
# Distinct `theme_id` values per year, kept as a one-column DataFrame.
themes_by_year = sets_df.groupby('year')[['theme_id']].nunique()

In [None]:
# Preview the per-year unique-theme counts.
themes_by_year.head()

In [None]:
# Line chart of the number of unique themes per year.
# NOTE(review): the last two years are sliced off — presumably because their
# data is incomplete; confirm.
fig, ax = plt.subplots(figsize=(14, 10))

yearly_themes = themes_by_year.iloc[:-2]
ax.plot(yearly_themes.index, yearly_themes)

# White tick and label colours (presumably for a dark notebook theme).
ax.tick_params(axis='both', colors='white', labelsize=14, size=14)
ax.set_xlabel('Year', fontsize=14, color='white')
ax.set_ylabel('No. of Unique Themes', fontsize=14, color='white')

### Superimposing Line Charts with Separate Axes
* i.e., plotting the number of themes and the number of sets in a single plot
* Problem: the two series have very different scales — number of themes (0 to 90) vs. number of sets (0 to 900)
* Solution: use two separate y-axes that share one x-axis in a single plot

In [None]:
# Sets per year and unique themes per year on one figure, using two y-axes
# that share the same x-axis because the two series differ in scale.
fig, ax1 = plt.subplots(figsize=(14, 10))
ax2 = ax1.twinx()  # same x-axis, independent y-axis

ax1.tick_params(axis='both', colors='white', labelsize=14, size=14)
ax2.tick_params(axis='both', colors='white', labelsize=14, size=14)

# NOTE(review): the last two years are sliced off — presumably because their
# data is incomplete; confirm.
sets_lines = ax1.plot(
    sets_by_year.index[:-2], sets_by_year.iloc[:-2],
    color='r', label='unique sets',
)
themes_lines = ax2.plot(
    themes_by_year.index[:-2], themes_by_year['theme_id'].iloc[:-2],
    color='b', label='unique themes',
)

ax1.set_xlabel('Year', color='white', fontsize=14)

# Each y-label is coloured to match its line.
ax1.set_ylabel('Unique Sets', color='r', fontsize=14)
ax2.set_ylabel('Unique Themes', color='b', fontsize=14)

# A plain plt.legend() only collects handles from the current axes (ax2), so
# the sets line would be missing. Gather the handles from both axes and build
# one combined legend on ax2, which is the axes the original legend used.
legend_lines = sets_lines + themes_lines
ax2.legend(legend_lines, [line.get_label() for line in legend_lines], fontsize=18)

### Scatter Plots

In [None]:
# Mean `num_parts` per year, kept as a one-column DataFrame.
parts_pet_set = sets_df.groupby('year')[['num_parts']].mean()
parts_pet_set.head()

In [None]:
# Scatter plot of the average number of parts per set, by year.
# NOTE(review): the last two years are sliced off — presumably because their
# data is incomplete; confirm.
fig, ax = plt.subplots(figsize=(14, 10))

avg_parts = parts_pet_set.iloc[:-2]
ax.scatter(avg_parts.index, avg_parts['num_parts'])

# White tick and label colours (presumably for a dark notebook theme).
ax.tick_params(axis='both', colors='white', labelsize=14, size=14)
ax.set_xlabel('Year', fontsize=14, color='white')
ax.set_ylabel('Avg Parts / set', fontsize=14, color='white')

In [None]:
# Number of sets per theme id, most frequent first.
sets_df['theme_id'].value_counts()

In [None]:
# Load the themes table from the local data directory and preview the first rows.
theme_df = pd.read_csv('data/themes.csv')
theme_df.head()

In [None]:
# (rows, columns) of the themes table.
theme_df.shape

In [None]:
# Ids of every theme row named exactly 'Star Wars' (the name may appear more
# than once, hence a list). .loc selects rows and column in one step, and
# .tolist() converts to native Python ints so the printed list is not cluttered
# with numpy scalar reprs such as `np.int64(...)`.
star_war_ids = theme_df.loc[theme_df['name'] == 'Star Wars', 'id'].tolist()
print(star_war_ids)

In [None]:
# All sets whose theme id is one of the Star Wars theme ids.
is_star_wars = sets_df['theme_id'].isin(star_war_ids)
sets_df[is_star_wars]

In [None]:
# Attach theme information to each set: default (inner) join of the sets'
# `theme_id` onto the themes' `id`.
merge_df = sets_df.merge(theme_df, left_on='theme_id', right_on='id')
merge_df.head()

In [None]:
# Give the merge-suffixed name columns descriptive labels. No columns are
# dropped here; the theme key columns stay in the frame.
column_labels = {"name_x": "set_name", "name_y": "theme_name"}
merge_df = merge_df.rename(columns=column_labels)
merge_df.head()

In [None]:
# Number of sets per theme name, most frequent first.
merge_df['theme_name'].value_counts()

In [None]:
# NOTE(review): this cell is an exact duplicate of the preceding cell and can
# likely be removed.
merge_df['theme_name'].value_counts()

In [None]:
# Bar chart of the themes with the most sets.
data_to_show = 10  # number of top themes to display

# value_counts() is already sorted by frequency; compute it once and take the
# leading entries for both the labels and the bar heights.
top_themes = merge_df['theme_name'].value_counts().head(data_to_show)
theme_names = top_themes.index
theme_counts = top_themes.values

# Create the figure and its axes together. The previous code added the axes to
# a `fig` left over from an earlier cell, so the tick styling never reached the
# bar chart and the cell depended on hidden kernel state.
fig, ax = plt.subplots(figsize=(14, 8))

ax.bar(theme_names, theme_counts)

# White tick and label colours (presumably for a dark notebook theme).
ax.tick_params(axis='both', colors='white', labelsize=14, size=14)
# The x-axis shows theme names, not set names.
ax.set_xlabel('Theme Names', fontsize=14, color='white')
ax.set_ylabel('No. of Sets', fontsize=14, color='white')
