## Data Exploration

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the three pattern exports (with photo), one CSV per rating / clothing split.
four_star, five_star_clothing, five_star_not_clothing = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/patterns_4star.csv',
        'data/patterns_5star_is_clothing.csv',
        'data/patterns_5star_not_clothing.csv',
    )
)

In [4]:
# (rows, columns) of each source frame, in load order
tuple(frame.shape for frame in (four_star, five_star_clothing, five_star_not_clothing))

((79030, 24), (24583, 24), (85911, 24))

In [5]:
# consolidate into one dataframe
# DataFrame.append is deprecated (removed in pandas 2.0) and copied the data
# once per call; pd.concat stacks all three frames in a single pass and, with
# ignore_index=True, produces the same fresh 0..n-1 index.
df = pd.concat(
    [four_star, five_star_clothing, five_star_not_clothing],
    ignore_index=True,
)
df.shape



(189524, 24)

In [6]:
# remove any rows that are really a repeated CSV header
# (their pattern_id cell holds the literal column name)
header_row_labels = df.index[df['pattern_id'] == 'pattern_id']
df = df.drop(index=header_row_labels)

In [7]:
# drop duplicates: rows identical across every column, keeping the first occurrence
df = df.drop_duplicates(subset=None, keep='first')

In [8]:
# replace any "None" strings with NaN
# Pass np.nan explicitly: with value=None, older pandas versions treat the call
# as "no replacement value given" and forward-fill the matched cells from the
# row above instead of marking them as missing.
df = df.replace("None", np.nan)
# not sure if there are any, but want to make sure none there

In [None]:
# remove any rows with too many missing values

# plot nulls percentages and nones

In [70]:
# Columns that should hold numbers; they are cast with pd.to_numeric below.
numeric_columns = [
    "favorites_count",
    "projects_count",
    "difficulty_average",
    "difficulty_count",
    "rating_average",
    "queued_projects_count",
    "rating_count",
    "yardage_max",
    "yardage",
    "gauge_divisor",
    "gauge",
]

# Convert column by column; pd.to_numeric raises if a value cannot be parsed.
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188158 entries, 0 to 189523
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   pattern_id               188158 non-null  object 
 1   name                     188158 non-null  object 
 2   name_permalink           188158 non-null  object 
 3   favorites_count          188158 non-null  int64  
 4   projects_count           188158 non-null  int64  
 5   difficulty_average       188158 non-null  float64
 6   difficulty_count         182439 non-null  float64
 7   rating_average           188158 non-null  float64
 8   queued_projects_count    188158 non-null  int64  
 9   rating_count             188156 non-null  float64
 10  pattern_type_names       188154 non-null  object 
 11  pattern_type_clothing    188154 non-null  object 
 12  photos_url               188158 non-null  object 
 13  pattern_needle_sizes     188158 non-null  object 
 14  patt

In [69]:
# Peek at the gauge column and its dtype
df['gauge']

0         20.0
1         32.0
2          NaN
3         22.0
4         30.0
          ... 
189519    22.0
189520    21.0
189521    11.0
189522    22.0
189523    10.0
Name: gauge, Length: 188158, dtype: object

In [44]:
# Missing-value count per column
# NOTE they are all objects, not numeric - will need to convert
df.isna().sum()

pattern_id                     0
name                           0
name_permalink                 0
favorites_count                0
projects_count                 0
difficulty_average             0
difficulty_count            5719
rating_average                 0
queued_projects_count          0
rating_count                   2
pattern_type_names             4
pattern_type_clothing          4
photos_url                     0
pattern_needle_sizes           0
pattern_attributes             0
yardage_max                70566
yardage                    40425
generally_available            0
gauge                      33039
gauge_divisor              23473
free                           7
downloadable                   4
categories                     6
yarn_weight_description     7330
category_main                  6
category_secondary             6
dtype: int64

In [73]:
# Rows where the `free` flag is missing
free_is_missing = df['free'].isna()
df.loc[free_is_missing]


Unnamed: 0,pattern_id,name,name_permalink,favorites_count,projects_count,difficulty_average,difficulty_count,rating_average,queued_projects_count,rating_count,...,yardage,generally_available,gauge,gauge_divisor,free,downloadable,categories,yarn_weight_description,category_main,category_secondary
17646,366,Union Square Market Pullover,union-square-market-pullover,1241,100,4.8,50.0,4.318182,412,44.0,...,1782.0,2005/08/01 00:00:00 -0400,27.0,4.0,,,"['pullover', 'sweater', 'clothing']",Fingering (14 wpi),"['pullover', 'sweater', 'clothing']","['pullover', 'sweater', 'clothing']"
49005,175965,on-the-go bike basket purse,on-the-go-bike-basket-purse,533,35,3.909091,11.0,4.444444,86,9.0,...,,2010/04/01 00:00:00 -0400,,,,False,"['other-bag', 'bag', 'accessories']",,"['other-bag', 'bag', 'accessories']","['other-bag', 'bag', 'accessories']"
63558,338742,Simple Elegance Wrap,simple-elegance-wrap,42,17,3.0,6.0,4.0,7,6.0,...,660.0,2012/08/12 18:26:21 -0400,6.5,1.0,,,"['shawl-wrap', 'neck-torso', 'accessories']",,"['shawl-wrap', 'neck-torso', 'accessories']","['shawl-wrap', 'neck-torso', 'accessories']"
132998,454740,Puffy Cable Cowl,puffy-cable-cowl,1029,81,2.391304,23.0,4.583333,118,24.0,...,170.0,2013/12/01 00:00:00 -0500,30.0,4.0,,,"['cowl', 'neck-torso', 'accessories']",DK (11 wpi),"['cowl', 'neck-torso', 'accessories']","['cowl', 'neck-torso', 'accessories']"
164144,363584,The Forbidden Forest Cowl,the-forbidden-forest-cowl,44,7,1.833333,6.0,4.6,8,5.0,...,200.0,2012/11/01 00:00:00 -0400,5.5,1.0,,False,"['cowl', 'neck-torso', 'accessories']",Worsted (9 wpi),"['cowl', 'neck-torso', 'accessories']","['cowl', 'neck-torso', 'accessories']"
164189,364082,#14 Diamond Cable Hat,14-diamond-cable-hat,49,7,2.666667,3.0,4.5,7,4.0,...,218.0,2012/11/01 00:00:00 -0400,26.0,4.0,,,"['beanie-toque', 'hat', 'accessories']",,"['beanie-toque', 'hat', 'accessories']","['beanie-toque', 'hat', 'accessories']"
169942,458952,Candy Stripes Fingerless Mitts,candy-stripes-fingerless-mitts,185,13,3.333333,3.0,4.75,32,4.0,...,205.0,2013/12/01 00:00:00 -0500,36.0,4.0,,True,"['fingerless', 'hands', 'accessories']",Fingering (14 wpi),"['fingerless', 'hands', 'accessories']","['fingerless', 'hands', 'accessories']"


In [22]:
# Write the current frame to CSV; index=False leaves the row index out of the file.
df.to_csv('data/consolidated_patterns.csv', index=False)

In [None]:
# UNIVARIATE AND BIVARIATE ANALYSIS! - PLOTS!

In [21]:
df.info()
# 188328 (presumably an earlier row count - TODO confirm what this refers to)
# 188158 after duplicate drop - uh oh

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188158 entries, 0 to 189523
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   pattern_id               188158 non-null  object
 1   name                     188158 non-null  object
 2   name_permalink           188158 non-null  object
 3   favorites_count          188158 non-null  object
 4   projects_count           188158 non-null  object
 5   difficulty_average       188158 non-null  object
 6   difficulty_count         182439 non-null  object
 7   rating_average           188158 non-null  object
 8   queued_projects_count    188158 non-null  object
 9   rating_count             188156 non-null  object
 10  pattern_type_names       188154 non-null  object
 11  pattern_type_clothing    188154 non-null  object
 12  photos_url               188158 non-null  object
 13  pattern_needle_sizes     188158 non-null  object
 14  pattern_attributes  

### From Ravelry Search page:
(too many to download them all)

In [5]:
# Pattern counts noted from the Ravelry search page
knitting_patterns = 665_638
crochet = 435_382
machine_knitting = 4_008
loom_knitting = 4_141
all_patterns = 1_099_169  # this includes crochet, machine and loom knitting as well

knitting_rated_unrated = 469_537

# Choose only knitting and rated and with photo for project page
# plot rated patterns

In [None]:
# PLOT A PIE CHART

In [6]:
# Users
users = 10247493
users_within_5_miles_of_me = 2972
users_with_avatar = 712601

# filter has avatar (with the assumption they would be more active)

#### For knitting 

In [None]:
# Total project count (noted by hand, like the other figures in this section)
projects = 18_113_264

In [10]:
# yarn
# yarn: non-null counts per yarn weight, most common weight first
yarn = (
    df.groupby(['yarn_weight_description'], as_index=False)
    .count()
    .sort_values(by="pattern_id", ascending=False)
)
yarn

# TODO plot, e.g.
# sns.set(font_scale=1.5)
# yarn['yarn_weight_description'].value_counts(sort=True).plot(kind='bar')

Unnamed: 0,yarn_weight_description,pattern_id,name,name_permalink,favorites_count,projects_count,difficulty_average,difficulty_count,rating_average,queued_projects_count,...,pattern_needle_sizes,pattern_attributes,yardage_max,yardage,generally_available,gauge,gauge_divisor,free,downloadable,categories
7,Fingering (14 wpi),44320,44320,44320,44320,44320,44320,42926,44320,44320,...,44320,44320,29887,36871,44320,38150,40381,44318,44319,44319
15,Worsted (9 wpi),33895,33895,33895,33895,33895,33895,32966,33895,33895,...,33895,33895,21210,26329,33895,27898,29143,33894,33895,33895
5,DK (11 wpi),32374,32374,32374,32374,32374,32374,31340,32374,32374,...,32374,32374,21177,25298,32374,28035,29484,32373,32373,32372
1,Aran (8 wpi),19499,19499,19499,19499,19499,19499,18941,19499,19499,...,19499,19499,12093,15816,19499,16933,17628,19499,19499,19499
12,Sport (12 wpi),15368,15368,15368,15368,15368,15368,14842,15368,15368,...,15368,15368,9926,12461,15368,13557,14239,15368,15368,15366
3,Bulky (7 wpi),11766,11766,11766,11766,11766,11766,11462,11766,11766,...,11766,11766,7390,9757,11766,10042,10445,11766,11766,11766
13,Super Bulky (5-6 wpi),7492,7492,7492,7492,7492,7492,7246,7492,7492,...,7492,7492,4456,6373,7492,6333,6749,7492,7492,7492
9,Lace,6970,6970,6970,6970,6970,6970,6759,6970,6970,...,6970,6970,4315,5938,6970,4953,5689,6970,6970,6970
10,Light Fingering,5802,5802,5802,5802,5802,5802,5606,5802,5802,...,5802,5802,4099,4938,5802,5035,5346,5802,5802,5802
0,Any gauge - designed for any gauge,1242,1242,1242,1242,1242,1242,1203,1242,1242,...,1242,1242,704,737,1242,277,900,1242,1242,1241


In [33]:
# pattern category
df['category_main'] = df['categories']
df['category_secondary'] = df['categories']
# category = df.groupby(['categories'], as_index = False).count()
# category

# NEED TO SPLIT THIS OUT 
df['category_main'] = df['category_main'].apply(lambda x: x[-1], None)

TypeError: 'float' object is not subscriptable

In [None]:
# ratings


In [None]:
# NOTE(review): `counts` is not assigned in any cell visible in this notebook, so this
# would raise NameError on a fresh run unless it is defined elsewhere - TODO define it or drop the cell.
counts

In [None]:
# favourites - look at all data for imputing 

In [21]:
# yardage - how long a project will take
yardage = df[['yardage', 'yardage_description', 'yardage_max']]
print(yardage.yardage.unique())
print(yardage.yardage_max.unique())
yardage.yardage_description.unique()  # not displayed: only the cell's last expression is shown

# FOR IMPUTING - FIND AVG YARDAGE FOR CATEGORY
# IMPUTE YARDAGE MAX WITH YARDAGE

# HARD CODE AVERAGES/CATEGORY - or median depending on distribution
# Row-wise mean of the low and high yardage, i.e. the midpoint of the range.
# mean(axis=1) skips NaN, so a row with no yardage_max falls back to its
# yardage, and a row with neither stays NaN. The previous expression
# referenced an undefined name `yardage_max`, subtracted the whole frame, and
# computed half the range width rather than an average.
Ave_yardage = yardage[['yardage', 'yardage_max']].mean(axis=1)

yardage.head(30)

[ 874.  800.  342.  388.  220.  500. 1162.   nan  114. 1150. 1568.  820.
  225.   77.  400.  875.  465.  780.  219.  490.  125.  840.  585. 1107.
  555.  420.  113.  121.  924.   61. 1540. 2300.  981.  681.  392.  360.
  545.  440.  760.  109.  880.  173.   93. 1160.  550.  645.  380.  369.
  277. 1098. 1116.  690.  436.  164. 1050.  275.  660.  280.  190.  720.
  450. 1480. 1200. 1000. 1582. 1330.  919.   90.  218.  350.  100.  110.
  150.]
[  nan 1200.  798. 1826. 1764.  372. 1248.  250. 1752.  765. 1353.  850.
  140.   90. 2420.  872.  700.  763.  880.  175.  346. 1392.  770. 1075.
  400.  608.  450. 3660. 1302.  490. 1100. 1476. 1300.  819.  525.  380.
  900. 1776. 1650. 1250. 2101. 1500. 2280. 1334.]


Unnamed: 0,yardage,yardage_description,yardage_max
0,874.0,874 yards,
1,800.0,800 - 1200 yards,1200.0
2,342.0,342 - 798 yards,798.0
3,388.0,388 yards,
4,220.0,220 yards,
5,500.0,500 yards,
6,1162.0,1162 - 1826 yards,1826.0
7,,yards,
8,114.0,114 yards,
9,1150.0,1150 yards,


In [26]:
# needle sizes
# List the frame's column labels (presumably to locate the needle-size fields - TODO confirm).
df.columns

Index(['pattern_id', 'name', 'name_permalink', 'favorites_count',
       'projects_count', 'difficulty_average', 'difficulty_count',
       'rating_average', 'rating_count', 'pattern_type_id',
       'pattern_type_names', 'pattern_type_clothing', 'photos_url', 'craft_id',
       'url', 'pattern_needle_sizes', 'pattern_attributes', 'yardage_max',
       'yardage', 'yardage_description', 'generally_available', 'published',
       'gauge', 'gauge_pattern', 'gauge_divisor', 'row_gauge', 'free',
       'downloadable', 'categories', 'yarn_weight_description',
       'yarn_weight_id', 'yarn_weight_name', 'yarn_weight_ply',
       'yarn_weight_wpi', 'yarn_weight_knit_gauge'],
      dtype='object')

## With data

In [None]:
# CLEAN!
# plot missing values
# dropna()

In [None]:
# distribution of pattern counts

# averages


In [None]:
# correlation matrix (favourites, take first or last category) after filling NAs

In [None]:
# BOX PLOTS!

In [None]:
# Number of ratings per book
# NOTE(review): 'ISBN' and 'bookRating' are not among the columns that
# df.columns lists in this notebook; they presumably come from the tutorial
# this snippet was adapted from and need swapping for pattern columns - TODO confirm.

# Imported here so the cell does not depend on a later cell having run first.
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

# Ratings per book, capped at 50 so the long tail lands in the last bin.
data = df.groupby('ISBN')['bookRating'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout (title now matches the clip value of 50 used above)
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Book (Clipped at 50)',
                   xaxis = dict(title = 'Number of Ratings Per Book'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [None]:
# try this out (from https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b)
# NOTE(review): 'bookRating' is not among the columns that df.columns lists in this
# notebook; presumably it must be replaced with one of this dataset's rating columns - TODO confirm.

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# Rows per distinct rating value, ordered from the highest rating down.
data = df['bookRating'].value_counts().sort_index(ascending=False)
# One bar per rating value; each bar's label is its share of all rows, in percent.
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout (the title embeds the total row count)
layout = dict(title = 'Distribution Of {} book-ratings'.format(df.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)