In [2]:
import pandas as pd
import re
import numpy as np

In [3]:
df = pd.read_pickle('ski_pickle3.pkl')

In [4]:
df.shape

(542, 6)

In [5]:
df.columns

Index(['Claimed Weight', 'Core', 'Dimensions', 'Length', 'Price',
       'Turn Radius'],
      dtype='object')

In [6]:
df.get_dtype_counts()

float64    1
object     5
dtype: int64

In [7]:
df.isnull().sum()

Claimed Weight    28
Core               8
Dimensions         4
Length            17
Price              0
Turn Radius        3
dtype: int64

In [8]:
df = df.dropna()

In [9]:
df.reset_index(drop=True, inplace=True)

In [10]:
df.shape

(494, 6)

In [18]:
df

Unnamed: 0,Claimed Weight,Core,Dimensions,Length,Price,Turn Radius
0,2lb 14oz,balsa wood,141 / 112 / 128mm,168.0,$879.20,15m
1,6lb 12oz,"aspen wood, carbon fiberglass","126 / 96 / 108mm,",163.0,$486.46,"17m,"
2,6lb 12oz,"aspen wood, carbon fiberglass",135 / 102 / 115mm,188.0,$486.46,20m
3,"4 lb 6 oz,","paulownia, Carbon Drive Technology",116 / 85 / 99.5 mm,164.0,$419.97,19 m
4,4 lb 12 oz,"paulownia, Carbon Drive Technology",116 / 85 / 99.5 mm,171.0,$419.97,19 m
5,4lb 5oz,"carbon, wood","110 / 78 / 99.5mm,",151.0,$524.95,"14m,"
6,4lb 5oz,"carbon, wood","112 / 78 / 101.5mm,",163.0,$524.95,"16m,"
7,4lb 5oz,"carbon, wood","113 / 78 / 102.5mm,",169.0,$524.95,"17m,"
8,5lb 12oz,"paulownia wood, carbon",126 / 96 / 114mm,176.0,$489.96,21m
9,6lb 6oz,"3D-shaped wood (poplar, paulownia), triaxial c...",130 / 101 / 118mm,175.0,$561.71,14 - 19m


In [25]:
#Functions for converting string to usable data
def weight_convert(claimed_weight):
    try:
        weight = 0
        weight += float(re.search(r'(\d\d|\d)[^\d]*?lb', claimed_weight).group(1))
        #print(weight)
        oz_string = re.search(r'(\d+).*?lb.*?([\d.]+).*?oz', claimed_weight)
        if oz_string :
            weight += (float(oz_string.group(2)) / 16)
        if weight < 15:
            return weight
        else:
            return None
    except:
        return None

def radius_convert(radius_string):
    #print(radius_string)
    try:
        numbers_list = re.split('[^\d\w\s.]', radius_string)
        #print(numbers_list)
        clean_numbers = [re.sub(r'[^\d.]', '', number) for number in numbers_list]
        #print(clean_numbers)
        radii = list(filter(lambda _ : len(_) > 0 and float(_) < 100, clean_numbers))
        float_radii = [float(radius) for radius in radii]
        avg_radius = np.mean(np.asarray(float_radii))
        if avg_radius < 40:
            return np.mean(np.asarray(float_radii))
        else:
            return None
    except:
        return None

def get_tip(dim_string):
    try:
        numbers_list = re.split('[-,/]', dim_string)
        just_numbers = [re.sub(r'[^\d.]', '', number) for number in numbers_list]
        clean_numbers = [number if len(number) <= 3 else number[-3:] for number in just_numbers]
        if float(clean_numbers[0]) > 50:
            return float(clean_numbers[0])
        else:
            return None
    except:
        return None

def get_underfoot(dim_string):
    try:
        numbers_list = re.split('[-,/]', dim_string)
        just_numbers = [re.sub(r'[^\d.]', '', number) for number in numbers_list]
        clean_numbers = [number if len(number) <= 3 else number[-3:] for number in just_numbers]
        if float(clean_numbers[1]) > 50:
            return float(clean_numbers[1])
        else:
            return None
    except:
        return None
    
def get_tail(dim_string):
    try:
        numbers_list = re.split('[-,/]', dim_string)
        just_numbers = [re.sub(r'[^\d.]', '', number) for number in numbers_list]
        clean_numbers = [number if len(number) <= 3 else number[-3:] for number in just_numbers]
        if float(clean_numbers[2]) > 50:
            return float(clean_numbers[2])
        else:
            return None
    except: 
        return None 

def price_convert(price_string):
    price_numbers = re.sub(r'[^\d.]', '', price_string)
    try:
        return float(price_numbers) 
    except:
        return None

def core_convert(core_string):
    cores = re.split(r',', core_string)
    cores = [core.strip() for core in cores]
    return cores
    


In [38]:
df_clean = df.copy()
df_clean['Claimed_Weight'] = df_clean['Claimed Weight'].apply(weight_convert)
df_clean['Real_Price'] = df_clean['Price'].apply(price_convert)
df_clean['Turn_Radius'] = df_clean['Turn Radius'].apply(radius_convert)
df_clean['Tip_Width'] = df_clean['Dimensions'].apply(get_tip)
df_clean['Width_Underfoot'] = df_clean['Dimensions'].apply(get_underfoot)
df_clean['Tail_Width'] = df_clean['Dimensions'].apply(get_tail)

df_clean = df_clean.drop(['Claimed Weight', 'Price', 'Turn Radius', 'Dimensions'], axis = 1)

  out=out, **kwargs)


In [54]:
#Almost all the skis have one or more of these

master_core_list = ['carbon', 'glass', 'kevlar', 'flax', 'poplar', 'ash', 'beech', 'wood', 'paulownia',
                    'balsa', 'aspen', 'maple', 'caruba', 'titanium', 'aluminum', 'bamboo', 'titanal']
cores_dict = {}
for core in master_core_list:
    cores_dict[core] = [1 if core in core_string.lower() else 0 for core_string in df_clean['Core']]

In [64]:
for core in cores_dict:
    print(core)
    print(np.asarray(cores_dict[core]).sum())

carbon
206
glass
120
kevlar
15
flax
11
poplar
198
ash
32
beech
57
wood
228
paulownia
120
balsa
26
aspen
49
maple
26
caruba
5
titanium
7
aluminum
16
bamboo
39
titanal
76


In [61]:
print(df_clean.shape)
print(cores_df.shape)

(494, 8)
(494, 17)


In [None]:
cores_df = pd.DataFrame(cores_dict)
cores_df

In [63]:
df_clean = pd.concat([df_clean, cores_df], axis=1)
df_clean.head()

Unnamed: 0,Core,Length,Claimed_Weight,Real_Price,Turn_Radius,Tip_Width,Width_Underfoot,Tail_Width,carbon,glass,...,wood,paulownia,balsa,aspen,maple,caruba,titanium,aluminum,bamboo,titanal
0,balsa wood,168.0,2.875,879.2,15.0,141.0,112.0,128.0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,"aspen wood, carbon fiberglass",163.0,6.75,486.46,17.0,126.0,96.0,108.0,1,1,...,1,0,0,1,0,0,0,0,0,0
2,"aspen wood, carbon fiberglass",188.0,6.75,486.46,20.0,135.0,102.0,115.0,1,1,...,1,0,0,1,0,0,0,0,0,0
3,"paulownia, Carbon Drive Technology",164.0,4.375,419.97,19.0,116.0,85.0,,1,0,...,0,1,0,0,0,0,0,0,0,0
4,"paulownia, Carbon Drive Technology",171.0,4.75,419.97,19.0,116.0,85.0,,1,0,...,0,1,0,0,0,0,0,0,0,0


In [69]:
df_clean.iloc[162]

Core               caruba
Length                188
Claimed_Weight          6
Real_Price         948.95
Turn_Radius          24.3
Tip_Width             128
Width_Underfoot        98
Tail_Width            117
carbon                  0
glass                   0
kevlar                  0
flax                    0
poplar                  0
ash                     0
beech                   0
wood                    0
paulownia               0
balsa                   0
aspen                   0
maple                   0
caruba                  1
titanium                0
aluminum                0
bamboo                  0
titanal                 0
Name: 162, dtype: object

In [70]:
df_clean.isnull().sum()

Core                0
Length              0
Claimed_Weight      1
Real_Price          0
Turn_Radius         3
Tip_Width           9
Width_Underfoot     2
Tail_Width         19
carbon              0
glass               0
kevlar              0
flax                0
poplar              0
ash                 0
beech               0
wood                0
paulownia           0
balsa               0
aspen               0
maple               0
caruba              0
titanium            0
aluminum            0
bamboo              0
titanal             0
dtype: int64

In [71]:
df_clean = df_clean.dropna()

In [72]:
df_clean.shape

(467, 25)

In [73]:
df_clean.to_pickle('ski_pickle_clean.pkl')