# Pandas


In [15]:
import pandas as pd
from io import StringIO
# Build a small in-memory CSV that deliberately contains a few missing values.
csv_data = "\n".join([
    "A,B,C,D",
    "1.0,2.0,3.0,4.0",
    "5.0,6.0,,8.0",
    "10.0,11.0,12.0,",
])
df = pd.read_csv(StringIO(csv_data))
print(df)
# On a larger dataset, df.isnull().sum() gives the number of missing values per column.
missing_per_column = df.isnull().sum()
print(missing_per_column)
# The underlying NumPy array is available through the .values attribute.
print(df.values)

      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN
A    0
B    0
C    1
D    1
dtype: int64
[[ 1.  2.  3.  4.]
 [ 5.  6. nan  8.]
 [10. 11. 12. nan]]


# IMPUTING

In [14]:
# One of the simplest remedies is to drop the entries that contain missing
# values, using dropna ("drop not-a-number").
# A notebook cell renders only its last bare expression, so the row-wise result
# is printed explicitly; otherwise it would be computed and silently discarded.
print(df.dropna(axis=0))  # axis=0: drop every row that contains a NaN
# or
df.dropna(axis=1)  # axis=1: drop every column that contains a NaN

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [18]:

# Removing rows or columns is often not the best choice; the missing values can be imputed instead.
from sklearn.impute import SimpleImputer
import numpy as np
# strategy='mean' replaces each NaN with the mean of its column.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data = imr.fit_transform(df.values)
print(imputed_data)

# An alternative is to do the same column-mean fill directly in pandas.
column_means = df.mean()
pdfill = df.fillna(column_means)
print(pdfill)

[[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [10.  11.  12.   6. ]]
      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   7.5  8.0
2  10.0  11.0  12.0  6.0


# Handling categorical data with pandas

There are two types of categorical features: ordinal and nominal. Ordinal values can be sorted or ordered (e.g. sizes: XL > L > M).
Nominal values do not imply any order (e.g. colors).

In [42]:
# Toy dataset; the 'color' feature does not imply any order.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
print(df)

   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2


For the ordinal features to be interpreted correctly, we need to convert the categorical strings into integers. There is no function that can derive the correct order automatically, so we have to define the mapping manually.

In [43]:
# Hand-written ordinal encoding for 'size': a larger size gets a larger integer.
size_mapping = dict(XL=3, L=2, M=1)

size_encoded = df['size'].map(size_mapping)
df['size'] = size_encoded
print(df)

   color  size  price classlabel
0  green     1   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class2


The mapping is numeric because integers cost less computation and memory than strings; next we define the inverse mapping to get back the original labels.

In [46]:
# Swap keys and values to get the integer -> label lookup, then restore the size strings.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'] = df['size'].map(inv_size_mapping)
print(df)

   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2


# Encoding class labels

Many scikit-learn estimators convert class labels to integers internally; to avoid technical glitches, it is good practice to provide the class labels as integer arrays.

In [47]:
import numpy as np
# Give each distinct class label an integer index, in the order returned by np.unique.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
# Inverse lookup: integer index -> original label.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
# Encode the class labels as integers ...
df['classlabel'] = df['classlabel'].map(class_mapping)
print(df)
# ... and map them back to the original strings.
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
print("reverse\n", df)

   color size  price  classlabel
0  green    M   10.1           1
1    red    L   13.5           0
2   blue   XL   15.3           1
reverse
    color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2


scikit-learn already offers a utility for this: `LabelEncoder`.

In [50]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the class labels and turn them into integer codes in one step.
class_le = LabelEncoder()
class_labels = df['classlabel'].values
y = class_le.fit_transform(class_labels)
print(y)
# inverse_transform turns the integer codes back into the original labels.
inv_y = class_le.inverse_transform(y)
print(inv_y)

[1 0 1]
['class2' 'class1' 'class2']


In [53]:
# Feature matrix as a NumPy array.
feature_columns = ['color', 'size', 'price']
X = df[feature_columns].values
print(X)
# Integer-encode the 'color' column (column 0) and write the codes back into X.
color_le = LabelEncoder()
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes
print(X)

[['green' 'M' 10.1]
 ['red' 'L' 13.5]
 ['blue' 'XL' 15.3]]
[[1 'M' 10.1]
 [2 'L' 13.5]
 [0 'XL' 15.3]]


We have a big problem: a model will now assume that green > blue and red > green. This assumption is incorrect, yet a classifier can still produce useful-looking results, which is exactly the problem, because the mistake goes unnoticed. That is why we now use one-hot encoding.
The idea is to create a new dummy feature for each unique value in the nominal feature column.
For example, blue can be encoded as blue = 1, green = 0, red = 0.

In [59]:
from sklearn.preprocessing import OneHotEncoder
X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
# The encoder expects a 2-D input, so the color column is kept as a single-column slice.
color_column = X[:, :1]
encoded_colors = color_ohe.fit_transform(color_column)
# Convert the encoder's result to a dense array for printing.
result_x = encoded_colors.toarray()
print(result_x)


[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
