# **4.1 Importing required libraries**

In [1]:
import numpy as np
import pandas as pd
from io import StringIO


### **4.1.2 Specifying data**.

In [2]:
# Four-column CSV text (header A-D); two of the data rows leave one field empty.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])


# ***4.2 Reading CSV data and converting it into a NumPy array with the `.values` attribute***

```
# .values
```



In [3]:
# Wrap the CSV text in a file-like buffer so pandas can parse it like a file;
# the empty fields show up as NaN in the result.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
# Display the frame's underlying NumPy array.
# (Tip: `df.isnull()` shows the missing cells as a boolean mask instead.)
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

# ***4.3 Eliminating missing values from our DataFrame using the***

```
# .dropna(axis = 0)
```
method.


In [4]:
# Drop every row that contains a missing value ("index" is the same as axis=0).
df.dropna(axis="index")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [5]:
# Drop every column that contains a missing value ("columns" is the same as axis=1).
df.dropna(axis="columns")

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


### ***4.3.1 Handling categorical data***

**Categorical data encoding with pandas**

In [6]:
# Toy data set: one row per item, with a colour, a size label, a price and a
# class label. The columns are left unnamed here (default integer labels).
toy_rows = [
    ['green', 'M', 10.1, 'class2'],
    ['red', 'L', 13.5, 'class1'],
    ['blue', 'XL', 15.3, 'class2'],
]
df = pd.DataFrame(data=toy_rows)
df

Unnamed: 0,0,1,2,3
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


### **4.3.2 We can label the columns by assigning to the `.columns` attribute**

```
# df.columns = ['color','size','price','classlabel']
```

In [7]:
# Replace the default integer column labels with descriptive names.
column_names = ['color', 'size', 'price', 'classlabel']
df.columns = column_names
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


### ***4.3.3 Mapping ordinal features***

In [8]:
# Ordinal encoding for the size labels: a larger size gets a larger integer.
size_mapping = dict([
    ("XL", 3),
    ("L", 2),
    ("M", 1),
])
size_mapping

{'XL': 3, 'L': 2, 'M': 1}

In [9]:
# Replace each size label with its integer rank from `size_mapping`.
# Guard: `Series.map` turns any value that is not a key of the dict into NaN,
# so running this cell a second time (when the column already holds 1/2/3)
# would wipe the column. Skip the mapping if it has already been applied.
if not df['size'].isin(list(size_mapping.values())).all():
    df['size'] = df['size'].map(size_mapping)
df
 

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


### ***4.3.4 Encoding class labels***

- First, we use `np.unique` to get all the unique values in the given column.
- Then, we create a dictionary `class_mapping`, with the `label` as the key, and the `idx` as the value.
- The `enumerate` function gives us an index along with the element. 

For example, `enumerate(['a','b','c'])` would give us `(0, 'a'), (1, 'b'), (2, 'c')`.

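Swapping each pair so that the label becomes the key and the index becomes the value turns `['a','b','c']` into `{'a': 0, 'b': 1, 'c': 2}`. For comparison, pairing labels with explicit numbers via `zip` works the same way: with `['a','b','c']` and `[1,2,3]`, `dict(zip(['a','b','c'], [1,2,3]))` would give us `{'a':1, 'b':2, 'c':3}`.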


In [10]:
# Build a label -> integer dictionary: each distinct class label gets the
# position it has in the array returned by np.unique.
class_mapping = {}
for idx, label in enumerate(np.unique(df['classlabel'])):
    class_mapping[label] = idx
class_mapping

{'class1': 0, 'class2': 1}

In [11]:
# Replace each class-label string with its integer code from `class_mapping`.
# Guard: `Series.map` yields NaN for values that are not keys of the dict, so
# re-running this cell on an already-encoded column would destroy it. Skip the
# mapping when every value is already one of the integer codes.
if not df['classlabel'].isin(list(class_mapping.values())).all():
    df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


***Here, we are reversing the classlabel to its original string.***

In [12]:
# Invert the label -> integer dictionary so the integer codes can be turned
# back into the original class-label strings.
inv_class_mapping = {idx: label for label, idx in class_mapping.items()}
# Guard: `Series.map` yields NaN for values that are not keys of the dict, so
# decoding a column that already holds the string labels (e.g. when the cell is
# run twice) would erase it. Only decode while integer codes are still present.
if not df['classlabel'].isin(list(inv_class_mapping.values())).all():
    df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


***Integer-encoding the nominal `color` feature with `LabelEncoder` (the first step towards one-hot encoding)***

In [13]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix taken from the three feature columns; because it mixes
# strings and numbers it is a NumPy array of dtype object.
x = df[['color', 'size', 'price']].values

# Replace the colour strings in the first column with integer codes.
# Note: these integers carry no order; they only identify the colour.
color_le = LabelEncoder()
x[:, 0] = color_le.fit_transform(x[:, 0])
x

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

# ***4.4 Partitioning the data into training and testing sets***

***Getting the data***

In [14]:
# Location of the Wine data file; it is read with header=None, so the column
# names are attached separately below.
WINE_DATA_URL = ('https://archive.ics.uci.edu/'
                 'ml/machine-learning-databases/wine/wine.data')

# First column is the class label, followed by the thirteen feature columns.
WINE_COLUMN_NAMES = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

df_wine = pd.read_csv(WINE_DATA_URL, header=None)
df_wine.columns = WINE_COLUMN_NAMES

# Preview the first five rows only.
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


***Perform the actual split***


1.   The first line imports the `train_test_split` function from the `sklearn.model_selection` module.
2. The second line assigns the feature columns of the `df_wine` DataFrame (every column after the first) to `x` and the class labels (the first column) to `y`, both as NumPy arrays.
3.   The third line uses the `train_test_split` function to split the `x` and `y` variables into training and testing sets. The training set comprises 70% of the data, and the testing set comprises 30% of the data. The `test_size` parameter specifies the proportion of the data to be included in the testing set. The `random_state` parameter seeds the random number generator so that the split is reproducible, and `stratify=y` ensures that the training and testing sets have the same class proportions as the full dataset.
4.   The four arrays returned by the split are assigned to the `x_train`, `x_test`, `y_train`, and `y_test` variables, respectively.

> 



In [19]:
from sklearn.model_selection import train_test_split
x, y = df_wine.iloc[:, 1:].to_numpy(), df_wine.iloc[:, 0].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0, stratify=y
)
# Display the full label vector.
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3])