In [None]:
from typing import Union, Optional
import numpy as np
import pandas as pd
from sklearn import datasets
%matplotlib widget
import matplotlib as mpl
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from NiaPy.util import groupdatabylabel
# Additional options
jtplot.style()
# sns.set_style("whitegrid"), sns.set_context("talk")
pd.set_option('display.max_columns', None)

In [None]:
def getdims(d: Union[np.ndarray, list], n: int = 0) -> None:
   """Print the length of every list/array element of ``d``.

   For each element ``e`` at index ``i`` of ``d`` that is a ``list`` or
   ``np.ndarray`` one line ``n:i -> len(e)`` is printed. Elements of any other
   type are skipped. The function recurses into ``e`` (with ``n + 1``) only
   when both ``e[0]`` and ``e[0][0]`` are themselves lists/arrays.

   Args:
      d: Sequence whose elements are inspected.
      n: Prefix printed in front of the element index; incremented on recursion.
   """
   for i, e in enumerate(d):
      if not isinstance(e, (list, np.ndarray)): continue
      print('%d:%d -> %d' % (n, i, len(e)))
      # Empty containers have no first element; indexing them raised IndexError before.
      if len(e) == 0: continue
      head = e[0]
      if isinstance(head, (list, np.ndarray)) and len(head) > 0 and isinstance(head[0], (list, np.ndarray)): getdims(e, n + 1)
		 
def mplot(data: pd.DataFrame, clabel: str, ax: Optional[mpl.axes.Axes] = None) -> None:
   """Draw per-class strip and point plots of every measurement column.

   ``data`` is melted into long form with ``clabel`` as the identifier column;
   all remaining columns become rows of a ``measurement`` / ``value`` pair.
   A strip plot of the individual values is drawn first and a point plot is
   drawn on top of it, both coloured by ``clabel``.

   Args:
      data: Table holding the measurement columns and the ``clabel`` column.
      clabel: Name of the column used as hue (class label).
      ax: Axes to draw on. A new figure and axes are created when ``None``.
   """
   d = pd.melt(data, clabel, var_name="measurement")
   if ax is None: _, ax = plt.subplots()
   # Every seaborn call targets ``ax`` explicitly; previously they drew on the
   # current axes, so a caller-supplied ``ax`` was ignored by the plots.
   sns.despine(ax=ax, bottom=True, left=True)
   sns.stripplot(x="value", y="measurement", hue=clabel, data=d, dodge=True, jitter=True, alpha=.25, zorder=1, ax=ax)
   sns.pointplot(x="value", y="measurement", hue=clabel, data=d, dodge=.532, join=False, palette="dark", markers="d", scale=.75, ci=None, ax=ax)
   handles, labels = ax.get_legend_handles_labels()
   noc = len(data[clabel].unique())
   # Only the entries after the first ``noc`` are shown (presumably the point-plot
   # entries, so that each class appears once in the legend).
   ax.legend(handles[noc:], labels[noc:], title=clabel, handletextpad=0, columnspacing=1, loc="lower right", ncol=noc, frameon=True)

# Generated Data

## Example

In [None]:
# Fixed seed so that the generated blobs (and the saved figure) are reproducible across runs.
X, y = datasets.make_blobs(random_state=1)
fig, ax = plt.subplots(1, 1, figsize=(6, 5))
for e in np.unique(y):
   mask = y == e  # boolean row selector for the samples of class e
   ax.scatter(X[mask, 0], X[mask, 1], label='Class %d' % e)
ax.set_xlabel('$a_1$'); ax.set_ylabel('$a_2$')
ax.legend()
fig.tight_layout()
fig.savefig("clusterExample.pdf", bbox_inches='tight')

In [None]:
# Features and labels side by side in one table; the labels form the last column.
gdata = pd.DataFrame(
   np.column_stack((X, y)),
   columns=['a1', 'a2', 'Class'],
)
gdata

In [None]:
lt = LabelEncoder().fit(y)
# A bare expression in the middle of a cell is not displayed, so print the classes explicitly.
print(lt.classes_)
# groupdatabylabel is a NiaPy helper (presumably groups the rows of X by label - confirm);
# getdims prints the length of each nested element of its result.
d = groupdatabylabel(X, y, lt)
getdims(d)

In [None]:
# Per-class overview of both generated attributes.
mplot(gdata, clabel='Class')

# The Iris Dataset

## Attribute description
1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
5. class:

## Class attribute
* 0: Iris Setosa
* 1: Iris Versicolour
* 2: Iris Virginica

In [None]:
# Load the Iris data, keep the raw arrays in X / y and build one table out of both.
iris = datasets.load_iris()
X, y = iris.data, iris.target
iris = pd.DataFrame(
   np.column_stack((X, y)),
   columns=['Sepal length', 'Sepal width', 'Petal length', 'Petal width', 'Species'],
)
iris

In [None]:
# Compute the summary statistics once; export them as a LaTeX table and display them.
iris_stats = iris.describe()
with open('irisStat.tex', 'w') as file: file.write(iris_stats.to_latex())
iris_stats

In [None]:
# Fit a label encoder on the targets; the last expression displays its classes.
lt = LabelEncoder()
lt.fit(y)
lt.classes_

In [None]:
# Result of the NiaPy helper is stored in d; getdims prints the length of its nested elements.
d = groupdatabylabel(X, y, lt)
getdims(d, n=0)

In [None]:
# Per-species overview of all four measurements.
mplot(iris, clabel='Species')

# The Cancer Wisconsin Dataset

## Attribute information
1. ID number
2. Diagnosis (M = malignant, B = benign)
3. Ten real-valued features are computed for each cell nucleus:
  1. radius (mean of distances from center to points on the perimeter)
  2. texture (standard deviation of gray-scale values)
  3. perimeter
  4. area
  5. smoothness (local variation in radius lengths)
  6. compactness (perimeter^2 / area - 1.0)
  7. concavity (severity of concave portions of the contour)
  8. concave points (number of concave portions of the contour)
  9. symmetry 
  10. fractal dimension ("coastline approximation" - 1)

## Class attribute
* 0: Malignant (cancer)
* 1: Benign (no cancer)

In [None]:
# Load the breast cancer data, keep the raw arrays in X / y and build one table out of both.
cwd = datasets.load_breast_cancer()
X, y = cwd.data, cwd.target
# Ten base measurement names repeated for the groups 1, 2 and 3, followed by the class column.
columns = [
   '%s %d' % (name, group)
   for group in (1, 2, 3)
   for name in ('radius', 'texture', 'perimeter', 'area', 'smoothness', 'compactness', 'concavity', 'concave points', 'symmetry', 'fractal dimension')
] + ['Diagnosis']
cwd = pd.DataFrame(np.column_stack((X, y)), columns=columns)
cwd

In [None]:
# Compute the summary statistics once; export them as a LaTeX table and display them.
cwd_stats = cwd.describe()
with open('cancerStat.tex', 'w') as file: file.write(cwd_stats.to_latex())
cwd_stats

In [None]:
# Fit a label encoder on the targets; the last expression displays its classes.
lt = LabelEncoder()
lt.fit(y)
lt.classes_

In [None]:
# Result of the NiaPy helper is stored in d; getdims prints the length of its nested elements.
d = groupdatabylabel(X, y, lt)
getdims(d, n=0)

In [None]:
# Per-diagnosis overview of all measurement columns.
mplot(cwd, clabel='Diagnosis')

# The Wine Dataset

## Attribute description
1. Alcohol
2. Malic acid
3. Ash
4. Alcalinity of ash
5. Magnesium
6. Total phenols
7. Flavanoids
8. Nonflavanoid phenols
9. Proanthocyanins
10. Color intensity
11. Hue
12. OD280/OD315 of diluted wines
13. Proline 

## Class attribute
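* 0: class_0 (wines of the first cultivar)
* 1: class_1 (wines of the second cultivar)
* 2: class_2 (wines of the third cultivar)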

In [None]:
# Load the Wine data, keep the raw arrays in X / y and build one table out of both.
wine = datasets.load_wine()
X, y = wine.data, wine.target
wine = pd.DataFrame(
   np.column_stack((X, y)),
   columns=[
      'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
      'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
      'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline',
      'Class',
   ],
)
wine

In [None]:
# Compute the summary statistics once; export them as a LaTeX table and display them.
wine_stats = wine.describe()
with open('wineStat.tex', 'w') as file: file.write(wine_stats.to_latex())
wine_stats

In [None]:
# Fit a label encoder on the targets; the last expression displays its classes.
lt = LabelEncoder()
lt.fit(y)
lt.classes_

In [None]:
# Result of the NiaPy helper is stored in d; getdims prints the length of its nested elements.
d = groupdatabylabel(X, y, lt)
getdims(d, n=0)

In [None]:
# Per-class overview of all measurement columns.
mplot(wine, clabel='Class')

# The Glass Dataset

## Attribute information
1. RI: refractive index
2. Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
3. Mg: Magnesium
4. Al: Aluminum
5. Si: Silicon
6. K: Potassium
7. Ca: Calcium
8. Ba: Barium
9. Fe: Iron
10. Type of glass: (class attribute)

## Class attribute
Type of glass:
* 1: building_windows_float_processed
* 2: building_windows_non_float_processed
* 3: vehicle_windows_float_processed
* 4: vehicle_windows_non_float_processed (none in this database)
* 5: containers
* 6: tableware
* 7: headlamps

In [None]:
# Read the Glass dataset from 'glass.csv' (path is relative to the working directory)
# and display the resulting table.
glass = pd.read_csv('glass.csv')
glass

In [None]:
# Compute the summary statistics once; export them as a LaTeX table and display them.
glass_stats = glass.describe()
with open('glassStat.tex', 'w') as file: file.write(glass_stats.to_latex())
glass_stats

In [None]:
# Fit a label encoder on the values of the last column; the last expression displays its classes.
lt = LabelEncoder()
lt.fit(glass.iloc[:, -1].values)
lt.classes_

In [None]:
# First argument: every column except the last; second argument: the last column.
d = groupdatabylabel(
   glass.iloc[:, :-1].values,
   glass.iloc[:, -1].values,
   lt,
)
getdims(d, n=0)

In [None]:
# Per-type overview of all measurement columns.
mplot(glass, clabel='Type')

# The Contraceptive Method Choice Dataset

## Attribute information
1. Wife's age                     (numerical)
2. Wife's education               (categorical)      1=low, 2, 3, 4=high
3. Husband's education            (categorical)      1=low, 2, 3, 4=high
4. Number of children ever born   (numerical)
5. Wife's religion                (binary)           0=Non-Islam, 1=Islam
6. Wife's now working?            (binary)           0=Yes, 1=No
7. Husband's occupation           (categorical)      1, 2, 3, 4
8. Standard-of-living index       (categorical)      1=low, 2, 3, 4=high
9. Media exposure                 (binary)           0=Good, 1=Not good
10. Contraceptive method used     (class attribute)

## Class attribute
Contraceptive method used:
* 1: No-use
* 2: Long-term
* 3: Short-term

In [None]:
# Read the Contraceptive Method Choice dataset from 'cmc.csv' (path is relative to the
# working directory) and display the resulting table.
cmc = pd.read_csv('cmc.csv')
cmc

In [None]:
# Compute the summary statistics once; export them as a LaTeX table and display them.
cmc_stats = cmc.describe()
with open('cmcStat.tex', 'w') as file: file.write(cmc_stats.to_latex())
cmc_stats

In [None]:
# Fit a label encoder on the last column; the last expression displays its classes.
lt = LabelEncoder()
lt.fit(cmc.iloc[:, -1])
lt.classes_

In [None]:
# First argument: every column except the last; second argument: the last column.
d = groupdatabylabel(
   cmc.iloc[:, :-1].values,
   cmc.iloc[:, -1].values,
   lt,
)
getdims(d, n=0)

In [None]:
# NOTE(review): the column name starts with a space - presumably it mirrors the
# header as written in cmc.csv; confirm before normalising it.
mplot(cmc, ' Contraceptive method used')

# Covertype dataset

## Attribute information
1. Elevation / quantitative /meters / Elevation in meters
2. Aspect / quantitative / azimuth / Aspect in degrees azimuth
3. Slope / quantitative / degrees / Slope in degrees
4. Horizontal_Distance_To_Hydrology / quantitative / meters / Horz Dist to nearest surface water features
5. Vertical_Distance_To_Hydrology / quantitative / meters / Vert Dist to nearest surface water features
6. Horizontal_Distance_To_Roadways / quantitative / meters / Horz Dist to nearest roadway
7. Hillshade_9am / quantitative / 0 to 255 index / Hillshade index at 9am, summer solstice
8. Hillshade_Noon / quantitative / 0 to 255 index / Hillshade index at noon, summer solstice
9. Hillshade_3pm / quantitative / 0 to 255 index / Hillshade index at 3pm, summer solstice
10. Horizontal_Distance_To_Fire_Points / quantitative / meters / Horz Dist to nearest wildfire ignition points
11. Wilderness_Area (4 binary columns) / qualitative / 0 (absence) or 1 (presence) / Wilderness area designation
12. Soil_Type (40 binary columns) / qualitative / 0 (absence) or 1 (presence) / Soil Type designation

## Class attribute
* Cover_Type (7 types) / integer / 1 to 7 / Forest Cover Type designation

In [None]:
# Fetch the Covertype data, keep the raw arrays in X / y and build one table out of both.
covtype = datasets.fetch_covtype()
X, y = covtype.data, covtype.target
covtype = pd.DataFrame(
   np.column_stack((X, y)),
   columns=[
      'Elevation', 'Aspect', 'Slope',
      'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
      'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
      'Horizontal_Distance_To_Fire_Points',
      # 4 wilderness-area columns and 40 soil-type columns, numbered from 1.
      *('Wilderness_Area_%d' % i for i in range(1, 5)),
      *('Soil_Type_%d' % i for i in range(1, 41)),
      'Cover_Type',
   ],
)
covtype

In [None]:
# Compute the summary statistics once (the table has 55 columns, so describing it
# twice is wasteful); export them as a LaTeX table and display them.
covtype_stats = covtype.describe()
with open('covtypeStat.tex', 'w') as file: file.write(covtype_stats.to_latex())
covtype_stats

In [None]:
# Fit a label encoder on the targets; the last expression displays its classes.
lt = LabelEncoder()
lt.fit(y)
lt.classes_

In [None]:
# Result of the NiaPy helper is stored in d; getdims prints the length of its nested elements.
d = groupdatabylabel(X, y, lt)
getdims(d, n=0)

In [None]:
# Per-cover-type overview of all 54 measurement columns.
# NOTE(review): mplot melts every measurement column of every row before plotting;
# this is presumably slow on the full dataset - consider plotting a sample.
mplot(covtype, 'Cover_Type')