In [None]:
import pandas as pd
import numpy as np
from io import StringIO

# Data Preprocessing

In [None]:
# Small demo frame with one missing value in column 'c' and one in column 'd'.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
csv_data = {'a':[1.0,5.0,10.0],'b':[2.0,6.0,11.0],'c':[3.0,np.nan,12.0],'d':[4.0,8.0,np.nan]}
df = pd.DataFrame(csv_data)
df

## Handling Missing Data

In [None]:
# Count the missing (NaN) entries in each column
df.isna().sum()

### Dropping Missing Data

In [None]:
# Each call returns a new frame and leaves df untouched; the cell renders only the last result.
df.dropna(axis='index')      # drop rows containing at least one NaN
df.dropna(axis='columns')    # drop columns containing at least one NaN
df.dropna(how='all')         # drop only the rows in which every value is NaN
df.dropna(thresh=4)          # drop rows that have fewer than 4 non-NaN values
df.dropna(subset=['c'])      # drop rows that have a NaN in column 'c'

### Data Imputation

In [None]:
from sklearn.impute import SimpleImputer

# Impute every missing value with the mean of its column.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
imr = SimpleImputer(missing_values=np.nan,strategy='mean')
imr = imr.fit(df.values)                  # learn the per-column means
imputed_data = imr.transform(df.values)   # NaNs replaced by those means

imputed_data

In [None]:
# The same mean imputation, done directly in pandas
df.fillna(value=df.mean())

## Handling Categorical Data

In [None]:
# Toy frame with a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
df = pd.DataFrame(
    {
        'color': ['green','red','blue'],
        'size': ['M','L','XL'],
        'price': [10.1,13.5,15.3],
        'classlabel': ['class2','class1','class2'],
    }
)
df

### Mapping Ordinal Features

In [None]:
# Ordinal rank of each size (larger size -> larger rank), plus the reverse lookup
size_mapping = {'XL':3,'L':2,'M':1}
inv_size_mapping = {rank:label for label,rank in size_mapping.items()}

In [None]:
# Replace the size strings with their ordinal ranks from size_mapping.
# NOTE(review): this overwrites df['size'] in place, so the cell is not idempotent.
# Re-running it would look the already-encoded integers up in size_mapping, find no
# matching keys, and presumably turn the column into NaN. Rebuild df first if you re-run.
df['size'] = df['size'].map(size_mapping)
df

In [None]:
# Decode the ordinal ranks back into the original size strings (df itself is not modified)
df.loc[:, 'size'].map(inv_size_mapping)

### Encoding Class Labels

In [None]:
# Encode each distinct class label as an integer, numbered in order of first appearance,
# and build the reverse lookup that decodes the integers again.
class_mapping = {label:idx for idx,label in enumerate(df['classlabel'].unique())}
inv_class_mapping = {idx:label for label,idx in class_mapping.items()}
# A cell renders only its last expression, so show both mappings together
# instead of silently discarding the first one.
class_mapping, inv_class_mapping

In [None]:
from sklearn.preprocessing import LabelEncoder

# Let scikit-learn derive the integer encoding of the class labels
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y

In [None]:
# Map the integer codes in y back to the original class-label strings
class_le.inverse_transform(y)

### One Hot Encoding
An issue that can arise when encoding nominal data is that classifiers might mistake your encoded values for ones of an ordinal nature. For example,\
if you have the labels Blue, Green, Red and encode them as 1, 2, 3, the classifier would interpret Blue as less than Green and Red as greater than Green.\
Unless this is what you want, a solution is one-hot encoding, where a separate dummy (0/1) column is created for each category, as shown below.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Feature matrix as an array; column 0 holds the nominal 'color' feature
X = df.loc[:, ['color','size','price']].values
color_ohe = OneHotEncoder()
# X[:, [0]] keeps the color column two-dimensional (n_samples x 1) for the encoder
color_ohe.fit_transform(X[:, [0]]).toarray()


In [None]:
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 (color); pass columns 1 and 2 (size, price) through unchanged
c_transf = ColumnTransformer(
    transformers=[
        ('onehot',OneHotEncoder(),[0]),
        ('nothing','passthrough',[1,2]),
    ]
)
c_transf.fit_transform(X).astype(float)

In [None]:
# pandas shortcut: one-hot encode the selected columns in a single call
pd.get_dummies(df.loc[:, ['color','size','price']])

In [None]:
# get_dummies can one-hot encode all string columns and, with drop_first=True, drop the first
# dummy column of each encoded feature to reduce correlation among the dummy variables.
# This is useful, especially in classifiers that use matrix inversion, which can lead to unstable estimates.
pd.get_dummies(df.loc[:, ['color','size','price']],drop_first=True)

In [None]:
# The same can be achieved with OneHotEncoder by dropping the first category
color_ohe = OneHotEncoder(categories='auto',drop='first')

c_transf = ColumnTransformer(
    transformers=[
        ('onehot',color_ohe,[0]),
        ('nothing','passthrough',[1,2]),
    ]
)
c_transf.fit_transform(X).astype(float)

## Test/Train Dataset Partitioning

In [None]:
# UCI wine dataset: a class label followed by 13 numeric features
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine_cols = [
    'Class label','Alcohol','Malic Acid','Ash','Alcalinity of ash',
    'Magnesium','Total phenols','Flavanoids','Nonflavanoid phenols',
    'Proanthocyanins','Color intensity','Hue',
    'OD280/OD315 of diluted wines','Proline',
]
# The file is read without a header row, so the column names are supplied while reading
df_wine = pd.read_csv(url,header=None,names=wine_cols)
df_wine.head()


In [None]:
from sklearn.model_selection import train_test_split

# Column 0 is the class label (target); the remaining columns are the features
y = df_wine.iloc[:,0].values
X = df_wine.iloc[:,1:].values
# Hold out 30% of the samples for testing, stratified on the class labels in y
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=.3,random_state=0,stratify=y
)

## Feature Scaling

### Normalization using MinMax scaling, i.e., rescaling every feature into the range [0, 1]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training data only, then apply that same
# scaling to the test data. Calling fit_transform on X_test would re-fit the scaler
# on the test set, so train and test would be scaled inconsistently and test-set
# statistics would leak into preprocessing.
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

### Standardization (centering each feature at mean 0 and scaling it to unit variance)

In [None]:
from sklearn.preprocessing import StandardScaler

# Estimate the per-feature mean and standard deviation on the training data only,
# then apply the same transformation to both splits
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

## Meaningful Feature Selection

### L1 and L2 Regularization

L1 regularization aims to create sparse weight vectors in which most feature weights are 0. This is useful on high-dimensional datasets with many irrelevant features.
L2 regularization aims to reduce model complexity by penalizing large individual weights, which decreases our model's dependency on the training data and reduces overfitting (variance).

In [None]:
from sklearn.linear_model import LogisticRegression
# L1-penalized, one-vs-rest logistic regression on the standardized wine features
lr = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    multi_class='ovr',
).fit(X_train_std,y_train)
print('Training Accuracy:',lr.score(X_train_std,y_train))
print('Test Accuracy:',lr.score(X_test_std,y_test))

In [None]:
# One intercept per class: with the one-vs-rest (multi_class='ovr') option a separate binary model is fitted for each class.
# From left to right, the values belong to the models for class 1 vs. classes 2&3, class 2 vs. 1&3, and class 3 vs. 1&2.
lr.intercept_

In [None]:
# One weight vector per one-vs-rest model (one row for each entry of lr.intercept_);
# each row holds one weight per training feature (13 for the wine data).
lr.coef_

In [None]:
import matplotlib.pyplot as plt

# Plot how the L1-regularized weights change as the inverse regularization strength C is varied
fig, ax = plt.subplots()
colors = [
    'blue','green','red','cyan','magenta','yellow','black',
    'pink','lightblue','lightgreen','gray','indigo','orange',
]
weights,params = [],[]

# Sweep C over 10^-4 ... 10^5 and record the weight vector of the second one-vs-rest model (coef_[1])
for c in np.arange(-4.,6.):
    lr = LogisticRegression(penalty='l1',C=10.**c,solver='liblinear',multi_class='ovr',random_state=0)
    lr.fit(X_train_std,y_train)
    weights.append(lr.coef_[1])
    params.append(10**c)
weights = np.array(weights)

# One line per feature; column+1 skips the 'Class label' column of df_wine
for column,color in zip(range(weights.shape[1]),colors):
    ax.plot(params,weights[:,column],label=df_wine.columns[column+1],color=color)
ax.axhline(0,color='black',linestyle='--',linewidth=3)
ax.set_xlim([10**(-5),10**5])
ax.set_ylabel('Weight Coefficient')
ax.set_xlabel('C (Inverse Regularization Strength)')
ax.set_xscale('log')
ax.legend(loc='upper left')
# This second call replaces the legend created above and anchors it to the right of the axes
ax.legend(loc='upper center',bbox_to_anchor=(1.38,1.03),ncol=1,fancybox=True)
plt.show()

## Sequential Feature Selection Implementation

Implementing Sequential Backward Selection (SBS), which aims to reduce a K-dimensional feature space to D dimensions (D < K). At each step it checks the model's performance with each remaining feature removed in turn and removes the one feature whose removal has the least impact on performance. It repeats this until K-D features have been removed.

In [None]:
from sklearn.base import clone 
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

class SBS:
    """Sequential Backward Selection.

    Starting from all features, repeatedly drops one feature at a time, always
    keeping the candidate subset that scores best on an internal hold-out split,
    until only ``k_features`` features remain.

    Attributes set by ``fit``:
        indices_ : tuple of column indices of the final feature subset
        subsets_ : every subset visited, from all features down to the final one
        scores_  : hold-out score of each entry in ``subsets_``
        k_score_ : score of the final subset
    """

    def __init__(self,estimator,k_features,scoring=accuracy_score,test_size=.25,random_state=1):
        """Store the settings and keep an unfitted clone of ``estimator``.

        estimator    -- scikit-learn estimator; it is cloned, so the caller's object is not fitted
        k_features   -- number of features at which the elimination stops
        scoring      -- callable ``scoring(y_true, y_pred)`` used to rate a subset
        test_size    -- forwarded to ``train_test_split`` for the internal hold-out split
        random_state -- forwarded to ``train_test_split``
        """
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self,X,y):
        """Run the backward elimination on ``X``/``y`` and return ``self``."""
        X_train,X_test,y_train,y_test = train_test_split(
            X,y,test_size=self.test_size,random_state=self.random_state
        )

        # Start from the full feature set and record its score
        n_kept = X_train.shape[1]
        self.indices_ = tuple(range(n_kept))
        self.subsets_ = [self.indices_]
        self.scores_ = [self._calc_score(X_train,y_train,X_test,y_test,self.indices_)]

        while n_kept > self.k_features:
            n_kept -= 1
            # Every way of leaving exactly one of the current features out
            candidates = list(combinations(self.indices_,r=n_kept))
            candidate_scores = [
                self._calc_score(X_train,y_train,X_test,y_test,subset)
                for subset in candidates
            ]
            # argmax returns the first maximum, so ties go to the earliest candidate
            winner = int(np.argmax(candidate_scores))
            self.indices_ = candidates[winner]
            self.subsets_.append(self.indices_)
            self.scores_.append(candidate_scores[winner])

        self.k_score_ = self.scores_[-1]
        return self

    def transform(self,X):
        """Return only the columns of ``X`` that belong to the selected subset."""
        return X[:, self.indices_]

    def _calc_score(self,X_train,y_train,X_test,y_test,indices):
        """Fit the estimator on the given columns and score its hold-out predictions."""
        model = self.estimator
        model.fit(X_train[:,indices],y_train)
        return self.scoring(y_test,model.predict(X_test[:,indices]))

from sklearn.neighbors import KNeighborsClassifier

# Run backward selection down to a single feature with a 5-nearest-neighbour classifier
knn = KNeighborsClassifier(n_neighbors=5)
sbs = SBS(knn,k_features=1)
sbs.fit(X_train_std,y_train)
# Size of each feature subset visited, aligned with sbs.scores_
k_feat = list(map(len,sbs.subsets_))

# Hold-out accuracy as a function of the number of features kept
plt.plot(k_feat,sbs.scores_,marker='o')
plt.ylim([.7,1.02])
plt.xlabel('Number of Features')
plt.ylabel('Accuracy')
plt.grid()
plt.tight_layout()
plt.show()

In [None]:
# subsets_[10] is the subset left after 10 eliminations, i.e. 3 of the 13 wine features
k3 = list(sbs.subsets_[10])
print(df_wine.columns[1:][k3])

In [None]:
# Baseline: fit KNN on all standardized features and report accuracy on both splits
knn.fit(X_train_std,y_train)
print('Training Accuracy:',knn.score(X_train_std,y_train))
print('Test Accuracy',knn.score(X_test_std,y_test))

In [None]:
# Re-fit KNN using only the feature columns listed in k3 and report accuracy on both splits
knn.fit(X_train_std[:,k3],y_train)
print('Training Accuracy:',knn.score(X_train_std[:,k3],y_train))
print('Test Accuracy',knn.score(X_test_std[:,k3],y_test))

# While the reduced data set performed worse, in real-world applications this model would benefit from being
# easier to interpret, and data collection would be simpler because fewer features need to be collected.
# It's all a balancing act of cost-benefit analysis.

## Assessing Feature Importance via Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the unscaled training data and rank the features by importance
feat_labels = df_wine.columns[1:]
forest = RandomForestClassifier(n_estimators=500,random_state=1)
forest.fit(X_train,y_train)

importances = forest.feature_importances_
# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

for f,idx in enumerate(indices):
    print("%2d) %-*s %f" % (f+1,30,feat_labels[idx],importances[idx]))

# Bar chart of the importances in the same descending order
plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]),importances[indices],align='center')
plt.xticks(range(X_train.shape[1]),feat_labels[indices],rotation=90)
plt.xlim([-1,X_train.shape[1]])
plt.tight_layout()
plt.show()

### Demo of Sklearn's SelectFromModel object, which selects features based on a threshold we define

In [None]:
from sklearn.feature_selection import SelectFromModel

# Keep only the features whose importance in the already-fitted forest meets the 0.1 threshold
sfm = SelectFromModel(forest,threshold=.1,prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion: ',X_selected.shape[1])
# indices is sorted by descending importance, so its leading entries are the selected features
for f,idx in enumerate(indices[:X_selected.shape[1]]):
    print("%2d) %-*s %f" % (f+1,30,feat_labels[idx],importances[idx]))