# Module 7 Handon

# Module Prepare

In [18]:
import pandas as pd
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import tree
from sklearn import naive_bayes
from sklearn import model_selection
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import hstack
import multiprocessing as mp

# (1) Use the Scikit-learn library to run machine learning tasks

## Reading the dataset and then Encode the categorial variable

In [3]:
dataset = pd.read_csv('bank-full.csv',delimiter=';')
dataset = dataset.drop(['age', 'balance', 'day', 'month', 'pdays', 'duration', 'campaign', 
'previous'], axis=1)
le = {}

for col in dataset:
    if dataset[col].dtype == 'object':
        le[col] = preprocessing.LabelEncoder()
        dataset[col] = le[col].fit_transform(dataset[col])

unknown_case = pd.DataFrame.from_dict({'job':'management', 'marital':'married', 
'education':'secondary', 'default': 'no', 'housing': 'yes', 'loan': 'no', 'contact': 'cellular', 
'poutcome':'success'}, orient='index').T

for col in unknown_case:
    if unknown_case[col].dtype == 'object':
        unknown_case[col] = le[col].transform(unknown_case[col])

X = dataset.drop(['y'],axis=1)
y = dataset['y']

## The KNN example (K-Nearest Neighbors)

In [4]:
knn_model = neighbors.KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X,y)
knn_predict = knn_model.predict(unknown_case)
print(knn_predict)

[0]


## The Decision tree Example

In [5]:
dt_model = tree.DecisionTreeClassifier()
dt_model.fit(X,y)
dt_predict = dt_model.predict(unknown_case)
print(dt_predict)

[0]


## The Naive bayes Example

In [6]:
nb_model = naive_bayes.CategoricalNB()
nb_model.fit(X,y)
nb_predict = nb_model.predict(unknown_case)
print(nb_predict)

[1]


# (2) Carryout a cross-validation

## Cross-validation Example (CV)

In [7]:
X_fit, X_blindtest, y_fit, y_blindtest = model_selection.train_test_split(X, y, test_size = 0.1)

precision_cv_score = model_selection.cross_val_score(dt_model, X_fit, y_fit, cv=5, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(dt_model, X_fit, y_fit, cv=5, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(dt_model, X_fit, y_fit, cv=5, scoring='f1_macro').mean()

print('CV: p:{0:.2f} r:{1:.2f} f:{2:.2f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

CV: p:0.75 r:0.58 f:0.60


## Test Example

In [8]:
dt_model.fit(X_fit,y_fit)

precision_test_score = metrics.precision_score(dt_model.predict(X_blindtest), y_blindtest, average='macro')
recall_test_score = metrics.recall_score(dt_model.predict(X_blindtest), y_blindtest, average='macro')
f1_test_score = metrics.f1_score(dt_model.predict(X_blindtest), y_blindtest, average='macro')

print('test: p:{0:.2f} r:{1:.2f} f:{2:.2f}'.format(precision_test_score, recall_test_score, f1_test_score))


test: p:0.56 r:0.74 f:0.58


# (3) Undertake a bug-classification experiment (pg 71-75)

In [9]:
from pre import preprocess

In [16]:
dataset = pd.read_json('embold_train.json')
dataset.loc[dataset['label'] > 0, 'label'] = 1

with mp.Pool(processes=7) as pool:
    cleaned_title = pool.map(preprocess, dataset.title)

with mp.Pool(processes=7) as pool:
    cleaned_body = pool.map(preprocess, dataset.body)

In [38]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))
X_title = vectorizer.fit_transform(cleaned_title)
X_body = vectorizer.fit_transform(cleaned_body)

X = hstack([X_title, X_body])
y = dataset['label']

In [41]:
dt_model = tree.DecisionTreeClassifier()

X_fit, X_blindtest, y_fit, y_blindtest = model_selection.train_test_split(X, y, test_size=0.1)

precision_cv_score = model_selection.cross_val_score(dt_model, X_fit, y_fit, cv=3, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(dt_model, X_fit, y_fit, cv=3, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(dt_model, X_fit, y_fit, cv=3, n_jobs=-2, scoring='f1_macro').mean()

print('CV: p:{0:.2f} r:{1:.2f} f:{2:.2f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

CV: p:0.71 r:0.71 f:0.71


In [40]:
dt_model.fit(X_fit, y_fit)

precision_test_score = metrics.precision_score(dt_model.predict(X_blindtest), y_blindtest, average='macro')
recall_test_score = metrics.recall_score(dt_model.predict(X_blindtest), y_blindtest, average='macro')
f1_test_score = metrics.f1_score(dt_model.predict(X_blindtest), y_blindtest, average='macro')

print('test: p:{0:.2f} r:{1:.2f} f:{2:.2f}'.format(precision_test_score, recall_test_score, f1_test_score))

test: p:0.56 r:0.74 f:0.58
