### Part 1: Build a classification model using text data
In part one of the homework, you will solve a text classification task.
You can download the following data-sets from the HW data folder on the course
website:
HW4_Text_train_data.csv and HW4_text_test_data.csv
The data consists of reviews from a women’s fashion online shop; each record contains
the review text and whether the review author would recommend the product.
We are trying to determine whether a reviewer will recommend a product or not based
on each review.
In a real application this might allow us to find out what is good or bad about certain
products or to feature more typical reviews (like a very critical and a very positive one).
Use cross-validation to evaluate the results. Use a metric that’s appropriate
for imbalanced classification (AUC or average precision for example), and inspect all
models by visualizing the coefficients.

In [None]:
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import StratifiedKFold

from sklearn.feature_extraction.text import CountVectorizer

from sklearn import svm

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

In [None]:
## import data: the train and test splits are stored as separate CSV files
df_train = pd.read_csv('HW4_Text_train_data.csv')
df_test = pd.read_csv('HW4_Text_test_data.csv')

In [None]:
##check the data format
print(df_train.shape)  # (n_rows, n_columns) of the training frame
print(df_train.head())  # first rows, to eyeball the columns
# Distinct values of the second column; being the last expression of the
# cell, the resulting set is what the notebook displays.
set(df_train.iloc[:, 1]) ##binary outcome

In [None]:
# Column 0 is used as the model input and column 1 as the target
# (presumably review text and recommendation flag -- see the task description).
df_train_X, df_train_y = df_train.iloc[:, 0], df_train.iloc[:, 1]
df_test_X, df_test_y = df_test.iloc[:, 0], df_test.iloc[:, 1]

In [None]:
# Sanity check: inputs and targets of each split must have matching lengths.
for _part in (df_train_X, df_train_y, df_test_X, df_test_y):
    print(_part.shape)

In [None]:
# Class balance of the target in the train and test splits.
for _target in (df_train_y, df_test_y):
    print(_target.value_counts())

In [None]:
def visualize_coefficients(coefficients, feature_names, n_top_features=25):
    """Visualize coefficients of a linear model as a bar chart.

    Opens a new matplotlib figure and draws one bar per selected coefficient,
    sorted from most negative to most positive. Negative bars are red,
    non-negative bars are blue.

    Parameters
    ----------
    coefficients : nd-array, shape (n_features,)
        Model coefficients. Row or column vectors such as (1, n_features)
        are squeezed to 1d.
    feature_names : list or nd-array of strings, shape (n_features,)
        Feature names for labeling the coefficients.
    n_top_features : int, default=25
        How many features to show. The function will show the largest (most
        positive) and smallest (most negative) n_top_features coefficients,
        for a total of 2 * n_top_features coefficients. When fewer than
        2 * n_top_features coefficients exist, every coefficient is shown
        exactly once.

    Raises
    ------
    ValueError
        If ``coefficients`` is not effectively one-dimensional, if its length
        differs from ``feature_names``, or if ``n_top_features`` is < 1.
    """
    coef = np.asarray(coefficients).squeeze()
    if coef.ndim > 1:
        # this is not a row or column vector
        raise ValueError("coefficients must be 1d array or column vector, got"
                         " shape {}".format(coef.shape))
    coef = coef.ravel()

    if len(coef) != len(feature_names):
        raise ValueError("Number of coefficients {} doesn't match number of"
                         " feature names {}.".format(len(coef),
                                                     len(feature_names)))
    if n_top_features < 1:
        # order[-0:] would select *all* coefficients, so reject this early.
        raise ValueError("n_top_features must be a positive integer, got"
                         " {}".format(n_top_features))

    # Sort once; the two ends of the ordering are the extreme coefficients.
    order = np.argsort(coef)
    if 2 * n_top_features >= len(coef):
        # Not enough coefficients for two disjoint groups: show each once
        # instead of duplicating features or mismatching the bar count.
        interesting_coefficients = order
    else:
        interesting_coefficients = np.hstack([order[:n_top_features],
                                              order[-n_top_features:]])
    n_shown = len(interesting_coefficients)

    # plot them
    plt.figure(figsize=(15, 5))
    colors = ['#ff2020' if c < 0 else '#0000aa'
              for c in coef[interesting_coefficients]]
    plt.bar(np.arange(n_shown), coef[interesting_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.subplots_adjust(bottom=0.3)
    # NOTE(review): ticks sit at 1..n_shown while bars sit at 0..n_shown-1;
    # presumably this offset compensates for the right-aligned, rotated
    # labels -- confirm visually before changing it.
    plt.xticks(np.arange(1, 1 + n_shown),
               feature_names[interesting_coefficients], rotation=60,
               ha="right")
    plt.ylabel("Coefficient magnitude")
    plt.xlabel("Feature")

In [None]:
# Shared 5-fold stratified splitter for every grid search below. The seed is
# fixed so that all models are compared on the same, reproducible folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### MODEL 1: countvectorize with logistic regression (default L2 penalty)

In [None]:
### model 1: bag-of-words counts (CountVectorizer) with logistic regression.
# NOTE: LogisticRegression() applies an L2 penalty by default, so this model
# is regularized; the strength is tuned through C in the grid below.
bow_pipeline1 = Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                                ('LR', LogisticRegression())])

# Search over the minimum document frequency of a token and the inverse
# regularization strength of the classifier.
param1 = {'vect__min_df': [1, 2, 3, 4, 5],
          'LR__C': [0.01, 1, 10, 100]}

# ROC AUC is used for model selection because the classes are imbalanced.
grid1 = GridSearchCV(bow_pipeline1, param_grid=param1, cv=cv, scoring='roc_auc')
grid1.fit(df_train_X, df_train_y)

print('Model 1 GridSearchCV Best Parameters: ', grid1.best_params_)
# '{:.4f}' prints four decimals; the former '{:4f}' only set a field width.
print('Model 1 GridSearchCV Best Score:{:.4f}'.format(grid1.best_score_))

In [None]:
# ROC AUC of the refitted best pipeline on the held-out test set, printed
# with four decimals ('{:4f}' only set a minimum field width).
print('Model 1 GridSearchCV Test Score:{:.4f}'.format(grid1.score(df_test_X, df_test_y)))

In [None]:
# Hard class predictions of the best model-1 pipeline on the test reviews.
y_pred1 = grid1.predict(df_test_X)
print('Model 1 Metric Using Logistic Regression and Countvectorizer:')
# Per-class precision / recall / F1 against the true test labels.
print(classification_report(df_test_y, y_pred1))

In [None]:
print('Model 1 Visualization:')
# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out() and no longer provide get_feature_names();
# use whichever method the installed version offers.
vect1 = grid1.best_estimator_.named_steps['vect']
if hasattr(vect1, 'get_feature_names_out'):
    feature_names1 = vect1.get_feature_names_out()
else:
    feature_names1 = vect1.get_feature_names()
# Plot the 20 most negative and 20 most positive token coefficients.
visualize_coefficients(grid1.best_estimator_.named_steps['LR'].coef_,
                       feature_names1, n_top_features=20)

### MODEL 2: tfidf with logistic regression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Model 2: TF-IDF weighted bag-of-words with logistic regression.
bow_pipeline2 = Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                                ('LR', LogisticRegression())])

# Search over the minimum document frequency, the row normalization of the
# TF-IDF vectors and the inverse regularization strength of the classifier.
param2 = {'tfidf__min_df': [1, 2],
          'tfidf__norm': ['l1', 'l2', None],
          #'tfidf__ngram_range':[(1, 1), (1, 2), (1, 3)],
          'LR__C': [0.01, 0.1, 1, 10, 100]}

grid2 = GridSearchCV(bow_pipeline2, param_grid=param2, cv=cv, scoring="roc_auc")
grid2.fit(df_train_X, df_train_y)

print('Model 2 GridSearchCV Best Parameters: ', grid2.best_params_)
# '{:.4f}' prints four decimals; the former '{:4f}' only set a field width.
print('Model 2 GridSearchCV Best Score:{:.4f}'.format(grid2.best_score_))

In [None]:
# ROC AUC of the refitted best pipeline on the held-out test set, printed
# with four decimals ('{:4f}' only set a minimum field width).
print('Model 2 GridSearchCV Test Score:{:.4f}'.format(grid2.score(df_test_X, df_test_y)))

In [None]:
#bow_pipeline.fit(df_train_X, df_train_y)
#tf_cv = cross_val_score(bow_pipeline, df_train_X, df_train_y, cv=10, scoring='f1_macro')
#print('Mean Cross-Validation Accuracy on Train Data:{: 4f}'. format(np.mean(tf_cv)))
#print('Mean Cross-Validation Accuracy on Test Data:{: 4f}'. format(np.mean(cross_val_score(bow_pipeline, df_test_X, df_test_y, cv=10, scoring='f1_macro'))))

In [None]:
# Hard class predictions of the best model-2 pipeline on the test reviews.
y_pred2 = grid2.predict(df_test_X)
print('Model 2 Metric Using Logistic Regression and TFIDF:')
# Per-class precision / recall / F1 against the true test labels.
print(classification_report(df_test_y, y_pred2))

In [None]:
print('Model 2 Visualization:')
# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out() and no longer provide get_feature_names();
# use whichever method the installed version offers.
tfidf2 = grid2.best_estimator_.named_steps['tfidf']
if hasattr(tfidf2, 'get_feature_names_out'):
    feature_names2 = tfidf2.get_feature_names_out()
else:
    feature_names2 = tfidf2.get_feature_names()
# Plot the 20 most negative and 20 most positive token coefficients.
visualize_coefficients(grid2.best_estimator_.named_steps['LR'].coef_,
                       feature_names2, n_top_features=20)

### MODEL 3: tfidf with Penalized Logistic Regression

In [None]:
# Model 3: TF-IDF (optionally with n-grams) and logistic regression where the
# penalty type itself is part of the search.
# The solver is set explicitly: the grid contains penalty='l1', which the
# 'lbfgs' solver (the default in recent scikit-learn releases) rejects, while
# 'liblinear' supports both 'l1' and 'l2'.
bow_pipeline3 = Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                                ('LR', LogisticRegression(solver='liblinear'))])

param3 = {'tfidf__min_df': [1, 2],
          'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
          'tfidf__norm': ['l1', 'l2', None],
          'LR__C': [0.01, 0.1, 1, 10, 100],
          'LR__penalty': ['l1', 'l2']}

grid3 = GridSearchCV(bow_pipeline3, param_grid=param3, cv=cv, scoring='roc_auc')
grid3.fit(df_train_X, df_train_y)

print('Model 3 GridSearchCV Best Parameters: ', grid3.best_params_)
# '{:.4f}' prints four decimals; the former '{:4f}' only set a field width.
print('Model 3 GridSearchCV Best Score:{:.4f}'.format(grid3.best_score_))

In [None]:
# ROC AUC of the refitted best pipeline on the held-out test set, printed
# with four decimals ('{:4f}' only set a minimum field width).
print('Model 3 GridSearchCV Test Score:{:.4f}'.format(grid3.score(df_test_X, df_test_y)))

In [None]:
# Hard class predictions of the best model-3 pipeline on the test reviews.
y_pred3 = grid3.predict(df_test_X)
print('Model 3 Metric Using Logistic Regression and TFIDF:')
# Per-class precision / recall / F1 against the true test labels.
print(classification_report(df_test_y, y_pred3))

In [None]:
print('Model 3 Visualization:')
# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out() and no longer provide get_feature_names();
# use whichever method the installed version offers.
tfidf3 = grid3.best_estimator_.named_steps['tfidf']
if hasattr(tfidf3, 'get_feature_names_out'):
    feature_names3 = tfidf3.get_feature_names_out()
else:
    feature_names3 = tfidf3.get_feature_names()
# Plot the 20 most negative and 20 most positive token coefficients.
visualize_coefficients(grid3.best_estimator_.named_steps['LR'].coef_,
                       feature_names3, n_top_features=20)

In [None]:
### MODEL 4: tfidf with Decision Tree

In [None]:
#from sklearn.tree import DecisionTreeClassifier

#bow_pipeline4 = Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
#                               ('DT', DecisionTreeClassifier())])

#param4 = {'tfidf__min_df':[1, 2],
#         'tfidf__ngram_range': [(1, 1), (1, 2), (2, 3)],
#         'DT__criterion': ['gini', 'entropy'],
#         'DT__max_depth': [4, 6, 8]
#}

#grid4 = GridSearchCV(bow_pipeline4, param_grid=param4, cv=5, scoring='roc_auc')

#grid4.fit(df_train_X, df_train_y)

#print('Model 4 GridSearchCV Best Parameters: ', grid4.best_params_)
#print('Model 4 GridSearchCV Best Score:{:4f}'.format(grid4.best_score_))

In [None]:
#print('Model 4 GridSearchCV Test Score:{:4f}'.format(grid4.score(df_test_X, df_test_y)))

In [None]:
#y_pred4 = grid4.predict(df_test_X)

#print('Model 4 Metric Using Decision Tree and TFIDF:')
#print(classification_report(df_test_y, y_pred24))

### Q1 Conclusion 

In [None]:
# Test-set classification reports of models 1-3 as nested dicts, so that
# individual metrics can be looked up by key in the summary below.
a = classification_report(df_test_y, y_pred1, output_dict=True)
b = classification_report(df_test_y, y_pred2, output_dict=True)
c = classification_report(df_test_y, y_pred3, output_dict=True)

In [None]:
# Gather the per-model numbers, always in the order model 1, 2, 3.
_grids = (grid1, grid2, grid3)
_reports = (a, b, c)

# best_score_ is the mean cross-validated ROC AUC of the selected candidate.
train_auc = [search.best_score_ for search in _grids]
test_auc = [search.score(df_test_X, df_test_y) for search in _grids]
test_f1_macro = [report['macro avg']['f1-score'] for report in _reports]
test_f1_weighted = [report['weighted avg']['f1-score'] for report in _reports]

In [None]:
# Summary table: one row per model, one column per metric.
_metric_names = ['Train_AUC', 'Test_AUC', 'Test F1_Macro', 'Test F1_Weighted']
_metric_values = [train_auc, test_auc, test_f1_macro, test_f1_weighted]
result = dict(zip(_metric_names, _metric_values))

q1_result = pd.DataFrame(result,
                         index=['model {}'.format(k) for k in (1, 2, 3)])

In [None]:
# Show the comparison table rounded to four decimals.
print(q1_result.round(4))

Conclusion: choose model 3, penalized logistic regression (L2 penalty and C=10) with tfidf (min_df =1 and ngram (1,2)).

## Part 2: Build a predictive neural network using Keras
To complete part two of the homework do the following:
Run a multilayer perceptron (feed forward neural network) with two hidden layers on
the iris dataset using the keras Sequential interface.

In [None]:
#! pip install keras

In [None]:
#! pip install tensorflow

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
##import data
df2 = pd.read_csv('http://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv', encoding = 'latin_1')
df2 = df2.iloc[:, 1:]

In [None]:
# Display the first rows to check the remaining columns.
df2.head()

In [None]:
# The first four columns are the inputs, the fifth column is the target.
X, y = df2.iloc[:, :4], df2.iloc[:, 4]

In [None]:
# Only the last expression of a notebook cell is displayed automatically, so
# the first two checks are printed explicitly instead of being discarded.
print(X.shape)
print(set(y))
y.value_counts()

In [None]:
# Map the class labels to integer codes, then one-hot encode the codes so
# they match the 3-unit softmax output trained with categorical cross-entropy.
encode = LabelEncoder()
y_new = keras.utils.to_categorical(encode.fit_transform(y))

In [None]:
# Hold out a test set with scikit-learn's default split proportions; the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_new, random_state=42)

In [None]:
# Fit the scaler on the training inputs only and wrap the standardized
# values back into a DataFrame that keeps the original labels.
sc = StandardScaler()
X_train_scale = pd.DataFrame(sc.fit_transform(X_train),
                             columns=X_train.columns,
                             index=X_train.index)

In [None]:
# Apply the scaler fitted on the training inputs to the test inputs and
# keep the original row/column labels.
X_test_scale = pd.DataFrame(sc.transform(X_test),
                            index=X_test.index,
                            columns=X_test.columns)

def create_dummies(df, column_name):
    """Return a copy of ``df`` extended with dummy columns for one column.

    One indicator column per distinct value of ``df[column_name]`` is
    appended on the right, each named ``<column_name>_<value>``. The source
    column is kept and the input frame is not modified.
    """
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, indicator_columns], axis=1)
# Iris frame with one indicator column per value of "Species" appended.
# NOTE(review): `data` is not referenced again in the visible part of this
# notebook -- confirm whether it is still needed.
data = create_dummies(df2,"Species")

In [None]:
def create_model(learn_rate=0.01, act='sigmoid', n=10):
    """Build and compile a feed-forward network with two hidden layers.

    The network takes 4 input features, has two hidden Dense layers of ``n``
    units each using activation ``act``, and a 3-unit softmax output. It is
    compiled with categorical cross-entropy, an SGD optimizer running at
    ``learn_rate`` and accuracy as the reported metric.

    Returns the compiled Sequential model.
    """
    layers = [
        Dense(n, input_dim=4, activation=act),   # hidden layer 1
        Dense(n, activation=act),                # hidden layer 2
        Dense(3, activation='softmax'),          # class probabilities
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy',
                    optimizer=SGD(lr=learn_rate),
                    metrics=['accuracy'])
    return network

In [None]:
# Build the network with its default hyper-parameters and print the layer
# overview as a quick check of the architecture.
base = create_model()
base.summary()

In [None]:
# Wrap the Keras builder so scikit-learn's GridSearchCV can drive it.
model = KerasClassifier(build_fn=create_model, verbose=0)

# Candidate values: the first three are forwarded to create_model, while
# epochs / batch_size control the fitting itself.
learn_rate = [0.001, 0.01]
act = ['sigmoid', 'relu', 'softmax']
n = [10, 50, 100]
epochs = [10, 20, 30]
batch = [5, 10, 50]

param = {
    'learn_rate': learn_rate,
    'act': act,
    'n': n,
    'epochs': epochs,
    'batch_size': batch,
}

In [None]:
import warnings
# Silence DeprecationWarning output for the remainder of the notebook.
warnings.simplefilter(action='ignore', category=DeprecationWarning)

# Exhaustive search over every combination in `param`; n_jobs=-1 requests all
# available cores. No `cv` or `scoring` is given, so scikit-learn's defaults
# are used.
grid = GridSearchCV(estimator=model, param_grid=param, n_jobs=-1)

# NOTE(review): the search is fitted on the unscaled X_train although a
# standardized copy (X_train_scale) is computed earlier in the notebook --
# confirm which of the two is intended.
nn_model = grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the parameter combination reaching it.
print("Best: %f using %s" % (nn_model.best_score_, nn_model.best_params_))
# Disabled: per-candidate mean/std listing of the cross-validation results.
#means = nn_model.cv_results_['mean_test_score']
#stds = nn_model.cv_results_['std_test_score']
#params = nn_model.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Selected hyper-parameters and their mean cross-validated score.
print('Neural Network Best Paramter: %s' % nn_model.best_params_)
print('Neural Network Best Score: %f' % nn_model.best_score_)

In [None]:
# Score of the refitted best estimator on the held-out test split.
print('Neural Network Test Set: %f' % nn_model.score(X_test, y_test))

In [None]:
# Update Model with New Parameters
best_model = Sequential() # Best Parameter
best_model.add(Dense(10, input_dim=4, activation='relu'))
best_model.add(Dense(10, activation='relu'))
best_model.add(Dense(3, activation='softmax'))

# Compile Model
optimizer = SGD(lr=0.01) # Best Parameter
best_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Fit the Model
best_model.fit(X_train, y_train,  epochs=20, batch_size=5) # Best Parameter

In [None]:
# Output of the 3-unit softmax layer for every test sample (one row each).
y_pred = best_model.predict(X_test)

In [None]:
# Collapse each row to the index of its largest entry: the predicted class
# for the softmax outputs, the true class for the one-hot encoded targets.
# A single vectorized argmax replaces the index-based append loops.
pred = np.argmax(np.asarray(y_pred), axis=1).tolist()
test = np.argmax(np.asarray(y_test), axis=1).tolist()

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred); pass the true labels first to follow
# the documented argument order.
test_score = accuracy_score(test, pred)
print('Neural Network Accuracy is:', round(test_score, 4))